blob: 86fd153bcd2b4c558d467dc1442c84907c349353 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson29060642009-01-31 22:14:21 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Guido van Rossumdaa251c2007-10-25 23:47:33 +000044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Guido van Rossumd57fd912000-03-10 22:53:23 +000046#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000047#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000048
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000049#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000050#include <windows.h>
51#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000052
Guido van Rossumd57fd912000-03-10 22:53:23 +000053/* Limit for the Unicode object free list */
54
Christian Heimes2202f872008-02-06 14:31:34 +000055#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000056
57/* Limit for the Unicode object free list stay alive optimization.
58
59 The implementation will keep allocated Unicode memory intact for
60 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000061 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000062
Christian Heimes2202f872008-02-06 14:31:34 +000063 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000064 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000065 malloc()-overhead) bytes of unused garbage.
66
67 Setting the limit to 0 effectively turns the feature off.
68
Guido van Rossumfd4b9572000-04-10 13:51:10 +000069 Note: This is an experimental feature ! If you get core dumps when
70 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000071
72*/
73
Guido van Rossumfd4b9572000-04-10 13:51:10 +000074#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000075
76/* Endianness switches; defaults to little endian */
77
78#ifdef WORDS_BIGENDIAN
79# define BYTEORDER_IS_BIG_ENDIAN
80#else
81# define BYTEORDER_IS_LITTLE_ENDIAN
82#endif
83
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000084/* --- Globals ------------------------------------------------------------
85
86 The globals are initialized by the _PyUnicode_Init() API and should
87 not be used before calling that API.
88
89*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
Walter Dörwald16807132007-05-25 13:52:07 +000096/* This dictionary holds all interned unicode strings. Note that references
97 to strings in this dictionary are *not* counted in the string's ob_refcnt.
98 When the interned string reaches a refcnt of 0 the string deallocation
99 function will delete the reference from this dictionary.
100
   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000103*/
104static PyObject *interned;
105
Guido van Rossumd57fd912000-03-10 22:53:23 +0000106/* Free list for Unicode objects */
Christian Heimes2202f872008-02-06 14:31:34 +0000107static PyUnicodeObject *free_list;
108static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000109
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000110/* The empty Unicode object is shared to improve performance. */
111static PyUnicodeObject *unicode_empty;
112
113/* Single character Unicode strings in the Latin-1 range are being
114 shared as well. */
115static PyUnicodeObject *unicode_latin1[256];
116
Fred Drakee4315f52000-05-09 19:53:39 +0000117/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000118 parameter; it is fixed to "utf-8". Always use the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000119 PyUnicode_GetDefaultEncoding() API to access this global.
120
Alexandre Vassalotti3d2fd7f2007-10-16 00:26:33 +0000121 Don't forget to alter Py_FileSystemDefaultEncoding if you change the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000122 hard coded default!
123*/
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000124static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000125
Christian Heimes190d79e2008-01-30 11:58:22 +0000126/* Fast detection of the most frequent whitespace characters */
const unsigned char _Py_ascii_whitespace[] = {
    /* One entry per ASCII code point (16 per row): 1 marks a
       whitespace character, 0 anything else. */

    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
       0x0B LINE TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
156
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000157static PyObject *unicode_encode_call_errorhandler(const char *errors,
158 PyObject **errorHandler,const char *encoding, const char *reason,
159 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
160 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
161
Victor Stinner31be90b2010-04-22 19:38:16 +0000162static void raise_encode_exception(PyObject **exceptionObject,
163 const char *encoding,
164 const Py_UNICODE *unicode, Py_ssize_t size,
165 Py_ssize_t startpos, Py_ssize_t endpos,
166 const char *reason);
167
Christian Heimes190d79e2008-01-30 11:58:22 +0000168/* Same for linebreaks */
/* Lookup table for the ASCII range: entry c is 1 when code point c is a
   line break, 0 otherwise.  The table is only ever read (through
   BLOOM_LINEBREAK), so it is declared const like _Py_ascii_whitespace. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
195
196
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000197Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000198PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000199{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000200#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000201 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000202#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000203 /* This is actually an illegal character, so it should
204 not be passed to unichr. */
205 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000206#endif
207}
208
Thomas Wouters477c8d52006-05-27 19:21:47 +0000209/* --- Bloom Filters ----------------------------------------------------- */
210
/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the low-order
   bits of each Unicode character (as many as are needed to index
   BLOOM_WIDTH bits) as the bit index. */

/* The linebreak mask is set up by _PyUnicode_Init() below. */
216
Antoine Pitrouf068f942010-01-13 14:19:12 +0000217#if LONG_BIT >= 128
218#define BLOOM_WIDTH 128
219#elif LONG_BIT >= 64
220#define BLOOM_WIDTH 64
221#elif LONG_BIT >= 32
222#define BLOOM_WIDTH 32
223#else
224#error "LONG_BIT is smaller than 32"
225#endif
226
Thomas Wouters477c8d52006-05-27 19:21:47 +0000227#define BLOOM_MASK unsigned long
228
229static BLOOM_MASK bloom_linebreak;
230
Antoine Pitrouf068f942010-01-13 14:19:12 +0000231#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
232#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000233
Benjamin Peterson29060642009-01-31 22:14:21 +0000234#define BLOOM_LINEBREAK(ch) \
235 ((ch) < 128U ? ascii_linebreak[(ch)] : \
236 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000237
238Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
239{
240 /* calculate simple bloom-style bitmask for a given unicode string */
241
Antoine Pitrouf068f942010-01-13 14:19:12 +0000242 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000243 Py_ssize_t i;
244
245 mask = 0;
246 for (i = 0; i < len; i++)
Antoine Pitrouf2c54842010-01-13 08:07:53 +0000247 BLOOM_ADD(mask, ptr[i]);
Thomas Wouters477c8d52006-05-27 19:21:47 +0000248
249 return mask;
250}
251
252Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
253{
254 Py_ssize_t i;
255
256 for (i = 0; i < setlen; i++)
257 if (set[i] == chr)
258 return 1;
259
260 return 0;
261}
262
/* Exact membership test with a Bloom pre-filter: unicode_member() is
   only called when the mask says chr might be present.  The expansion
   is wrapped in parentheses so the macro acts as a single operand
   inside larger expressions (for instance under `!` or next to `||`). */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
265
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266/* --- Unicode Object ----------------------------------------------------- */
267
268static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000269int unicode_resize(register PyUnicodeObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +0000270 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000271{
272 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000273
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000274 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000275 if (unicode->length == length)
Benjamin Peterson29060642009-01-31 22:14:21 +0000276 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000278 /* Resizing shared object (unicode_empty or single character
279 objects) in-place is not allowed. Use PyUnicode_Resize()
280 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000281
Benjamin Peterson14339b62009-01-31 16:36:08 +0000282 if (unicode == unicode_empty ||
Benjamin Peterson29060642009-01-31 22:14:21 +0000283 (unicode->length == 1 &&
284 unicode->str[0] < 256U &&
285 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000286 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000287 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000288 return -1;
289 }
290
Thomas Wouters477c8d52006-05-27 19:21:47 +0000291 /* We allocate one more byte to make sure the string is Ux0000 terminated.
292 The overallocation is also used by fastsearch, which assumes that it's
293 safe to look at str[length] (without making any assumptions about what
294 it contains). */
295
Guido van Rossumd57fd912000-03-10 22:53:23 +0000296 oldstr = unicode->str;
Christian Heimesb186d002008-03-18 15:15:01 +0000297 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Peterson29060642009-01-31 22:14:21 +0000298 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000299 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000300 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000301 PyErr_NoMemory();
302 return -1;
303 }
304 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000305 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000306
Benjamin Peterson29060642009-01-31 22:14:21 +0000307 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000308 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000309 if (unicode->defenc) {
310 Py_DECREF(unicode->defenc);
311 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000312 }
313 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000314
Guido van Rossumd57fd912000-03-10 22:53:23 +0000315 return 0;
316}
317
/* We allocate one more Py_UNICODE element to make sure the string is
   Ux0000 terminated; some code (e.g. new_identifier)
   relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000321
322 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000323 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000324
325*/
326
327static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000328PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000329{
330 register PyUnicodeObject *unicode;
331
Thomas Wouters477c8d52006-05-27 19:21:47 +0000332 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000333 if (length == 0 && unicode_empty != NULL) {
334 Py_INCREF(unicode_empty);
335 return unicode_empty;
336 }
337
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000338 /* Ensure we won't overflow the size. */
339 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
340 return (PyUnicodeObject *)PyErr_NoMemory();
341 }
342
Guido van Rossumd57fd912000-03-10 22:53:23 +0000343 /* Unicode freelist & memory allocation */
Christian Heimes2202f872008-02-06 14:31:34 +0000344 if (free_list) {
345 unicode = free_list;
346 free_list = *(PyUnicodeObject **)unicode;
347 numfree--;
Benjamin Peterson29060642009-01-31 22:14:21 +0000348 if (unicode->str) {
349 /* Keep-Alive optimization: we only upsize the buffer,
350 never downsize it. */
351 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000352 unicode_resize(unicode, length) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000353 PyObject_DEL(unicode->str);
354 unicode->str = NULL;
355 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000356 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000357 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000358 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
359 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000360 }
361 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000362 }
363 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000364 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000365 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000366 if (unicode == NULL)
367 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +0000368 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
369 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000370 }
371
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000372 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000373 PyErr_NoMemory();
374 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000375 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000376 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000377 * the caller fails before initializing str -- unicode_resize()
378 * reads str[0], and the Keep-Alive optimization can keep memory
379 * allocated for str alive across a call to unicode_dealloc(unicode).
380 * We don't want unicode_resize to read uninitialized memory in
381 * that case.
382 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000383 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000384 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000385 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000386 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000387 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000388 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000389 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000390
Benjamin Peterson29060642009-01-31 22:14:21 +0000391 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000392 /* XXX UNREF/NEWREF interface should be more symmetrical */
393 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000394 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000395 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000396 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000397}
398
399static
Guido van Rossum9475a232001-10-05 20:51:39 +0000400void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000401{
Walter Dörwald16807132007-05-25 13:52:07 +0000402 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000403 case SSTATE_NOT_INTERNED:
404 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000405
Benjamin Peterson29060642009-01-31 22:14:21 +0000406 case SSTATE_INTERNED_MORTAL:
407 /* revive dead object temporarily for DelItem */
408 Py_REFCNT(unicode) = 3;
409 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
410 Py_FatalError(
411 "deletion of interned string failed");
412 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000413
Benjamin Peterson29060642009-01-31 22:14:21 +0000414 case SSTATE_INTERNED_IMMORTAL:
415 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000416
Benjamin Peterson29060642009-01-31 22:14:21 +0000417 default:
418 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000419 }
420
Guido van Rossum604ddf82001-12-06 20:03:56 +0000421 if (PyUnicode_CheckExact(unicode) &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000422 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000423 /* Keep-Alive optimization */
Benjamin Peterson29060642009-01-31 22:14:21 +0000424 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
425 PyObject_DEL(unicode->str);
426 unicode->str = NULL;
427 unicode->length = 0;
428 }
429 if (unicode->defenc) {
430 Py_DECREF(unicode->defenc);
431 unicode->defenc = NULL;
432 }
433 /* Add to free list */
Christian Heimes2202f872008-02-06 14:31:34 +0000434 *(PyUnicodeObject **)unicode = free_list;
435 free_list = unicode;
436 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000437 }
438 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000439 PyObject_DEL(unicode->str);
440 Py_XDECREF(unicode->defenc);
441 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000442 }
443}
444
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000445static
446int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000447{
448 register PyUnicodeObject *v;
449
450 /* Argument checks */
451 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000452 PyErr_BadInternalCall();
453 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000454 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000455 v = *unicode;
Christian Heimes90aa7642007-12-19 02:45:37 +0000456 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000457 PyErr_BadInternalCall();
458 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000459 }
460
461 /* Resizing unicode_empty and single character objects is not
462 possible since these are being shared. We simply return a fresh
463 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000464 if (v->length != length &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000465 (v == unicode_empty || v->length == 1)) {
466 PyUnicodeObject *w = _PyUnicode_New(length);
467 if (w == NULL)
468 return -1;
469 Py_UNICODE_COPY(w->str, v->str,
470 length < v->length ? length : v->length);
471 Py_DECREF(*unicode);
472 *unicode = w;
473 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000474 }
475
476 /* Note that we don't have to modify *unicode for unshared Unicode
477 objects, since we can modify them in-place. */
478 return unicode_resize(v, length);
479}
480
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000481int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
482{
483 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
484}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000485
Guido van Rossumd57fd912000-03-10 22:53:23 +0000486PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Benjamin Peterson29060642009-01-31 22:14:21 +0000487 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000488{
489 PyUnicodeObject *unicode;
490
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000491 /* If the Unicode data is known at construction time, we can apply
492 some optimizations which share commonly used objects. */
493 if (u != NULL) {
494
Benjamin Peterson29060642009-01-31 22:14:21 +0000495 /* Optimization for empty strings */
496 if (size == 0 && unicode_empty != NULL) {
497 Py_INCREF(unicode_empty);
498 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000499 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000500
501 /* Single character Unicode objects in the Latin-1 range are
502 shared when using this constructor */
503 if (size == 1 && *u < 256) {
504 unicode = unicode_latin1[*u];
505 if (!unicode) {
506 unicode = _PyUnicode_New(1);
507 if (!unicode)
508 return NULL;
509 unicode->str[0] = *u;
510 unicode_latin1[*u] = unicode;
511 }
512 Py_INCREF(unicode);
513 return (PyObject *)unicode;
514 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000515 }
Tim Petersced69f82003-09-16 20:30:58 +0000516
Guido van Rossumd57fd912000-03-10 22:53:23 +0000517 unicode = _PyUnicode_New(size);
518 if (!unicode)
519 return NULL;
520
521 /* Copy the Unicode data into the new object */
522 if (u != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +0000523 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000524
525 return (PyObject *)unicode;
526}
527
Walter Dörwaldd2034312007-05-18 16:29:38 +0000528PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000529{
530 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +0000531
Benjamin Peterson14339b62009-01-31 16:36:08 +0000532 if (size < 0) {
533 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +0000534 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +0000535 return NULL;
536 }
Christian Heimes33fe8092008-04-13 13:53:33 +0000537
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000538 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000539 some optimizations which share commonly used objects.
540 Also, this means the input must be UTF-8, so fall back to the
541 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000542 if (u != NULL) {
543
Benjamin Peterson29060642009-01-31 22:14:21 +0000544 /* Optimization for empty strings */
545 if (size == 0 && unicode_empty != NULL) {
546 Py_INCREF(unicode_empty);
547 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000548 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000549
550 /* Single characters are shared when using this constructor.
551 Restrict to ASCII, since the input must be UTF-8. */
552 if (size == 1 && Py_CHARMASK(*u) < 128) {
553 unicode = unicode_latin1[Py_CHARMASK(*u)];
554 if (!unicode) {
555 unicode = _PyUnicode_New(1);
556 if (!unicode)
557 return NULL;
558 unicode->str[0] = Py_CHARMASK(*u);
559 unicode_latin1[Py_CHARMASK(*u)] = unicode;
560 }
561 Py_INCREF(unicode);
562 return (PyObject *)unicode;
563 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000564
565 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000566 }
567
Walter Dörwald55507312007-05-18 13:12:10 +0000568 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000569 if (!unicode)
570 return NULL;
571
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000572 return (PyObject *)unicode;
573}
574
Walter Dörwaldd2034312007-05-18 16:29:38 +0000575PyObject *PyUnicode_FromString(const char *u)
576{
577 size_t size = strlen(u);
578 if (size > PY_SSIZE_T_MAX) {
579 PyErr_SetString(PyExc_OverflowError, "input too long");
580 return NULL;
581 }
582
583 return PyUnicode_FromStringAndSize(u, size);
584}
585
Guido van Rossumd57fd912000-03-10 22:53:23 +0000586#ifdef HAVE_WCHAR_H
587
Mark Dickinson081dfee2009-03-18 14:47:41 +0000588#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
589# define CONVERT_WCHAR_TO_SURROGATES
590#endif
591
592#ifdef CONVERT_WCHAR_TO_SURROGATES
593
594/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
595 to convert from UTF32 to UTF16. */
596
597PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
598 Py_ssize_t size)
599{
600 PyUnicodeObject *unicode;
601 register Py_ssize_t i;
602 Py_ssize_t alloc;
603 const wchar_t *orig_w;
604
605 if (w == NULL) {
606 if (size == 0)
607 return PyUnicode_FromStringAndSize(NULL, 0);
608 PyErr_BadInternalCall();
609 return NULL;
610 }
611
612 if (size == -1) {
613 size = wcslen(w);
614 }
615
616 alloc = size;
617 orig_w = w;
618 for (i = size; i > 0; i--) {
619 if (*w > 0xFFFF)
620 alloc++;
621 w++;
622 }
623 w = orig_w;
624 unicode = _PyUnicode_New(alloc);
625 if (!unicode)
626 return NULL;
627
628 /* Copy the wchar_t data into the new object */
629 {
630 register Py_UNICODE *u;
631 u = PyUnicode_AS_UNICODE(unicode);
632 for (i = size; i > 0; i--) {
633 if (*w > 0xFFFF) {
634 wchar_t ordinal = *w++;
635 ordinal -= 0x10000;
636 *u++ = 0xD800 | (ordinal >> 10);
637 *u++ = 0xDC00 | (ordinal & 0x3FF);
638 }
639 else
640 *u++ = *w++;
641 }
642 }
643 return (PyObject *)unicode;
644}
645
646#else
647
Guido van Rossumd57fd912000-03-10 22:53:23 +0000648PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Benjamin Peterson29060642009-01-31 22:14:21 +0000649 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000650{
651 PyUnicodeObject *unicode;
652
653 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +0000654 if (size == 0)
655 return PyUnicode_FromStringAndSize(NULL, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +0000656 PyErr_BadInternalCall();
657 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000658 }
659
Martin v. Löwis790465f2008-04-05 20:41:37 +0000660 if (size == -1) {
661 size = wcslen(w);
662 }
663
Guido van Rossumd57fd912000-03-10 22:53:23 +0000664 unicode = _PyUnicode_New(size);
665 if (!unicode)
666 return NULL;
667
668 /* Copy the wchar_t data into the new object */
669#ifdef HAVE_USABLE_WCHAR_T
670 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000671#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000672 {
Benjamin Peterson29060642009-01-31 22:14:21 +0000673 register Py_UNICODE *u;
674 register Py_ssize_t i;
675 u = PyUnicode_AS_UNICODE(unicode);
676 for (i = size; i > 0; i--)
677 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000678 }
679#endif
680
681 return (PyObject *)unicode;
682}
683
Mark Dickinson081dfee2009-03-18 14:47:41 +0000684#endif /* CONVERT_WCHAR_TO_SURROGATES */
685
686#undef CONVERT_WCHAR_TO_SURROGATES
687
Walter Dörwald346737f2007-05-31 10:44:43 +0000688static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000689makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
690 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +0000691{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000692 *fmt++ = '%';
693 if (width) {
694 if (zeropad)
695 *fmt++ = '0';
696 fmt += sprintf(fmt, "%d", width);
697 }
698 if (precision)
699 fmt += sprintf(fmt, ".%d", precision);
700 if (longflag)
701 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000702 else if (longlongflag) {
703 /* longlongflag should only ever be nonzero on machines with
704 HAVE_LONG_LONG defined */
705#ifdef HAVE_LONG_LONG
706 char *f = PY_FORMAT_LONG_LONG;
707 while (*f)
708 *fmt++ = *f++;
709#else
710 /* we shouldn't ever get here */
711 assert(0);
712 *fmt++ = 'l';
713#endif
714 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000715 else if (size_tflag) {
716 char *f = PY_FORMAT_SIZE_T;
717 while (*f)
718 *fmt++ = *f++;
719 }
720 *fmt++ = c;
721 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +0000722}
723
Walter Dörwaldd2034312007-05-18 16:29:38 +0000724#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
725
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000726/* size of fixed-size buffer for formatting single arguments */
727#define ITEM_BUFFER_LEN 21
728/* maximum number of characters required for output of %ld. 21 characters
729 allows for 64-bit integers (in decimal) and an optional sign. */
730#define MAX_LONG_CHARS 21
731/* maximum number of characters required for output of %lld.
732 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
733 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
734#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
735
Walter Dörwaldd2034312007-05-18 16:29:38 +0000736PyObject *
737PyUnicode_FromFormatV(const char *format, va_list vargs)
738{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000739 va_list count;
740 Py_ssize_t callcount = 0;
741 PyObject **callresults = NULL;
742 PyObject **callresult = NULL;
743 Py_ssize_t n = 0;
744 int width = 0;
745 int precision = 0;
746 int zeropad;
747 const char* f;
748 Py_UNICODE *s;
749 PyObject *string;
750 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000751 char buffer[ITEM_BUFFER_LEN+1];
Benjamin Peterson14339b62009-01-31 16:36:08 +0000752 /* use abuffer instead of buffer, if we need more space
753 * (which can happen if there's a format specifier with width). */
754 char *abuffer = NULL;
755 char *realbuffer;
756 Py_ssize_t abuffersize = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000757 char fmt[61]; /* should be enough for %0width.precisionlld */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000758 const char *copy;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000759
760#ifdef VA_LIST_IS_ARRAY
Benjamin Peterson14339b62009-01-31 16:36:08 +0000761 Py_MEMCPY(count, vargs, sizeof(va_list));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000762#else
763#ifdef __va_copy
Benjamin Peterson14339b62009-01-31 16:36:08 +0000764 __va_copy(count, vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000765#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000766 count = vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000767#endif
768#endif
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000769 /* step 1: count the number of %S/%R/%A/%s format specifications
770 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
771 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
772 * result in an array) */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000773 for (f = format; *f; f++) {
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000774 if (*f == '%') {
775 if (*(f+1)=='%')
776 continue;
777 if (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A')
778 ++callcount;
779 while (ISDIGIT((unsigned)*f))
780 width = (width*10) + *f++ - '0';
781 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
782 ;
783 if (*f == 's')
784 ++callcount;
785 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000786 }
787 /* step 2: allocate memory for the results of
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000788 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000789 if (callcount) {
790 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
791 if (!callresults) {
792 PyErr_NoMemory();
793 return NULL;
794 }
795 callresult = callresults;
796 }
797 /* step 3: figure out how large a buffer we need */
798 for (f = format; *f; f++) {
799 if (*f == '%') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000800#ifdef HAVE_LONG_LONG
801 int longlongflag = 0;
802#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +0000803 const char* p = f;
804 width = 0;
805 while (ISDIGIT((unsigned)*f))
806 width = (width*10) + *f++ - '0';
807 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
808 ;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000809
Benjamin Peterson14339b62009-01-31 16:36:08 +0000810 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
811 * they don't affect the amount of space we reserve.
812 */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000813 if (*f == 'l') {
814 if (f[1] == 'd' || f[1] == 'u') {
815 ++f;
816 }
817#ifdef HAVE_LONG_LONG
818 else if (f[1] == 'l' &&
819 (f[2] == 'd' || f[2] == 'u')) {
820 longlongflag = 1;
821 f += 2;
822 }
823#endif
824 }
825 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000826 ++f;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000827 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000828
Benjamin Peterson14339b62009-01-31 16:36:08 +0000829 switch (*f) {
830 case 'c':
831 (void)va_arg(count, int);
832 /* fall through... */
833 case '%':
834 n++;
835 break;
836 case 'd': case 'u': case 'i': case 'x':
837 (void) va_arg(count, int);
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000838#ifdef HAVE_LONG_LONG
839 if (longlongflag) {
840 if (width < MAX_LONG_LONG_CHARS)
841 width = MAX_LONG_LONG_CHARS;
842 }
843 else
844#endif
845 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
846 including sign. Decimal takes the most space. This
847 isn't enough for octal. If a width is specified we
848 need more (which we allocate later). */
849 if (width < MAX_LONG_CHARS)
850 width = MAX_LONG_CHARS;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000851 n += width;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000852 /* XXX should allow for large precision here too. */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000853 if (abuffersize < width)
854 abuffersize = width;
855 break;
856 case 's':
857 {
858 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +0000859 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000860 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
861 if (!str)
862 goto fail;
863 n += PyUnicode_GET_SIZE(str);
864 /* Remember the str and switch to the next slot */
865 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000866 break;
867 }
868 case 'U':
869 {
870 PyObject *obj = va_arg(count, PyObject *);
871 assert(obj && PyUnicode_Check(obj));
872 n += PyUnicode_GET_SIZE(obj);
873 break;
874 }
875 case 'V':
876 {
877 PyObject *obj = va_arg(count, PyObject *);
878 const char *str = va_arg(count, const char *);
879 assert(obj || str);
880 assert(!obj || PyUnicode_Check(obj));
881 if (obj)
882 n += PyUnicode_GET_SIZE(obj);
883 else
884 n += strlen(str);
885 break;
886 }
887 case 'S':
888 {
889 PyObject *obj = va_arg(count, PyObject *);
890 PyObject *str;
891 assert(obj);
892 str = PyObject_Str(obj);
893 if (!str)
894 goto fail;
895 n += PyUnicode_GET_SIZE(str);
896 /* Remember the str and switch to the next slot */
897 *callresult++ = str;
898 break;
899 }
900 case 'R':
901 {
902 PyObject *obj = va_arg(count, PyObject *);
903 PyObject *repr;
904 assert(obj);
905 repr = PyObject_Repr(obj);
906 if (!repr)
907 goto fail;
908 n += PyUnicode_GET_SIZE(repr);
909 /* Remember the repr and switch to the next slot */
910 *callresult++ = repr;
911 break;
912 }
913 case 'A':
914 {
915 PyObject *obj = va_arg(count, PyObject *);
916 PyObject *ascii;
917 assert(obj);
918 ascii = PyObject_ASCII(obj);
919 if (!ascii)
920 goto fail;
921 n += PyUnicode_GET_SIZE(ascii);
922 /* Remember the repr and switch to the next slot */
923 *callresult++ = ascii;
924 break;
925 }
926 case 'p':
927 (void) va_arg(count, int);
928 /* maximum 64-bit pointer representation:
929 * 0xffffffffffffffff
930 * so 19 characters is enough.
931 * XXX I count 18 -- what's the extra for?
932 */
933 n += 19;
934 break;
935 default:
936 /* if we stumble upon an unknown
937 formatting code, copy the rest of
938 the format string to the output
939 string. (we cannot just skip the
940 code, since there's no way to know
941 what's in the argument list) */
942 n += strlen(p);
943 goto expand;
944 }
945 } else
946 n++;
947 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000948 expand:
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000949 if (abuffersize > ITEM_BUFFER_LEN) {
950 /* add 1 for sprintf's trailing null byte */
951 abuffer = PyObject_Malloc(abuffersize + 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +0000952 if (!abuffer) {
953 PyErr_NoMemory();
954 goto fail;
955 }
956 realbuffer = abuffer;
957 }
958 else
959 realbuffer = buffer;
960 /* step 4: fill the buffer */
961 /* Since we've analyzed how much space we need for the worst case,
962 we don't have to resize the string.
963 There can be no errors beyond this point. */
964 string = PyUnicode_FromUnicode(NULL, n);
965 if (!string)
966 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000967
Benjamin Peterson14339b62009-01-31 16:36:08 +0000968 s = PyUnicode_AS_UNICODE(string);
969 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000970
Benjamin Peterson14339b62009-01-31 16:36:08 +0000971 for (f = format; *f; f++) {
972 if (*f == '%') {
973 const char* p = f++;
974 int longflag = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000975 int longlongflag = 0;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000976 int size_tflag = 0;
977 zeropad = (*f == '0');
978 /* parse the width.precision part */
979 width = 0;
980 while (ISDIGIT((unsigned)*f))
981 width = (width*10) + *f++ - '0';
982 precision = 0;
983 if (*f == '.') {
984 f++;
985 while (ISDIGIT((unsigned)*f))
986 precision = (precision*10) + *f++ - '0';
987 }
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000988 /* Handle %ld, %lu, %lld and %llu. */
989 if (*f == 'l') {
990 if (f[1] == 'd' || f[1] == 'u') {
991 longflag = 1;
992 ++f;
993 }
994#ifdef HAVE_LONG_LONG
995 else if (f[1] == 'l' &&
996 (f[2] == 'd' || f[2] == 'u')) {
997 longlongflag = 1;
998 f += 2;
999 }
1000#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001001 }
1002 /* handle the size_t flag. */
1003 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
1004 size_tflag = 1;
1005 ++f;
1006 }
Walter Dörwaldd2034312007-05-18 16:29:38 +00001007
Benjamin Peterson14339b62009-01-31 16:36:08 +00001008 switch (*f) {
1009 case 'c':
1010 *s++ = va_arg(vargs, int);
1011 break;
1012 case 'd':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001013 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1014 width, precision, 'd');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001015 if (longflag)
1016 sprintf(realbuffer, fmt, va_arg(vargs, long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001017#ifdef HAVE_LONG_LONG
1018 else if (longlongflag)
1019 sprintf(realbuffer, fmt, va_arg(vargs, PY_LONG_LONG));
1020#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001021 else if (size_tflag)
1022 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
1023 else
1024 sprintf(realbuffer, fmt, va_arg(vargs, int));
1025 appendstring(realbuffer);
1026 break;
1027 case 'u':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001028 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1029 width, precision, 'u');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001030 if (longflag)
1031 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001032#ifdef HAVE_LONG_LONG
1033 else if (longlongflag)
1034 sprintf(realbuffer, fmt, va_arg(vargs,
1035 unsigned PY_LONG_LONG));
1036#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001037 else if (size_tflag)
1038 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
1039 else
1040 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
1041 appendstring(realbuffer);
1042 break;
1043 case 'i':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001044 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'i');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001045 sprintf(realbuffer, fmt, va_arg(vargs, int));
1046 appendstring(realbuffer);
1047 break;
1048 case 'x':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001049 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001050 sprintf(realbuffer, fmt, va_arg(vargs, int));
1051 appendstring(realbuffer);
1052 break;
1053 case 's':
1054 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001055 /* unused, since we already have the result */
1056 (void) va_arg(vargs, char *);
1057 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
1058 PyUnicode_GET_SIZE(*callresult));
1059 s += PyUnicode_GET_SIZE(*callresult);
1060 /* We're done with the unicode()/repr() => forget it */
1061 Py_DECREF(*callresult);
1062 /* switch to next unicode()/repr() result */
1063 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001064 break;
1065 }
1066 case 'U':
1067 {
1068 PyObject *obj = va_arg(vargs, PyObject *);
1069 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1070 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1071 s += size;
1072 break;
1073 }
1074 case 'V':
1075 {
1076 PyObject *obj = va_arg(vargs, PyObject *);
1077 const char *str = va_arg(vargs, const char *);
1078 if (obj) {
1079 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1080 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1081 s += size;
1082 } else {
1083 appendstring(str);
1084 }
1085 break;
1086 }
1087 case 'S':
1088 case 'R':
1089 {
1090 Py_UNICODE *ucopy;
1091 Py_ssize_t usize;
1092 Py_ssize_t upos;
1093 /* unused, since we already have the result */
1094 (void) va_arg(vargs, PyObject *);
1095 ucopy = PyUnicode_AS_UNICODE(*callresult);
1096 usize = PyUnicode_GET_SIZE(*callresult);
1097 for (upos = 0; upos<usize;)
1098 *s++ = ucopy[upos++];
1099 /* We're done with the unicode()/repr() => forget it */
1100 Py_DECREF(*callresult);
1101 /* switch to next unicode()/repr() result */
1102 ++callresult;
1103 break;
1104 }
1105 case 'p':
1106 sprintf(buffer, "%p", va_arg(vargs, void*));
1107 /* %p is ill-defined: ensure leading 0x. */
1108 if (buffer[1] == 'X')
1109 buffer[1] = 'x';
1110 else if (buffer[1] != 'x') {
1111 memmove(buffer+2, buffer, strlen(buffer)+1);
1112 buffer[0] = '0';
1113 buffer[1] = 'x';
1114 }
1115 appendstring(buffer);
1116 break;
1117 case '%':
1118 *s++ = '%';
1119 break;
1120 default:
1121 appendstring(p);
1122 goto end;
1123 }
1124 } else
1125 *s++ = *f;
1126 }
Walter Dörwaldd2034312007-05-18 16:29:38 +00001127
Benjamin Peterson29060642009-01-31 22:14:21 +00001128 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001129 if (callresults)
1130 PyObject_Free(callresults);
1131 if (abuffer)
1132 PyObject_Free(abuffer);
1133 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1134 return string;
Benjamin Peterson29060642009-01-31 22:14:21 +00001135 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001136 if (callresults) {
1137 PyObject **callresult2 = callresults;
1138 while (callresult2 < callresult) {
1139 Py_DECREF(*callresult2);
1140 ++callresult2;
1141 }
1142 PyObject_Free(callresults);
1143 }
1144 if (abuffer)
1145 PyObject_Free(abuffer);
1146 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001147}
1148
1149#undef appendstring
1150
1151PyObject *
1152PyUnicode_FromFormat(const char *format, ...)
1153{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001154 PyObject* ret;
1155 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001156
1157#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00001158 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001159#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001160 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001161#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001162 ret = PyUnicode_FromFormatV(format, vargs);
1163 va_end(vargs);
1164 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001165}
1166
Martin v. Löwis18e16552006-02-15 17:27:45 +00001167Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00001168 wchar_t *w,
1169 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001170{
1171 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001172 PyErr_BadInternalCall();
1173 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001174 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001175
1176 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001177 if (size > PyUnicode_GET_SIZE(unicode))
Benjamin Peterson29060642009-01-31 22:14:21 +00001178 size = PyUnicode_GET_SIZE(unicode) + 1;
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001179
Guido van Rossumd57fd912000-03-10 22:53:23 +00001180#ifdef HAVE_USABLE_WCHAR_T
1181 memcpy(w, unicode->str, size * sizeof(wchar_t));
1182#else
1183 {
Benjamin Peterson29060642009-01-31 22:14:21 +00001184 register Py_UNICODE *u;
1185 register Py_ssize_t i;
1186 u = PyUnicode_AS_UNICODE(unicode);
1187 for (i = size; i > 0; i--)
1188 *w++ = *u++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001189 }
1190#endif
1191
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001192 if (size > PyUnicode_GET_SIZE(unicode))
1193 return PyUnicode_GET_SIZE(unicode);
1194 else
Benjamin Peterson29060642009-01-31 22:14:21 +00001195 return size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001196}
1197
1198#endif
1199
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001200PyObject *PyUnicode_FromOrdinal(int ordinal)
1201{
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001202 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001203
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001204 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001205 PyErr_SetString(PyExc_ValueError,
1206 "chr() arg not in range(0x110000)");
1207 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001208 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001209
1210#ifndef Py_UNICODE_WIDE
1211 if (ordinal > 0xffff) {
1212 ordinal -= 0x10000;
1213 s[0] = 0xD800 | (ordinal >> 10);
1214 s[1] = 0xDC00 | (ordinal & 0x3FF);
1215 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001216 }
1217#endif
1218
Hye-Shik Chang40574832004-04-06 07:24:51 +00001219 s[0] = (Py_UNICODE)ordinal;
1220 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001221}
1222
Guido van Rossumd57fd912000-03-10 22:53:23 +00001223PyObject *PyUnicode_FromObject(register PyObject *obj)
1224{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001225 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00001226 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001227 if (PyUnicode_CheckExact(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001228 Py_INCREF(obj);
1229 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001230 }
1231 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001232 /* For a Unicode subtype that's not a Unicode object,
1233 return a true Unicode object with the same data. */
1234 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1235 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001236 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001237 PyErr_Format(PyExc_TypeError,
1238 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00001239 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001240 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001241}
1242
1243PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00001244 const char *encoding,
1245 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001246{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001247 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001248 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001249 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001250
Guido van Rossumd57fd912000-03-10 22:53:23 +00001251 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001252 PyErr_BadInternalCall();
1253 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001254 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001255
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001256 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001257 PyErr_SetString(PyExc_TypeError,
1258 "decoding str is not supported");
1259 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001260 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001261
1262 /* Coerce object */
Christian Heimes72b710a2008-05-26 13:28:38 +00001263 if (PyBytes_Check(obj)) {
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001264 s = PyBytes_AS_STRING(obj);
1265 len = PyBytes_GET_SIZE(obj);
1266 }
1267 else if (PyByteArray_Check(obj)) {
1268 s = PyByteArray_AS_STRING(obj);
1269 len = PyByteArray_GET_SIZE(obj);
1270 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001271 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001272 /* Overwrite the error message with something more useful in
1273 case of a TypeError. */
1274 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001275 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001276 "coercing to str: need string or buffer, "
1277 "%.80s found",
1278 Py_TYPE(obj)->tp_name);
1279 goto onError;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001280 }
Tim Petersced69f82003-09-16 20:30:58 +00001281
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001282 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001283 if (len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001284 Py_INCREF(unicode_empty);
1285 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001286 }
Tim Petersced69f82003-09-16 20:30:58 +00001287 else
Benjamin Peterson29060642009-01-31 22:14:21 +00001288 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001289
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001290 return v;
1291
Benjamin Peterson29060642009-01-31 22:14:21 +00001292 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001293 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001294}
1295
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8, writing the NUL-terminated result to lower, a buffer
   of lower_len bytes.

   If the normalized name does not fit (it is longer than lower_len-1
   characters), lower is set to the empty string instead of a truncated
   name.  A truncated name could compare equal to a shorter, different
   encoding name (e.g. "iso-8859-15" cut to "iso-8859-1"), which would
   make callers pick the wrong shortcut; the empty string matches none.

   Nothing is written when lower_len is 0. */
static void normalize_encoding(const char *encoding,
                               char *lower,
                               size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    if (lower_len == 0)
        return;

    e = encoding;
    l = lower;
    /* last usable position; reserved for the terminating NUL */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end) {
            /* name too long for the buffer: report "no match" */
            lower[0] = '\0';
            return;
        }
        if (ISUPPER(*e)) {
            *l++ = TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
}
1324
1325PyObject *PyUnicode_Decode(const char *s,
1326 Py_ssize_t size,
1327 const char *encoding,
1328 const char *errors)
1329{
1330 PyObject *buffer = NULL, *unicode;
1331 Py_buffer info;
1332 char lower[11]; /* Enough for any encoding shortcut */
1333
1334 if (encoding == NULL)
1335 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001336
1337 /* Shortcuts for common default encodings */
Victor Stinner600d3be2010-06-10 12:00:55 +00001338 normalize_encoding(encoding, lower, sizeof(lower));
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001339 if (strcmp(lower, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001340 return PyUnicode_DecodeUTF8(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001341 else if ((strcmp(lower, "latin-1") == 0) ||
1342 (strcmp(lower, "iso-8859-1") == 0))
Fred Drakee4315f52000-05-09 19:53:39 +00001343 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001344#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001345 else if (strcmp(lower, "mbcs") == 0)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001346 return PyUnicode_DecodeMBCS(s, size, errors);
1347#endif
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001348 else if (strcmp(lower, "ascii") == 0)
Fred Drakee4315f52000-05-09 19:53:39 +00001349 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001350 else if (strcmp(lower, "utf-16") == 0)
1351 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1352 else if (strcmp(lower, "utf-32") == 0)
1353 return PyUnicode_DecodeUTF32(s, size, errors, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001354
1355 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001356 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00001357 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001358 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00001359 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001360 if (buffer == NULL)
1361 goto onError;
1362 unicode = PyCodec_Decode(buffer, encoding, errors);
1363 if (unicode == NULL)
1364 goto onError;
1365 if (!PyUnicode_Check(unicode)) {
1366 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001367 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00001368 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001369 Py_DECREF(unicode);
1370 goto onError;
1371 }
1372 Py_DECREF(buffer);
1373 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001374
Benjamin Peterson29060642009-01-31 22:14:21 +00001375 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001376 Py_XDECREF(buffer);
1377 return NULL;
1378}
1379
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001380PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1381 const char *encoding,
1382 const char *errors)
1383{
1384 PyObject *v;
1385
1386 if (!PyUnicode_Check(unicode)) {
1387 PyErr_BadArgument();
1388 goto onError;
1389 }
1390
1391 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001392 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001393
1394 /* Decode via the codec registry */
1395 v = PyCodec_Decode(unicode, encoding, errors);
1396 if (v == NULL)
1397 goto onError;
1398 return v;
1399
Benjamin Peterson29060642009-01-31 22:14:21 +00001400 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001401 return NULL;
1402}
1403
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001404PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode,
1405 const char *encoding,
1406 const char *errors)
1407{
1408 PyObject *v;
1409
1410 if (!PyUnicode_Check(unicode)) {
1411 PyErr_BadArgument();
1412 goto onError;
1413 }
1414
1415 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001416 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001417
1418 /* Decode via the codec registry */
1419 v = PyCodec_Decode(unicode, encoding, errors);
1420 if (v == NULL)
1421 goto onError;
1422 if (!PyUnicode_Check(v)) {
1423 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001424 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001425 Py_TYPE(v)->tp_name);
1426 Py_DECREF(v);
1427 goto onError;
1428 }
1429 return v;
1430
Benjamin Peterson29060642009-01-31 22:14:21 +00001431 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001432 return NULL;
1433}
1434
Guido van Rossumd57fd912000-03-10 22:53:23 +00001435PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001436 Py_ssize_t size,
1437 const char *encoding,
1438 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001439{
1440 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001441
Guido van Rossumd57fd912000-03-10 22:53:23 +00001442 unicode = PyUnicode_FromUnicode(s, size);
1443 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001444 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001445 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1446 Py_DECREF(unicode);
1447 return v;
1448}
1449
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001450PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1451 const char *encoding,
1452 const char *errors)
1453{
1454 PyObject *v;
1455
1456 if (!PyUnicode_Check(unicode)) {
1457 PyErr_BadArgument();
1458 goto onError;
1459 }
1460
1461 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001462 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001463
1464 /* Encode via the codec registry */
1465 v = PyCodec_Encode(unicode, encoding, errors);
1466 if (v == NULL)
1467 goto onError;
1468 return v;
1469
Benjamin Peterson29060642009-01-31 22:14:21 +00001470 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001471 return NULL;
1472}
1473
Victor Stinnerae6265f2010-05-15 16:27:27 +00001474PyObject *PyUnicode_EncodeFSDefault(PyObject *unicode)
1475{
1476 if (Py_FileSystemDefaultEncoding)
1477 return PyUnicode_AsEncodedString(unicode,
1478 Py_FileSystemDefaultEncoding,
1479 "surrogateescape");
1480 else
1481 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1482 PyUnicode_GET_SIZE(unicode),
1483 "surrogateescape");
1484}
1485
Guido van Rossumd57fd912000-03-10 22:53:23 +00001486PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1487 const char *encoding,
1488 const char *errors)
1489{
1490 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00001491 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00001492
Guido van Rossumd57fd912000-03-10 22:53:23 +00001493 if (!PyUnicode_Check(unicode)) {
1494 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001495 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001496 }
Fred Drakee4315f52000-05-09 19:53:39 +00001497
Tim Petersced69f82003-09-16 20:30:58 +00001498 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001499 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001500
1501 /* Shortcuts for common default encodings */
Victor Stinner600d3be2010-06-10 12:00:55 +00001502 normalize_encoding(encoding, lower, sizeof(lower));
1503 if (strcmp(lower, "utf-8") == 0)
Victor Stinner59e62db2010-05-15 13:14:32 +00001504 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1505 PyUnicode_GET_SIZE(unicode),
1506 errors);
Victor Stinner600d3be2010-06-10 12:00:55 +00001507 else if ((strcmp(lower, "latin-1") == 0) ||
1508 (strcmp(lower, "iso-8859-1") == 0))
Victor Stinner59e62db2010-05-15 13:14:32 +00001509 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1510 PyUnicode_GET_SIZE(unicode),
1511 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001512#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinner600d3be2010-06-10 12:00:55 +00001513 else if (strcmp(lower, "mbcs") == 0)
Victor Stinner59e62db2010-05-15 13:14:32 +00001514 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1515 PyUnicode_GET_SIZE(unicode),
1516 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001517#endif
Victor Stinner600d3be2010-06-10 12:00:55 +00001518 else if (strcmp(lower, "ascii") == 0)
Victor Stinner59e62db2010-05-15 13:14:32 +00001519 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1520 PyUnicode_GET_SIZE(unicode),
1521 errors);
1522 /* During bootstrap, we may need to find the encodings
1523 package, to load the file system encoding, and require the
1524 file system encoding in order to load the encodings
1525 package.
Christian Heimes6a27efa2008-10-30 21:48:26 +00001526
Victor Stinner59e62db2010-05-15 13:14:32 +00001527 Break out of this dependency by assuming that the path to
1528 the encodings module is ASCII-only. XXX could try wcstombs
1529 instead, if the file system encoding is the locale's
1530 encoding. */
1531 else if (Py_FileSystemDefaultEncoding &&
1532 strcmp(encoding, Py_FileSystemDefaultEncoding) == 0 &&
1533 !PyThreadState_GET()->interp->codecs_initialized)
1534 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1535 PyUnicode_GET_SIZE(unicode),
1536 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001537
1538 /* Encode via the codec registry */
1539 v = PyCodec_Encode(unicode, encoding, errors);
1540 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001541 return NULL;
1542
1543 /* The normal path */
1544 if (PyBytes_Check(v))
1545 return v;
1546
1547 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001548 if (PyByteArray_Check(v)) {
1549 char msg[100];
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001550 PyObject *b;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001551 PyOS_snprintf(msg, sizeof(msg),
1552 "encoder %s returned buffer instead of bytes",
1553 encoding);
1554 if (PyErr_WarnEx(PyExc_RuntimeWarning, msg, 1) < 0) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001555 Py_DECREF(v);
1556 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001557 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001558
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001559 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1560 Py_DECREF(v);
1561 return b;
1562 }
1563
1564 PyErr_Format(PyExc_TypeError,
1565 "encoder did not return a bytes object (type=%.400s)",
1566 Py_TYPE(v)->tp_name);
1567 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001568 return NULL;
1569}
1570
1571PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode,
1572 const char *encoding,
1573 const char *errors)
1574{
1575 PyObject *v;
1576
1577 if (!PyUnicode_Check(unicode)) {
1578 PyErr_BadArgument();
1579 goto onError;
1580 }
1581
1582 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001583 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001584
1585 /* Encode via the codec registry */
1586 v = PyCodec_Encode(unicode, encoding, errors);
1587 if (v == NULL)
1588 goto onError;
1589 if (!PyUnicode_Check(v)) {
1590 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001591 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001592 Py_TYPE(v)->tp_name);
1593 Py_DECREF(v);
1594 goto onError;
1595 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001596 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001597
Benjamin Peterson29060642009-01-31 22:14:21 +00001598 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001599 return NULL;
1600}
1601
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001602PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00001603 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001604{
1605 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001606 if (v)
1607 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001608 if (errors != NULL)
1609 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum98297ee2007-11-06 21:34:58 +00001610 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Guido van Rossum06610092007-08-16 21:02:22 +00001611 PyUnicode_GET_SIZE(unicode),
1612 NULL);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001613 if (!v)
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001614 return NULL;
Guido van Rossume7a0d392007-07-12 07:53:00 +00001615 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001616 return v;
1617}
1618
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001619PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00001620PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001621 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00001622 return PyUnicode_DecodeFSDefaultAndSize(s, size);
1623}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001624
Christian Heimes5894ba72007-11-04 11:43:14 +00001625PyObject*
1626PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1627{
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001628 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1629 can be undefined. If it is case, decode using UTF-8. The following assumes
1630 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1631 bootstrapping process where the codecs aren't ready yet.
1632 */
1633 if (Py_FileSystemDefaultEncoding) {
1634#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Christian Heimes5894ba72007-11-04 11:43:14 +00001635 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0) {
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00001636 return PyUnicode_DecodeMBCS(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001637 }
1638#elif defined(__APPLE__)
Christian Heimes5894ba72007-11-04 11:43:14 +00001639 if (strcmp(Py_FileSystemDefaultEncoding, "utf-8") == 0) {
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00001640 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001641 }
1642#endif
1643 return PyUnicode_Decode(s, size,
1644 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00001645 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001646 }
1647 else {
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00001648 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001649 }
1650}
1651
Martin v. Löwis011e8422009-05-05 04:43:17 +00001652/* Convert the argument to a bytes object, according to the file
Gregory P. Smithcc47d8c2010-02-27 08:33:11 +00001653 system encoding. The addr param must be a PyObject**.
1654 This is designed to be used with "O&" in PyArg_Parse APIs. */
Martin v. Löwis011e8422009-05-05 04:43:17 +00001655
1656int
1657PyUnicode_FSConverter(PyObject* arg, void* addr)
1658{
1659 PyObject *output = NULL;
1660 Py_ssize_t size;
1661 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001662 if (arg == NULL) {
1663 Py_DECREF(*(PyObject**)addr);
1664 return 1;
1665 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00001666 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00001667 output = arg;
1668 Py_INCREF(output);
1669 }
1670 else {
1671 arg = PyUnicode_FromObject(arg);
1672 if (!arg)
1673 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00001674 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001675 Py_DECREF(arg);
1676 if (!output)
1677 return 0;
1678 if (!PyBytes_Check(output)) {
1679 Py_DECREF(output);
1680 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
1681 return 0;
1682 }
1683 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00001684 size = PyBytes_GET_SIZE(output);
1685 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001686 if (size != strlen(data)) {
1687 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1688 Py_DECREF(output);
1689 return 0;
1690 }
1691 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001692 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001693}
1694
1695
Martin v. Löwis5b222132007-06-10 09:51:05 +00001696char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001697_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001698{
Christian Heimesf3863112007-11-22 07:46:41 +00001699 PyObject *bytes;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001700 if (!PyUnicode_Check(unicode)) {
1701 PyErr_BadArgument();
1702 return NULL;
1703 }
Christian Heimesf3863112007-11-22 07:46:41 +00001704 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1705 if (bytes == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001706 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001707 if (psize != NULL)
Christian Heimes72b710a2008-05-26 13:28:38 +00001708 *psize = PyBytes_GET_SIZE(bytes);
1709 return PyBytes_AS_STRING(bytes);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001710}
1711
1712char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001713_PyUnicode_AsString(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001714{
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001715 return _PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001716}
1717
Guido van Rossumd57fd912000-03-10 22:53:23 +00001718Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1719{
1720 if (!PyUnicode_Check(unicode)) {
1721 PyErr_BadArgument();
1722 goto onError;
1723 }
1724 return PyUnicode_AS_UNICODE(unicode);
1725
Benjamin Peterson29060642009-01-31 22:14:21 +00001726 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001727 return NULL;
1728}
1729
Martin v. Löwis18e16552006-02-15 17:27:45 +00001730Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001731{
1732 if (!PyUnicode_Check(unicode)) {
1733 PyErr_BadArgument();
1734 goto onError;
1735 }
1736 return PyUnicode_GET_SIZE(unicode);
1737
Benjamin Peterson29060642009-01-31 22:14:21 +00001738 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001739 return -1;
1740}
1741
Thomas Wouters78890102000-07-22 19:25:51 +00001742const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001743{
1744 return unicode_default_encoding;
1745}
1746
1747int PyUnicode_SetDefaultEncoding(const char *encoding)
1748{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001749 if (strcmp(encoding, unicode_default_encoding) != 0) {
1750 PyErr_Format(PyExc_ValueError,
1751 "Can only set default encoding to %s",
1752 unicode_default_encoding);
1753 return -1;
1754 }
Fred Drakee4315f52000-05-09 19:53:39 +00001755 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +00001756}
1757
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001758/* error handling callback helper:
1759 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001760 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001761 and adjust various state variables.
1762 return 0 on success, -1 on error
1763*/
1764
1765static
1766int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00001767 const char *encoding, const char *reason,
1768 const char **input, const char **inend, Py_ssize_t *startinpos,
1769 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1770 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001771{
Benjamin Peterson142957c2008-07-04 19:55:29 +00001772 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001773
1774 PyObject *restuple = NULL;
1775 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001776 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001777 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001778 Py_ssize_t requiredsize;
1779 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001780 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001781 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001782 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001783 int res = -1;
1784
1785 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001786 *errorHandler = PyCodec_LookupError(errors);
1787 if (*errorHandler == NULL)
1788 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001789 }
1790
1791 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00001792 *exceptionObject = PyUnicodeDecodeError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00001793 encoding, *input, *inend-*input, *startinpos, *endinpos, reason);
1794 if (*exceptionObject == NULL)
1795 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001796 }
1797 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00001798 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1799 goto onError;
1800 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1801 goto onError;
1802 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1803 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001804 }
1805
1806 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1807 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001808 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001809 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00001810 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00001811 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001812 }
1813 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00001814 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001815
1816 /* Copy back the bytes variables, which might have been modified by the
1817 callback */
1818 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
1819 if (!inputobj)
1820 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00001821 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001822 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00001823 }
Christian Heimes72b710a2008-05-26 13:28:38 +00001824 *input = PyBytes_AS_STRING(inputobj);
1825 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001826 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00001827 /* we can DECREF safely, as the exception has another reference,
1828 so the object won't go away. */
1829 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001830
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001831 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001832 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001833 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001834 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1835 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001836 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001837
1838 /* need more space? (at least enough for what we
1839 have+the replacement+the rest of the string (starting
1840 at the new input position), so we won't have to check space
1841 when there are no errors in the rest of the string) */
1842 repptr = PyUnicode_AS_UNICODE(repunicode);
1843 repsize = PyUnicode_GET_SIZE(repunicode);
1844 requiredsize = *outpos + repsize + insize-newpos;
1845 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001846 if (requiredsize<2*outsize)
1847 requiredsize = 2*outsize;
1848 if (_PyUnicode_Resize(output, requiredsize) < 0)
1849 goto onError;
1850 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001851 }
1852 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001853 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001854 Py_UNICODE_COPY(*outptr, repptr, repsize);
1855 *outptr += repsize;
1856 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001857
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001858 /* we made it! */
1859 res = 0;
1860
Benjamin Peterson29060642009-01-31 22:14:21 +00001861 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001862 Py_XDECREF(restuple);
1863 return res;
1864}
1865
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001866/* --- UTF-7 Codec -------------------------------------------------------- */
1867
Antoine Pitrou244651a2009-05-04 18:56:13 +00001868/* See RFC2152 for details. We encode conservatively and decode liberally. */
1869
/* Three small helper macros for base-64 handling.  All of them may
   evaluate their argument more than once. */

/* True if c is one of the 64 base-64 characters (A-Z, a-z, 0-9, +, /). */

#define IS_BASE64(c)                    \
    (('A' <= (c) && (c) <= 'Z') ||      \
     ('a' <= (c) && (c) <= 'z') ||      \
     ('0' <= (c) && (c) <= '9') ||      \
     (c) == '+' || (c) == '/')

/* Value (0..63) of the base-64 character c.  Only meaningful when
   IS_BASE64(c) holds; any other input yields 63. */

#define FROM_BASE64(c)                                  \
    (('A' <= (c) && (c) <= 'Z') ? (c) - 'A' :           \
     ('a' <= (c) && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ('0' <= (c) && (c) <= '9') ? (c) - '0' + 52 :      \
     (c) == '+' ? 62 : 63)

/* Base-64 character for the low 6 bits of n. */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: true if this byte, found in a UTF-7 string outside a
 * base-64 section, decodes to itself.  Decoding is permissive: every
 * ASCII byte qualifies except '+', which starts a base-64 section. */

#define DECODE_DIRECT(c)                \
    ((c) < 128 && (c) != '+')
1900
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code (0..127) and is only ever read,
 * hence const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c is range-checked (0 < c < 128) before it is used as a table index,
 * so NUL and non-ASCII values are never direct.  directO / directWS are
 * flags allowing Set O / whitespace characters through unencoded. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directWS && (utf7_category[(c)] == 2)) ||        \
      (directO && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001946
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001947PyObject *PyUnicode_DecodeUTF7(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001948 Py_ssize_t size,
1949 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001950{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001951 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1952}
1953
Antoine Pitrou244651a2009-05-04 18:56:13 +00001954/* The decoder. The only state we preserve is our read position,
1955 * i.e. how many characters we have consumed. So if we end in the
1956 * middle of a shift sequence we have to back off the read position
1957 * and the output to the beginning of the sequence, otherwise we lose
1958 * all the shift state (seen bits, number of bits seen, high
1959 * surrogate). */
1960
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001961PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001962 Py_ssize_t size,
1963 const char *errors,
1964 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001965{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001966 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001967 Py_ssize_t startinpos;
1968 Py_ssize_t endinpos;
1969 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001970 const char *e;
1971 PyUnicodeObject *unicode;
1972 Py_UNICODE *p;
1973 const char *errmsg = "";
1974 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00001975 Py_UNICODE *shiftOutStart;
1976 unsigned int base64bits = 0;
1977 unsigned long base64buffer = 0;
1978 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001979 PyObject *errorHandler = NULL;
1980 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001981
1982 unicode = _PyUnicode_New(size);
1983 if (!unicode)
1984 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001985 if (size == 0) {
1986 if (consumed)
1987 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001988 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001989 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001990
1991 p = unicode->str;
Antoine Pitrou244651a2009-05-04 18:56:13 +00001992 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001993 e = s + size;
1994
1995 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001996 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00001997 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00001998 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001999
Antoine Pitrou244651a2009-05-04 18:56:13 +00002000 if (inShift) { /* in a base-64 section */
2001 if (IS_BASE64(ch)) { /* consume a base-64 character */
2002 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
2003 base64bits += 6;
2004 s++;
2005 if (base64bits >= 16) {
2006 /* we have enough bits for a UTF-16 value */
2007 Py_UNICODE outCh = (Py_UNICODE)
2008 (base64buffer >> (base64bits-16));
2009 base64bits -= 16;
2010 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
2011 if (surrogate) {
2012 /* expecting a second surrogate */
2013 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2014#ifdef Py_UNICODE_WIDE
2015 *p++ = (((surrogate & 0x3FF)<<10)
2016 | (outCh & 0x3FF)) + 0x10000;
2017#else
2018 *p++ = surrogate;
2019 *p++ = outCh;
2020#endif
2021 surrogate = 0;
2022 }
2023 else {
2024 surrogate = 0;
2025 errmsg = "second surrogate missing";
2026 goto utf7Error;
2027 }
2028 }
2029 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
2030 /* first surrogate */
2031 surrogate = outCh;
2032 }
2033 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2034 errmsg = "unexpected second surrogate";
2035 goto utf7Error;
2036 }
2037 else {
2038 *p++ = outCh;
2039 }
2040 }
2041 }
2042 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002043 inShift = 0;
2044 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002045 if (surrogate) {
2046 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00002047 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002048 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002049 if (base64bits > 0) { /* left-over bits */
2050 if (base64bits >= 6) {
2051 /* We've seen at least one base-64 character */
2052 errmsg = "partial character in shift sequence";
2053 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002054 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002055 else {
2056 /* Some bits remain; they should be zero */
2057 if (base64buffer != 0) {
2058 errmsg = "non-zero padding bits in shift sequence";
2059 goto utf7Error;
2060 }
2061 }
2062 }
2063 if (ch != '-') {
2064 /* '-' is absorbed; other terminating
2065 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002066 *p++ = ch;
2067 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002068 }
2069 }
2070 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002071 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002072 s++; /* consume '+' */
2073 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002074 s++;
2075 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00002076 }
2077 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002078 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002079 shiftOutStart = p;
2080 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002081 }
2082 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002083 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002084 *p++ = ch;
2085 s++;
2086 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002087 else {
2088 startinpos = s-starts;
2089 s++;
2090 errmsg = "unexpected special character";
2091 goto utf7Error;
2092 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002093 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002094utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002095 outpos = p-PyUnicode_AS_UNICODE(unicode);
2096 endinpos = s-starts;
2097 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00002098 errors, &errorHandler,
2099 "utf7", errmsg,
2100 &starts, &e, &startinpos, &endinpos, &exc, &s,
2101 &unicode, &outpos, &p))
2102 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002103 }
2104
Antoine Pitrou244651a2009-05-04 18:56:13 +00002105 /* end of string */
2106
2107 if (inShift && !consumed) { /* in shift sequence, no more to follow */
2108 /* if we're in an inconsistent state, that's an error */
2109 if (surrogate ||
2110 (base64bits >= 6) ||
2111 (base64bits > 0 && base64buffer != 0)) {
2112 outpos = p-PyUnicode_AS_UNICODE(unicode);
2113 endinpos = size;
2114 if (unicode_decode_call_errorhandler(
2115 errors, &errorHandler,
2116 "utf7", "unterminated shift sequence",
2117 &starts, &e, &startinpos, &endinpos, &exc, &s,
2118 &unicode, &outpos, &p))
2119 goto onError;
2120 if (s < e)
2121 goto restart;
2122 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002123 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002124
2125 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002126 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00002127 if (inShift) {
2128 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002129 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002130 }
2131 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002132 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002133 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002134 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002135
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002136 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002137 goto onError;
2138
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002139 Py_XDECREF(errorHandler);
2140 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002141 return (PyObject *)unicode;
2142
Benjamin Peterson29060642009-01-31 22:14:21 +00002143 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002144 Py_XDECREF(errorHandler);
2145 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002146 Py_DECREF(unicode);
2147 return NULL;
2148}
2149
2150
2151PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002152 Py_ssize_t size,
Antoine Pitrou244651a2009-05-04 18:56:13 +00002153 int base64SetO,
2154 int base64WhiteSpace,
Benjamin Peterson29060642009-01-31 22:14:21 +00002155 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002156{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002157 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002158 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002159 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002160 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002161 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002162 unsigned int base64bits = 0;
2163 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002164 char * out;
2165 char * start;
2166
2167 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002168 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002169
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002170 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002171 return PyErr_NoMemory();
2172
Antoine Pitrou244651a2009-05-04 18:56:13 +00002173 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002174 if (v == NULL)
2175 return NULL;
2176
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002177 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002178 for (;i < size; ++i) {
2179 Py_UNICODE ch = s[i];
2180
Antoine Pitrou244651a2009-05-04 18:56:13 +00002181 if (inShift) {
2182 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2183 /* shifting out */
2184 if (base64bits) { /* output remaining bits */
2185 *out++ = TO_BASE64(base64buffer << (6-base64bits));
2186 base64buffer = 0;
2187 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002188 }
2189 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002190 /* Characters not in the BASE64 set implicitly unshift the sequence
2191 so no '-' is required, except if the character is itself a '-' */
2192 if (IS_BASE64(ch) || ch == '-') {
2193 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002194 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002195 *out++ = (char) ch;
2196 }
2197 else {
2198 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00002199 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002200 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002201 else { /* not in a shift sequence */
2202 if (ch == '+') {
2203 *out++ = '+';
2204 *out++ = '-';
2205 }
2206 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2207 *out++ = (char) ch;
2208 }
2209 else {
2210 *out++ = '+';
2211 inShift = 1;
2212 goto encode_char;
2213 }
2214 }
2215 continue;
2216encode_char:
2217#ifdef Py_UNICODE_WIDE
2218 if (ch >= 0x10000) {
2219 /* code first surrogate */
2220 base64bits += 16;
2221 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
2222 while (base64bits >= 6) {
2223 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2224 base64bits -= 6;
2225 }
2226 /* prepare second surrogate */
2227 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
2228 }
2229#endif
2230 base64bits += 16;
2231 base64buffer = (base64buffer << 16) | ch;
2232 while (base64bits >= 6) {
2233 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2234 base64bits -= 6;
2235 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00002236 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002237 if (base64bits)
2238 *out++= TO_BASE64(base64buffer << (6-base64bits) );
2239 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002240 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002241 if (_PyBytes_Resize(&v, out - start) < 0)
2242 return NULL;
2243 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002244}
2245
Antoine Pitrou244651a2009-05-04 18:56:13 +00002246#undef IS_BASE64
2247#undef FROM_BASE64
2248#undef TO_BASE64
2249#undef DECODE_DIRECT
2250#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002251
Guido van Rossumd57fd912000-03-10 22:53:23 +00002252/* --- UTF-8 Codec -------------------------------------------------------- */
2253
/* Map a UTF-8 lead byte to the total length, in bytes, of the sequence it
   introduces.  Zero marks a byte that can never start a sequence
   (continuation bytes 0x80..0xBF and the bytes 0xFE/0xFF).

   The 5- and 6-byte entries follow the original RFC 2279 layout; the
   current UTF-8 definition (RFC 3629) stops at 4 bytes.  They are kept so
   the decoder knows how many bytes such a sequence claims to span when it
   reports it as unsupported.

   The table is only ever read, hence const. */
static
const char utf8_code_length[256] = {
    /* 0x00..0x7F: single-byte (ASCII) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80..0xBF: continuation bytes, illegal as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0..0xDF: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0..0xEF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0..0xF7: four; 0xF8..0xFB: five; 0xFC..0xFD: six; 0xFE..0xFF: illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
2275
Guido van Rossumd57fd912000-03-10 22:53:23 +00002276PyObject *PyUnicode_DecodeUTF8(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002277 Py_ssize_t size,
2278 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002279{
Walter Dörwald69652032004-09-07 20:24:22 +00002280 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
2281}
2282
Antoine Pitrouab868312009-01-10 15:40:25 +00002283/* Mask to check or force alignment of a pointer to C 'long' boundaries */
2284#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
2285
2286/* Mask to quickly check whether a C 'long' contains a
2287 non-ASCII, UTF8-encoded char. */
2288#if (SIZEOF_LONG == 8)
2289# define ASCII_CHAR_MASK 0x8080808080808080L
2290#elif (SIZEOF_LONG == 4)
2291# define ASCII_CHAR_MASK 0x80808080L
2292#else
2293# error C 'long' size should be either 4 or 8!
2294#endif
2295
Walter Dörwald69652032004-09-07 20:24:22 +00002296PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002297 Py_ssize_t size,
2298 const char *errors,
2299 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002300{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002301 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002302 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002303 Py_ssize_t startinpos;
2304 Py_ssize_t endinpos;
2305 Py_ssize_t outpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00002306 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002307 PyUnicodeObject *unicode;
2308 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002309 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002310 PyObject *errorHandler = NULL;
2311 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002312
2313 /* Note: size will always be longer than the resulting Unicode
2314 character count */
2315 unicode = _PyUnicode_New(size);
2316 if (!unicode)
2317 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00002318 if (size == 0) {
2319 if (consumed)
2320 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002321 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00002322 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002323
2324 /* Unpack UTF-8 encoded data */
2325 p = unicode->str;
2326 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00002327 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002328
2329 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002330 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002331
2332 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00002333 /* Fast path for runs of ASCII characters. Given that common UTF-8
2334 input will consist of an overwhelming majority of ASCII
2335 characters, we try to optimize for this case by checking
2336 as many characters as a C 'long' can contain.
2337 First, check if we can do an aligned read, as most CPUs have
2338 a penalty for unaligned reads.
2339 */
2340 if (!((size_t) s & LONG_PTR_MASK)) {
2341 /* Help register allocation */
2342 register const char *_s = s;
2343 register Py_UNICODE *_p = p;
2344 while (_s < aligned_end) {
2345 /* Read a whole long at a time (either 4 or 8 bytes),
2346 and do a fast unrolled copy if it only contains ASCII
2347 characters. */
2348 unsigned long data = *(unsigned long *) _s;
2349 if (data & ASCII_CHAR_MASK)
2350 break;
2351 _p[0] = (unsigned char) _s[0];
2352 _p[1] = (unsigned char) _s[1];
2353 _p[2] = (unsigned char) _s[2];
2354 _p[3] = (unsigned char) _s[3];
2355#if (SIZEOF_LONG == 8)
2356 _p[4] = (unsigned char) _s[4];
2357 _p[5] = (unsigned char) _s[5];
2358 _p[6] = (unsigned char) _s[6];
2359 _p[7] = (unsigned char) _s[7];
2360#endif
2361 _s += SIZEOF_LONG;
2362 _p += SIZEOF_LONG;
2363 }
2364 s = _s;
2365 p = _p;
2366 if (s == e)
2367 break;
2368 ch = (unsigned char)*s;
2369 }
2370 }
2371
2372 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002373 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002374 s++;
2375 continue;
2376 }
2377
2378 n = utf8_code_length[ch];
2379
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002380 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002381 if (consumed)
2382 break;
2383 else {
2384 errmsg = "unexpected end of data";
2385 startinpos = s-starts;
2386 endinpos = size;
2387 goto utf8Error;
2388 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002389 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002390
2391 switch (n) {
2392
2393 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002394 errmsg = "unexpected code byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002395 startinpos = s-starts;
2396 endinpos = startinpos+1;
2397 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002398
2399 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002400 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00002401 startinpos = s-starts;
2402 endinpos = startinpos+1;
2403 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002404
2405 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002406 if ((s[1] & 0xc0) != 0x80) {
2407 errmsg = "invalid data";
Benjamin Peterson29060642009-01-31 22:14:21 +00002408 startinpos = s-starts;
2409 endinpos = startinpos+2;
2410 goto utf8Error;
2411 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002412 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002413 if (ch < 0x80) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002414 startinpos = s-starts;
2415 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002416 errmsg = "illegal encoding";
Benjamin Peterson29060642009-01-31 22:14:21 +00002417 goto utf8Error;
2418 }
2419 else
2420 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002421 break;
2422
2423 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00002424 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002425 (s[2] & 0xc0) != 0x80) {
2426 errmsg = "invalid data";
Benjamin Peterson29060642009-01-31 22:14:21 +00002427 startinpos = s-starts;
2428 endinpos = startinpos+3;
2429 goto utf8Error;
2430 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002431 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002432 if (ch < 0x0800 || (ch >= 0xd800 && ch <= 0xDFFF)) {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002433 errmsg = "illegal encoding";
Benjamin Peterson29060642009-01-31 22:14:21 +00002434 startinpos = s-starts;
2435 endinpos = startinpos+3;
2436 goto utf8Error;
2437 }
2438 else
2439 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002440 break;
2441
2442 case 4:
2443 if ((s[1] & 0xc0) != 0x80 ||
2444 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002445 (s[3] & 0xc0) != 0x80) {
2446 errmsg = "invalid data";
Benjamin Peterson29060642009-01-31 22:14:21 +00002447 startinpos = s-starts;
2448 endinpos = startinpos+4;
2449 goto utf8Error;
2450 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002451 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Benjamin Peterson29060642009-01-31 22:14:21 +00002452 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002453 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002454 if ((ch < 0x10000) /* minimum value allowed for 4
Benjamin Peterson29060642009-01-31 22:14:21 +00002455 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002456 || (ch > 0x10ffff)) /* maximum value allowed for
Benjamin Peterson29060642009-01-31 22:14:21 +00002457 UTF-16 */
2458 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002459 errmsg = "illegal encoding";
Benjamin Peterson29060642009-01-31 22:14:21 +00002460 startinpos = s-starts;
2461 endinpos = startinpos+4;
2462 goto utf8Error;
2463 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00002464#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002465 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002466#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002467 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002468
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002469 /* translate from 10000..10FFFF to 0..FFFF */
2470 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002471
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002472 /* high surrogate = top 10 bits added to D800 */
2473 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002474
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002475 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002476 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002477#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002478 break;
2479
2480 default:
2481 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002482 errmsg = "unsupported Unicode code range";
Benjamin Peterson29060642009-01-31 22:14:21 +00002483 startinpos = s-starts;
2484 endinpos = startinpos+n;
2485 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002486 }
2487 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00002488 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002489
Benjamin Peterson29060642009-01-31 22:14:21 +00002490 utf8Error:
2491 outpos = p-PyUnicode_AS_UNICODE(unicode);
2492 if (unicode_decode_call_errorhandler(
2493 errors, &errorHandler,
2494 "utf8", errmsg,
2495 &starts, &e, &startinpos, &endinpos, &exc, &s,
2496 &unicode, &outpos, &p))
2497 goto onError;
2498 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002499 }
Walter Dörwald69652032004-09-07 20:24:22 +00002500 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002501 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002502
2503 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002504 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002505 goto onError;
2506
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002507 Py_XDECREF(errorHandler);
2508 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002509 return (PyObject *)unicode;
2510
Benjamin Peterson29060642009-01-31 22:14:21 +00002511 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002512 Py_XDECREF(errorHandler);
2513 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002514 Py_DECREF(unicode);
2515 return NULL;
2516}
2517
Antoine Pitrouab868312009-01-10 15:40:25 +00002518#undef ASCII_CHAR_MASK
2519
2520
Tim Peters602f7402002-04-27 18:03:26 +00002521/* Allocation strategy: if the string is short, convert into a stack buffer
2522 and allocate exactly as much space needed at the end. Else allocate the
2523 maximum possible needed (4 result bytes per Unicode character), and return
2524 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002525*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002526PyObject *
2527PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002528 Py_ssize_t size,
2529 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002530{
Tim Peters602f7402002-04-27 18:03:26 +00002531#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002532
Guido van Rossum98297ee2007-11-06 21:34:58 +00002533 Py_ssize_t i; /* index into s of next input byte */
2534 PyObject *result; /* result string object */
2535 char *p; /* next free byte in output buffer */
2536 Py_ssize_t nallocated; /* number of result bytes allocated */
2537 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002538 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002539 PyObject *errorHandler = NULL;
2540 PyObject *exc = NULL;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002541
Tim Peters602f7402002-04-27 18:03:26 +00002542 assert(s != NULL);
2543 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002544
Tim Peters602f7402002-04-27 18:03:26 +00002545 if (size <= MAX_SHORT_UNICHARS) {
2546 /* Write into the stack buffer; nallocated can't overflow.
2547 * At the end, we'll allocate exactly as much heap space as it
2548 * turns out we need.
2549 */
2550 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002551 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00002552 p = stackbuf;
2553 }
2554 else {
2555 /* Overallocate on the heap, and give the excess back at the end. */
2556 nallocated = size * 4;
2557 if (nallocated / 4 != size) /* overflow! */
2558 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00002559 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002560 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00002561 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00002562 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002563 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002564
Tim Peters602f7402002-04-27 18:03:26 +00002565 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002566 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002567
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002568 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002569 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002570 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002571
Guido van Rossumd57fd912000-03-10 22:53:23 +00002572 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002573 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002574 *p++ = (char)(0xc0 | (ch >> 6));
2575 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002576 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002577#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00002578 /* Special case: check for high and low surrogate */
2579 if (ch <= 0xDBFF && i != size && 0xDC00 <= s[i] && s[i] <= 0xDFFF) {
2580 Py_UCS4 ch2 = s[i];
2581 /* Combine the two surrogates to form a UCS4 value */
2582 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2583 i++;
2584
2585 /* Encode UCS4 Unicode ordinals */
2586 *p++ = (char)(0xf0 | (ch >> 18));
2587 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
Tim Peters602f7402002-04-27 18:03:26 +00002588 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2589 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002590 } else {
Victor Stinner445a6232010-04-22 20:01:57 +00002591#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00002592 Py_ssize_t newpos;
2593 PyObject *rep;
2594 Py_ssize_t repsize, k;
2595 rep = unicode_encode_call_errorhandler
2596 (errors, &errorHandler, "utf-8", "surrogates not allowed",
2597 s, size, &exc, i-1, i, &newpos);
2598 if (!rep)
2599 goto error;
2600
2601 if (PyBytes_Check(rep))
2602 repsize = PyBytes_GET_SIZE(rep);
2603 else
2604 repsize = PyUnicode_GET_SIZE(rep);
2605
2606 if (repsize > 4) {
2607 Py_ssize_t offset;
2608
2609 if (result == NULL)
2610 offset = p - stackbuf;
2611 else
2612 offset = p - PyBytes_AS_STRING(result);
2613
2614 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
2615 /* integer overflow */
2616 PyErr_NoMemory();
2617 goto error;
2618 }
2619 nallocated += repsize - 4;
2620 if (result != NULL) {
2621 if (_PyBytes_Resize(&result, nallocated) < 0)
2622 goto error;
2623 } else {
2624 result = PyBytes_FromStringAndSize(NULL, nallocated);
2625 if (result == NULL)
2626 goto error;
2627 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
2628 }
2629 p = PyBytes_AS_STRING(result) + offset;
2630 }
2631
2632 if (PyBytes_Check(rep)) {
2633 char *prep = PyBytes_AS_STRING(rep);
2634 for(k = repsize; k > 0; k--)
2635 *p++ = *prep++;
2636 } else /* rep is unicode */ {
2637 Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
2638 Py_UNICODE c;
2639
2640 for(k=0; k<repsize; k++) {
2641 c = prep[k];
2642 if (0x80 <= c) {
2643 raise_encode_exception(&exc, "utf-8", s, size,
2644 i-1, i, "surrogates not allowed");
2645 goto error;
2646 }
2647 *p++ = (char)prep[k];
2648 }
2649 }
2650 Py_DECREF(rep);
Victor Stinner445a6232010-04-22 20:01:57 +00002651#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00002652 }
Victor Stinner445a6232010-04-22 20:01:57 +00002653#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00002654 } else if (ch < 0x10000) {
2655 *p++ = (char)(0xe0 | (ch >> 12));
2656 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2657 *p++ = (char)(0x80 | (ch & 0x3f));
2658 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00002659 /* Encode UCS4 Unicode ordinals */
2660 *p++ = (char)(0xf0 | (ch >> 18));
2661 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2662 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2663 *p++ = (char)(0x80 | (ch & 0x3f));
2664 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002665 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002666
Guido van Rossum98297ee2007-11-06 21:34:58 +00002667 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00002668 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002669 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002670 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002671 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002672 }
2673 else {
Christian Heimesf3863112007-11-22 07:46:41 +00002674 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00002675 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002676 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002677 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002678 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002679 Py_XDECREF(errorHandler);
2680 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002681 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002682 error:
2683 Py_XDECREF(errorHandler);
2684 Py_XDECREF(exc);
2685 Py_XDECREF(result);
2686 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002687
Tim Peters602f7402002-04-27 18:03:26 +00002688#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002689}
2690
Guido van Rossumd57fd912000-03-10 22:53:23 +00002691PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2692{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002693 if (!PyUnicode_Check(unicode)) {
2694 PyErr_BadArgument();
2695 return NULL;
2696 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002697 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00002698 PyUnicode_GET_SIZE(unicode),
2699 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002700}
2701
Walter Dörwald41980ca2007-08-16 21:55:45 +00002702/* --- UTF-32 Codec ------------------------------------------------------- */
2703
2704PyObject *
2705PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002706 Py_ssize_t size,
2707 const char *errors,
2708 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002709{
2710 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2711}
2712
2713PyObject *
2714PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002715 Py_ssize_t size,
2716 const char *errors,
2717 int *byteorder,
2718 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002719{
2720 const char *starts = s;
2721 Py_ssize_t startinpos;
2722 Py_ssize_t endinpos;
2723 Py_ssize_t outpos;
2724 PyUnicodeObject *unicode;
2725 Py_UNICODE *p;
2726#ifndef Py_UNICODE_WIDE
2727 int i, pairs;
2728#else
2729 const int pairs = 0;
2730#endif
2731 const unsigned char *q, *e;
2732 int bo = 0; /* assume native ordering by default */
2733 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00002734 /* Offsets from q for retrieving bytes in the right order. */
2735#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2736 int iorder[] = {0, 1, 2, 3};
2737#else
2738 int iorder[] = {3, 2, 1, 0};
2739#endif
2740 PyObject *errorHandler = NULL;
2741 PyObject *exc = NULL;
Guido van Rossum8d991ed2007-08-17 15:41:00 +00002742 /* On narrow builds we split characters outside the BMP into two
2743 codepoints => count how much extra space we need. */
2744#ifndef Py_UNICODE_WIDE
2745 for (i = pairs = 0; i < size/4; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00002746 if (((Py_UCS4 *)s)[i] >= 0x10000)
2747 pairs++;
Guido van Rossum8d991ed2007-08-17 15:41:00 +00002748#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00002749
2750 /* This might be one to much, because of a BOM */
2751 unicode = _PyUnicode_New((size+3)/4+pairs);
2752 if (!unicode)
2753 return NULL;
2754 if (size == 0)
2755 return (PyObject *)unicode;
2756
2757 /* Unpack UTF-32 encoded data */
2758 p = unicode->str;
2759 q = (unsigned char *)s;
2760 e = q + size;
2761
2762 if (byteorder)
2763 bo = *byteorder;
2764
2765 /* Check for BOM marks (U+FEFF) in the input and adjust current
2766 byte order setting accordingly. In native mode, the leading BOM
2767 mark is skipped, in all other modes, it is copied to the output
2768 stream as-is (giving a ZWNBSP character). */
2769 if (bo == 0) {
2770 if (size >= 4) {
2771 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00002772 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00002773#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00002774 if (bom == 0x0000FEFF) {
2775 q += 4;
2776 bo = -1;
2777 }
2778 else if (bom == 0xFFFE0000) {
2779 q += 4;
2780 bo = 1;
2781 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002782#else
Benjamin Peterson29060642009-01-31 22:14:21 +00002783 if (bom == 0x0000FEFF) {
2784 q += 4;
2785 bo = 1;
2786 }
2787 else if (bom == 0xFFFE0000) {
2788 q += 4;
2789 bo = -1;
2790 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002791#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002792 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002793 }
2794
2795 if (bo == -1) {
2796 /* force LE */
2797 iorder[0] = 0;
2798 iorder[1] = 1;
2799 iorder[2] = 2;
2800 iorder[3] = 3;
2801 }
2802 else if (bo == 1) {
2803 /* force BE */
2804 iorder[0] = 3;
2805 iorder[1] = 2;
2806 iorder[2] = 1;
2807 iorder[3] = 0;
2808 }
2809
2810 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002811 Py_UCS4 ch;
2812 /* remaining bytes at the end? (size should be divisible by 4) */
2813 if (e-q<4) {
2814 if (consumed)
2815 break;
2816 errmsg = "truncated data";
2817 startinpos = ((const char *)q)-starts;
2818 endinpos = ((const char *)e)-starts;
2819 goto utf32Error;
2820 /* The remaining input chars are ignored if the callback
2821 chooses to skip the input */
2822 }
2823 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2824 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00002825
Benjamin Peterson29060642009-01-31 22:14:21 +00002826 if (ch >= 0x110000)
2827 {
2828 errmsg = "codepoint not in range(0x110000)";
2829 startinpos = ((const char *)q)-starts;
2830 endinpos = startinpos+4;
2831 goto utf32Error;
2832 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002833#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002834 if (ch >= 0x10000)
2835 {
2836 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2837 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2838 }
2839 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00002840#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002841 *p++ = ch;
2842 q += 4;
2843 continue;
2844 utf32Error:
2845 outpos = p-PyUnicode_AS_UNICODE(unicode);
2846 if (unicode_decode_call_errorhandler(
2847 errors, &errorHandler,
2848 "utf32", errmsg,
2849 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
2850 &unicode, &outpos, &p))
2851 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002852 }
2853
2854 if (byteorder)
2855 *byteorder = bo;
2856
2857 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002858 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002859
2860 /* Adjust length */
2861 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2862 goto onError;
2863
2864 Py_XDECREF(errorHandler);
2865 Py_XDECREF(exc);
2866 return (PyObject *)unicode;
2867
Benjamin Peterson29060642009-01-31 22:14:21 +00002868 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00002869 Py_DECREF(unicode);
2870 Py_XDECREF(errorHandler);
2871 Py_XDECREF(exc);
2872 return NULL;
2873}
2874
2875PyObject *
2876PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002877 Py_ssize_t size,
2878 const char *errors,
2879 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002880{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002881 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002882 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002883 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002884#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002885 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002886#else
2887 const int pairs = 0;
2888#endif
2889 /* Offsets from p for storing byte pairs in the right order. */
2890#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2891 int iorder[] = {0, 1, 2, 3};
2892#else
2893 int iorder[] = {3, 2, 1, 0};
2894#endif
2895
Benjamin Peterson29060642009-01-31 22:14:21 +00002896#define STORECHAR(CH) \
2897 do { \
2898 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2899 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2900 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2901 p[iorder[0]] = (CH) & 0xff; \
2902 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00002903 } while(0)
2904
2905 /* In narrow builds we can output surrogate pairs as one codepoint,
2906 so we need less space. */
2907#ifndef Py_UNICODE_WIDE
2908 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00002909 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2910 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2911 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002912#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002913 nsize = (size - pairs + (byteorder == 0));
2914 bytesize = nsize * 4;
2915 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00002916 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002917 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002918 if (v == NULL)
2919 return NULL;
2920
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002921 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002922 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002923 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002924 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00002925 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002926
2927 if (byteorder == -1) {
2928 /* force LE */
2929 iorder[0] = 0;
2930 iorder[1] = 1;
2931 iorder[2] = 2;
2932 iorder[3] = 3;
2933 }
2934 else if (byteorder == 1) {
2935 /* force BE */
2936 iorder[0] = 3;
2937 iorder[1] = 2;
2938 iorder[2] = 1;
2939 iorder[3] = 0;
2940 }
2941
2942 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002943 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002944#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002945 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2946 Py_UCS4 ch2 = *s;
2947 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2948 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2949 s++;
2950 size--;
2951 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002952 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002953#endif
2954 STORECHAR(ch);
2955 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002956
2957 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002958 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002959#undef STORECHAR
2960}
2961
2962PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2963{
2964 if (!PyUnicode_Check(unicode)) {
2965 PyErr_BadArgument();
2966 return NULL;
2967 }
2968 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00002969 PyUnicode_GET_SIZE(unicode),
2970 NULL,
2971 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002972}
2973
Guido van Rossumd57fd912000-03-10 22:53:23 +00002974/* --- UTF-16 Codec ------------------------------------------------------- */
2975
Tim Peters772747b2001-08-09 22:21:55 +00002976PyObject *
2977PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002978 Py_ssize_t size,
2979 const char *errors,
2980 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002981{
Walter Dörwald69652032004-09-07 20:24:22 +00002982 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2983}
2984
/* Two masks for fast checking of whether a C 'long' read from the input may
   contain UTF-16 surrogate code units.  Each mask selects, for every 16-bit
   unit packed in the 'long', the bit that is the most significant bit of
   the decoded code unit; if any selected bit is set, some unit is >= 0x8000
   (which includes every surrogate, 0xD800-0xDFFF) and the decoder leaves
   the fast path.  This is an efficient heuristic, assuming that
   non-surrogate characters with a code point >= 0x8000 are rare in most
   input.
   FAST_CHAR_MASK is used when the input is in native byte ordering,
   SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
*/
Antoine Pitrouab868312009-01-10 15:40:25 +00002992#if (SIZEOF_LONG == 8)
2993# define FAST_CHAR_MASK 0x8000800080008000L
2994# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
2995#elif (SIZEOF_LONG == 4)
2996# define FAST_CHAR_MASK 0x80008000L
2997# define SWAPPED_FAST_CHAR_MASK 0x00800080L
2998#else
2999# error C 'long' size should be either 4 or 8!
3000#endif
3001
Walter Dörwald69652032004-09-07 20:24:22 +00003002PyObject *
3003PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003004 Py_ssize_t size,
3005 const char *errors,
3006 int *byteorder,
3007 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00003008{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003009 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003010 Py_ssize_t startinpos;
3011 Py_ssize_t endinpos;
3012 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003013 PyUnicodeObject *unicode;
3014 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00003015 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00003016 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00003017 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003018 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00003019 /* Offsets from q for retrieving byte pairs in the right order. */
3020#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3021 int ihi = 1, ilo = 0;
3022#else
3023 int ihi = 0, ilo = 1;
3024#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003025 PyObject *errorHandler = NULL;
3026 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003027
3028 /* Note: size will always be longer than the resulting Unicode
3029 character count */
3030 unicode = _PyUnicode_New(size);
3031 if (!unicode)
3032 return NULL;
3033 if (size == 0)
3034 return (PyObject *)unicode;
3035
3036 /* Unpack UTF-16 encoded data */
3037 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00003038 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00003039 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003040
3041 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00003042 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003043
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003044 /* Check for BOM marks (U+FEFF) in the input and adjust current
3045 byte order setting accordingly. In native mode, the leading BOM
3046 mark is skipped, in all other modes, it is copied to the output
3047 stream as-is (giving a ZWNBSP character). */
3048 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00003049 if (size >= 2) {
3050 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003051#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00003052 if (bom == 0xFEFF) {
3053 q += 2;
3054 bo = -1;
3055 }
3056 else if (bom == 0xFFFE) {
3057 q += 2;
3058 bo = 1;
3059 }
Tim Petersced69f82003-09-16 20:30:58 +00003060#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003061 if (bom == 0xFEFF) {
3062 q += 2;
3063 bo = 1;
3064 }
3065 else if (bom == 0xFFFE) {
3066 q += 2;
3067 bo = -1;
3068 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003069#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003070 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003071 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003072
Tim Peters772747b2001-08-09 22:21:55 +00003073 if (bo == -1) {
3074 /* force LE */
3075 ihi = 1;
3076 ilo = 0;
3077 }
3078 else if (bo == 1) {
3079 /* force BE */
3080 ihi = 0;
3081 ilo = 1;
3082 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003083#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3084 native_ordering = ilo < ihi;
3085#else
3086 native_ordering = ilo > ihi;
3087#endif
Tim Peters772747b2001-08-09 22:21:55 +00003088
Antoine Pitrouab868312009-01-10 15:40:25 +00003089 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00003090 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003091 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00003092 /* First check for possible aligned read of a C 'long'. Unaligned
3093 reads are more expensive, better to defer to another iteration. */
3094 if (!((size_t) q & LONG_PTR_MASK)) {
3095 /* Fast path for runs of non-surrogate chars. */
3096 register const unsigned char *_q = q;
3097 Py_UNICODE *_p = p;
3098 if (native_ordering) {
3099 /* Native ordering is simple: as long as the input cannot
3100 possibly contain a surrogate char, do an unrolled copy
3101 of several 16-bit code points to the target object.
3102 The non-surrogate check is done on several input bytes
3103 at a time (as many as a C 'long' can contain). */
3104 while (_q < aligned_end) {
3105 unsigned long data = * (unsigned long *) _q;
3106 if (data & FAST_CHAR_MASK)
3107 break;
3108 _p[0] = ((unsigned short *) _q)[0];
3109 _p[1] = ((unsigned short *) _q)[1];
3110#if (SIZEOF_LONG == 8)
3111 _p[2] = ((unsigned short *) _q)[2];
3112 _p[3] = ((unsigned short *) _q)[3];
3113#endif
3114 _q += SIZEOF_LONG;
3115 _p += SIZEOF_LONG / 2;
3116 }
3117 }
3118 else {
3119 /* Byteswapped ordering is similar, but we must decompose
3120 the copy bytewise, and take care of zero'ing out the
3121 upper bytes if the target object is in 32-bit units
3122 (that is, in UCS-4 builds). */
3123 while (_q < aligned_end) {
3124 unsigned long data = * (unsigned long *) _q;
3125 if (data & SWAPPED_FAST_CHAR_MASK)
3126 break;
3127 /* Zero upper bytes in UCS-4 builds */
3128#if (Py_UNICODE_SIZE > 2)
3129 _p[0] = 0;
3130 _p[1] = 0;
3131#if (SIZEOF_LONG == 8)
3132 _p[2] = 0;
3133 _p[3] = 0;
3134#endif
3135#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003136 /* Issue #4916; UCS-4 builds on big endian machines must
3137 fill the two last bytes of each 4-byte unit. */
3138#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
3139# define OFF 2
3140#else
3141# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00003142#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003143 ((unsigned char *) _p)[OFF + 1] = _q[0];
3144 ((unsigned char *) _p)[OFF + 0] = _q[1];
3145 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
3146 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
3147#if (SIZEOF_LONG == 8)
3148 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
3149 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
3150 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
3151 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
3152#endif
3153#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00003154 _q += SIZEOF_LONG;
3155 _p += SIZEOF_LONG / 2;
3156 }
3157 }
3158 p = _p;
3159 q = _q;
3160 if (q >= e)
3161 break;
3162 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003163 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003164
Benjamin Peterson14339b62009-01-31 16:36:08 +00003165 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00003166
3167 if (ch < 0xD800 || ch > 0xDFFF) {
3168 *p++ = ch;
3169 continue;
3170 }
3171
3172 /* UTF-16 code pair: */
3173 if (q > e) {
3174 errmsg = "unexpected end of data";
3175 startinpos = (((const char *)q) - 2) - starts;
3176 endinpos = ((const char *)e) + 1 - starts;
3177 goto utf16Error;
3178 }
3179 if (0xD800 <= ch && ch <= 0xDBFF) {
3180 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
3181 q += 2;
3182 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00003183#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003184 *p++ = ch;
3185 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003186#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003187 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003188#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003189 continue;
3190 }
3191 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003192 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00003193 startinpos = (((const char *)q)-4)-starts;
3194 endinpos = startinpos+2;
3195 goto utf16Error;
3196 }
3197
Benjamin Peterson14339b62009-01-31 16:36:08 +00003198 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003199 errmsg = "illegal encoding";
3200 startinpos = (((const char *)q)-2)-starts;
3201 endinpos = startinpos+2;
3202 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003203
Benjamin Peterson29060642009-01-31 22:14:21 +00003204 utf16Error:
3205 outpos = p - PyUnicode_AS_UNICODE(unicode);
3206 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00003207 errors,
3208 &errorHandler,
3209 "utf16", errmsg,
3210 &starts,
3211 (const char **)&e,
3212 &startinpos,
3213 &endinpos,
3214 &exc,
3215 (const char **)&q,
3216 &unicode,
3217 &outpos,
3218 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00003219 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003220 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003221 /* remaining byte at the end? (size should be even) */
3222 if (e == q) {
3223 if (!consumed) {
3224 errmsg = "truncated data";
3225 startinpos = ((const char *)q) - starts;
3226 endinpos = ((const char *)e) + 1 - starts;
3227 outpos = p - PyUnicode_AS_UNICODE(unicode);
3228 if (unicode_decode_call_errorhandler(
3229 errors,
3230 &errorHandler,
3231 "utf16", errmsg,
3232 &starts,
3233 (const char **)&e,
3234 &startinpos,
3235 &endinpos,
3236 &exc,
3237 (const char **)&q,
3238 &unicode,
3239 &outpos,
3240 &p))
3241 goto onError;
3242 /* The remaining input chars are ignored if the callback
3243 chooses to skip the input */
3244 }
3245 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003246
3247 if (byteorder)
3248 *byteorder = bo;
3249
Walter Dörwald69652032004-09-07 20:24:22 +00003250 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003251 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00003252
Guido van Rossumd57fd912000-03-10 22:53:23 +00003253 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003254 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003255 goto onError;
3256
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003257 Py_XDECREF(errorHandler);
3258 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003259 return (PyObject *)unicode;
3260
Benjamin Peterson29060642009-01-31 22:14:21 +00003261 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003262 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003263 Py_XDECREF(errorHandler);
3264 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003265 return NULL;
3266}
3267
Antoine Pitrouab868312009-01-10 15:40:25 +00003268#undef FAST_CHAR_MASK
3269#undef SWAPPED_FAST_CHAR_MASK
3270
Tim Peters772747b2001-08-09 22:21:55 +00003271PyObject *
3272PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003273 Py_ssize_t size,
3274 const char *errors,
3275 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003276{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003277 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00003278 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003279 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003280#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003281 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003282#else
3283 const int pairs = 0;
3284#endif
Tim Peters772747b2001-08-09 22:21:55 +00003285 /* Offsets from p for storing byte pairs in the right order. */
3286#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3287 int ihi = 1, ilo = 0;
3288#else
3289 int ihi = 0, ilo = 1;
3290#endif
3291
Benjamin Peterson29060642009-01-31 22:14:21 +00003292#define STORECHAR(CH) \
3293 do { \
3294 p[ihi] = ((CH) >> 8) & 0xff; \
3295 p[ilo] = (CH) & 0xff; \
3296 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00003297 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003298
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003299#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003300 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003301 if (s[i] >= 0x10000)
3302 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003303#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003304 /* 2 * (size + pairs + (byteorder == 0)) */
3305 if (size > PY_SSIZE_T_MAX ||
3306 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00003307 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003308 nsize = size + pairs + (byteorder == 0);
3309 bytesize = nsize * 2;
3310 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003311 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003312 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003313 if (v == NULL)
3314 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003315
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003316 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003317 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003318 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003319 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003320 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00003321
3322 if (byteorder == -1) {
3323 /* force LE */
3324 ihi = 1;
3325 ilo = 0;
3326 }
3327 else if (byteorder == 1) {
3328 /* force BE */
3329 ihi = 0;
3330 ilo = 1;
3331 }
3332
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003333 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003334 Py_UNICODE ch = *s++;
3335 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003336#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003337 if (ch >= 0x10000) {
3338 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
3339 ch = 0xD800 | ((ch-0x10000) >> 10);
3340 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003341#endif
Tim Peters772747b2001-08-09 22:21:55 +00003342 STORECHAR(ch);
3343 if (ch2)
3344 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003345 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003346
3347 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003348 return v;
Tim Peters772747b2001-08-09 22:21:55 +00003349#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00003350}
3351
3352PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
3353{
3354 if (!PyUnicode_Check(unicode)) {
3355 PyErr_BadArgument();
3356 return NULL;
3357 }
3358 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003359 PyUnicode_GET_SIZE(unicode),
3360 NULL,
3361 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003362}
3363
3364/* --- Unicode Escape Codec ----------------------------------------------- */
3365
Fredrik Lundh06d12682001-01-24 07:59:11 +00003366static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00003367
Guido van Rossumd57fd912000-03-10 22:53:23 +00003368PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003369 Py_ssize_t size,
3370 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003371{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003372 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003373 Py_ssize_t startinpos;
3374 Py_ssize_t endinpos;
3375 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003376 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003377 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003378 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003379 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003380 char* message;
3381 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003382 PyObject *errorHandler = NULL;
3383 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003384
Guido van Rossumd57fd912000-03-10 22:53:23 +00003385 /* Escaped strings will always be longer than the resulting
3386 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003387 length after conversion to the true value.
3388 (but if the error callback returns a long replacement string
3389 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003390 v = _PyUnicode_New(size);
3391 if (v == NULL)
3392 goto onError;
3393 if (size == 0)
3394 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003395
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003396 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003397 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003398
Guido van Rossumd57fd912000-03-10 22:53:23 +00003399 while (s < end) {
3400 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003401 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003402 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003403
3404 /* Non-escape characters are interpreted as Unicode ordinals */
3405 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003406 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003407 continue;
3408 }
3409
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003410 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003411 /* \ - Escapes */
3412 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003413 c = *s++;
3414 if (s > end)
3415 c = '\0'; /* Invalid after \ */
3416 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003417
Benjamin Peterson29060642009-01-31 22:14:21 +00003418 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003419 case '\n': break;
3420 case '\\': *p++ = '\\'; break;
3421 case '\'': *p++ = '\''; break;
3422 case '\"': *p++ = '\"'; break;
3423 case 'b': *p++ = '\b'; break;
3424 case 'f': *p++ = '\014'; break; /* FF */
3425 case 't': *p++ = '\t'; break;
3426 case 'n': *p++ = '\n'; break;
3427 case 'r': *p++ = '\r'; break;
3428 case 'v': *p++ = '\013'; break; /* VT */
3429 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
3430
Benjamin Peterson29060642009-01-31 22:14:21 +00003431 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003432 case '0': case '1': case '2': case '3':
3433 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003434 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003435 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003436 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003437 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003438 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00003439 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003440 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003441 break;
3442
Benjamin Peterson29060642009-01-31 22:14:21 +00003443 /* hex escapes */
3444 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003445 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003446 digits = 2;
3447 message = "truncated \\xXX escape";
3448 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003449
Benjamin Peterson29060642009-01-31 22:14:21 +00003450 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003451 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003452 digits = 4;
3453 message = "truncated \\uXXXX escape";
3454 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003455
Benjamin Peterson29060642009-01-31 22:14:21 +00003456 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00003457 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003458 digits = 8;
3459 message = "truncated \\UXXXXXXXX escape";
3460 hexescape:
3461 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003462 outpos = p-PyUnicode_AS_UNICODE(v);
3463 if (s+digits>end) {
3464 endinpos = size;
3465 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003466 errors, &errorHandler,
3467 "unicodeescape", "end of string in escape sequence",
3468 &starts, &end, &startinpos, &endinpos, &exc, &s,
3469 &v, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003470 goto onError;
3471 goto nextByte;
3472 }
3473 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003474 c = (unsigned char) s[i];
Guido van Rossumdaa251c2007-10-25 23:47:33 +00003475 if (!ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003476 endinpos = (s+i+1)-starts;
3477 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003478 errors, &errorHandler,
3479 "unicodeescape", message,
3480 &starts, &end, &startinpos, &endinpos, &exc, &s,
3481 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003482 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003483 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00003484 }
3485 chr = (chr<<4) & ~0xF;
3486 if (c >= '0' && c <= '9')
3487 chr += c - '0';
3488 else if (c >= 'a' && c <= 'f')
3489 chr += 10 + c - 'a';
3490 else
3491 chr += 10 + c - 'A';
3492 }
3493 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00003494 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003495 /* _decoding_error will have already written into the
3496 target buffer. */
3497 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003498 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00003499 /* when we get here, chr is a 32-bit unicode character */
3500 if (chr <= 0xffff)
3501 /* UCS-2 character */
3502 *p++ = (Py_UNICODE) chr;
3503 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003504 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00003505 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00003506#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003507 *p++ = chr;
3508#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00003509 chr -= 0x10000L;
3510 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00003511 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003512#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00003513 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003514 endinpos = s-starts;
3515 outpos = p-PyUnicode_AS_UNICODE(v);
3516 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003517 errors, &errorHandler,
3518 "unicodeescape", "illegal Unicode character",
3519 &starts, &end, &startinpos, &endinpos, &exc, &s,
3520 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003521 goto onError;
3522 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003523 break;
3524
Benjamin Peterson29060642009-01-31 22:14:21 +00003525 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00003526 case 'N':
3527 message = "malformed \\N character escape";
3528 if (ucnhash_CAPI == NULL) {
3529 /* load the unicode data module */
Benjamin Petersonb173f782009-05-05 22:31:58 +00003530 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00003531 if (ucnhash_CAPI == NULL)
3532 goto ucnhashError;
3533 }
3534 if (*s == '{') {
3535 const char *start = s+1;
3536 /* look for the closing brace */
3537 while (*s != '}' && s < end)
3538 s++;
3539 if (s > start && s < end && *s == '}') {
3540 /* found a name. look it up in the unicode database */
3541 message = "unknown Unicode character name";
3542 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00003543 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003544 goto store;
3545 }
3546 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003547 endinpos = s-starts;
3548 outpos = p-PyUnicode_AS_UNICODE(v);
3549 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003550 errors, &errorHandler,
3551 "unicodeescape", message,
3552 &starts, &end, &startinpos, &endinpos, &exc, &s,
3553 &v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003554 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003555 break;
3556
3557 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00003558 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003559 message = "\\ at end of string";
3560 s--;
3561 endinpos = s-starts;
3562 outpos = p-PyUnicode_AS_UNICODE(v);
3563 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003564 errors, &errorHandler,
3565 "unicodeescape", message,
3566 &starts, &end, &startinpos, &endinpos, &exc, &s,
3567 &v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00003568 goto onError;
3569 }
3570 else {
3571 *p++ = '\\';
3572 *p++ = (unsigned char)s[-1];
3573 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003574 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003575 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003576 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003577 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003578 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003579 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003580 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00003581 Py_XDECREF(errorHandler);
3582 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003583 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00003584
Benjamin Peterson29060642009-01-31 22:14:21 +00003585 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00003586 PyErr_SetString(
3587 PyExc_UnicodeError,
3588 "\\N escapes not supported (can't load unicodedata module)"
3589 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003590 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003591 Py_XDECREF(errorHandler);
3592 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00003593 return NULL;
3594
Benjamin Peterson29060642009-01-31 22:14:21 +00003595 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003596 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003597 Py_XDECREF(errorHandler);
3598 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003599 return NULL;
3600}
3601
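/* Return a Unicode-Escape string version of the Unicode object.

   NOTE(review): this comment appears to describe
   PyUnicode_EncodeUnicodeEscape() defined further below, not the findchar()
   helper that immediately follows it.  It used to say: "If quotes is true,
   the string is enclosed in u"" or u'' quotes as appropriate."  That
   function's signature takes no quotes argument, so treat that sentence as
   historical.

*/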
3608
Thomas Wouters477c8d52006-05-27 19:21:47 +00003609Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003610 Py_ssize_t size,
3611 Py_UNICODE ch)
Thomas Wouters477c8d52006-05-27 19:21:47 +00003612{
3613 /* like wcschr, but doesn't stop at NULL characters */
3614
3615 while (size-- > 0) {
3616 if (*s == ch)
3617 return s;
3618 s++;
3619 }
3620
3621 return NULL;
3622}
Barry Warsaw51ac5802000-03-20 16:36:48 +00003623
Walter Dörwald79e913e2007-05-12 11:08:06 +00003624static const char *hexdigits = "0123456789abcdef";
3625
3626PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003627 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003628{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003629 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003630 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003631
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003632#ifdef Py_UNICODE_WIDE
3633 const Py_ssize_t expandsize = 10;
3634#else
3635 const Py_ssize_t expandsize = 6;
3636#endif
3637
Thomas Wouters89f507f2006-12-13 04:49:30 +00003638 /* XXX(nnorwitz): rather than over-allocating, it would be
3639 better to choose a different scheme. Perhaps scan the
3640 first N-chars of the string and allocate based on that size.
3641 */
3642 /* Initial allocation is based on the longest-possible unichr
3643 escape.
3644
3645 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3646 unichr, so in this case it's the longest unichr escape. In
3647 narrow (UTF-16) builds this is five chars per source unichr
3648 since there are two unichrs in the surrogate pair, so in narrow
3649 (UTF-16) builds it's not the longest unichr escape.
3650
3651 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3652 so in the narrow (UTF-16) build case it's the longest unichr
3653 escape.
3654 */
3655
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003656 if (size == 0)
3657 return PyBytes_FromStringAndSize(NULL, 0);
3658
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003659 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003660 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003661
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003662 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00003663 2
3664 + expandsize*size
3665 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003666 if (repr == NULL)
3667 return NULL;
3668
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003669 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003670
Guido van Rossumd57fd912000-03-10 22:53:23 +00003671 while (size-- > 0) {
3672 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003673
Walter Dörwald79e913e2007-05-12 11:08:06 +00003674 /* Escape backslashes */
3675 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003676 *p++ = '\\';
3677 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00003678 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003679 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003680
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00003681#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003682 /* Map 21-bit characters to '\U00xxxxxx' */
3683 else if (ch >= 0x10000) {
3684 *p++ = '\\';
3685 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003686 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
3687 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
3688 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
3689 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
3690 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
3691 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
3692 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
3693 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00003694 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003695 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003696#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003697 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3698 else if (ch >= 0xD800 && ch < 0xDC00) {
3699 Py_UNICODE ch2;
3700 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00003701
Benjamin Peterson29060642009-01-31 22:14:21 +00003702 ch2 = *s++;
3703 size--;
3704 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3705 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3706 *p++ = '\\';
3707 *p++ = 'U';
3708 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
3709 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
3710 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
3711 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
3712 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
3713 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
3714 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
3715 *p++ = hexdigits[ucs & 0x0000000F];
3716 continue;
3717 }
3718 /* Fall through: isolated surrogates are copied as-is */
3719 s--;
3720 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003721 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003722#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003723
Guido van Rossumd57fd912000-03-10 22:53:23 +00003724 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003725 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003726 *p++ = '\\';
3727 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003728 *p++ = hexdigits[(ch >> 12) & 0x000F];
3729 *p++ = hexdigits[(ch >> 8) & 0x000F];
3730 *p++ = hexdigits[(ch >> 4) & 0x000F];
3731 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003732 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003733
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003734 /* Map special whitespace to '\t', \n', '\r' */
3735 else if (ch == '\t') {
3736 *p++ = '\\';
3737 *p++ = 't';
3738 }
3739 else if (ch == '\n') {
3740 *p++ = '\\';
3741 *p++ = 'n';
3742 }
3743 else if (ch == '\r') {
3744 *p++ = '\\';
3745 *p++ = 'r';
3746 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003747
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003748 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003749 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003750 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003751 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003752 *p++ = hexdigits[(ch >> 4) & 0x000F];
3753 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003754 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003755
Guido van Rossumd57fd912000-03-10 22:53:23 +00003756 /* Copy everything else as-is */
3757 else
3758 *p++ = (char) ch;
3759 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003760
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003761 assert(p - PyBytes_AS_STRING(repr) > 0);
3762 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
3763 return NULL;
3764 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003765}
3766
Alexandre Vassalotti2056bed2008-12-27 19:46:35 +00003767PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003768{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003769 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003770 if (!PyUnicode_Check(unicode)) {
3771 PyErr_BadArgument();
3772 return NULL;
3773 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00003774 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3775 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003776 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003777}
3778
3779/* --- Raw Unicode Escape Codec ------------------------------------------- */
3780
3781PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003782 Py_ssize_t size,
3783 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003784{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003785 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003786 Py_ssize_t startinpos;
3787 Py_ssize_t endinpos;
3788 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003789 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003790 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003791 const char *end;
3792 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003793 PyObject *errorHandler = NULL;
3794 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003795
Guido van Rossumd57fd912000-03-10 22:53:23 +00003796 /* Escaped strings will always be longer than the resulting
3797 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003798 length after conversion to the true value. (But decoding error
3799 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003800 v = _PyUnicode_New(size);
3801 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003802 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003803 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003804 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003805 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003806 end = s + size;
3807 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003808 unsigned char c;
3809 Py_UCS4 x;
3810 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003811 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003812
Benjamin Peterson29060642009-01-31 22:14:21 +00003813 /* Non-escape characters are interpreted as Unicode ordinals */
3814 if (*s != '\\') {
3815 *p++ = (unsigned char)*s++;
3816 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003817 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003818 startinpos = s-starts;
3819
3820 /* \u-escapes are only interpreted iff the number of leading
3821 backslashes if odd */
3822 bs = s;
3823 for (;s < end;) {
3824 if (*s != '\\')
3825 break;
3826 *p++ = (unsigned char)*s++;
3827 }
3828 if (((s - bs) & 1) == 0 ||
3829 s >= end ||
3830 (*s != 'u' && *s != 'U')) {
3831 continue;
3832 }
3833 p--;
3834 count = *s=='u' ? 4 : 8;
3835 s++;
3836
3837 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3838 outpos = p-PyUnicode_AS_UNICODE(v);
3839 for (x = 0, i = 0; i < count; ++i, ++s) {
3840 c = (unsigned char)*s;
3841 if (!ISXDIGIT(c)) {
3842 endinpos = s-starts;
3843 if (unicode_decode_call_errorhandler(
3844 errors, &errorHandler,
3845 "rawunicodeescape", "truncated \\uXXXX",
3846 &starts, &end, &startinpos, &endinpos, &exc, &s,
3847 &v, &outpos, &p))
3848 goto onError;
3849 goto nextByte;
3850 }
3851 x = (x<<4) & ~0xF;
3852 if (c >= '0' && c <= '9')
3853 x += c - '0';
3854 else if (c >= 'a' && c <= 'f')
3855 x += 10 + c - 'a';
3856 else
3857 x += 10 + c - 'A';
3858 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00003859 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00003860 /* UCS-2 character */
3861 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003862 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003863 /* UCS-4 character. Either store directly, or as
3864 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00003865#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003866 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003867#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003868 x -= 0x10000L;
3869 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3870 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00003871#endif
3872 } else {
3873 endinpos = s-starts;
3874 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003875 if (unicode_decode_call_errorhandler(
3876 errors, &errorHandler,
3877 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00003878 &starts, &end, &startinpos, &endinpos, &exc, &s,
3879 &v, &outpos, &p))
3880 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003881 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003882 nextByte:
3883 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003884 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003885 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003886 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003887 Py_XDECREF(errorHandler);
3888 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003889 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003890
Benjamin Peterson29060642009-01-31 22:14:21 +00003891 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003892 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003893 Py_XDECREF(errorHandler);
3894 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003895 return NULL;
3896}
3897
3898PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003899 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003900{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003901 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003902 char *p;
3903 char *q;
3904
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003905#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003906 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003907#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003908 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003909#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00003910
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003911 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003912 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00003913
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003914 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003915 if (repr == NULL)
3916 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003917 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003918 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003919
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003920 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003921 while (size-- > 0) {
3922 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003923#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003924 /* Map 32-bit characters to '\Uxxxxxxxx' */
3925 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003926 *p++ = '\\';
3927 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003928 *p++ = hexdigits[(ch >> 28) & 0xf];
3929 *p++ = hexdigits[(ch >> 24) & 0xf];
3930 *p++ = hexdigits[(ch >> 20) & 0xf];
3931 *p++ = hexdigits[(ch >> 16) & 0xf];
3932 *p++ = hexdigits[(ch >> 12) & 0xf];
3933 *p++ = hexdigits[(ch >> 8) & 0xf];
3934 *p++ = hexdigits[(ch >> 4) & 0xf];
3935 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003936 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003937 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00003938#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003939 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3940 if (ch >= 0xD800 && ch < 0xDC00) {
3941 Py_UNICODE ch2;
3942 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003943
Benjamin Peterson29060642009-01-31 22:14:21 +00003944 ch2 = *s++;
3945 size--;
3946 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3947 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3948 *p++ = '\\';
3949 *p++ = 'U';
3950 *p++ = hexdigits[(ucs >> 28) & 0xf];
3951 *p++ = hexdigits[(ucs >> 24) & 0xf];
3952 *p++ = hexdigits[(ucs >> 20) & 0xf];
3953 *p++ = hexdigits[(ucs >> 16) & 0xf];
3954 *p++ = hexdigits[(ucs >> 12) & 0xf];
3955 *p++ = hexdigits[(ucs >> 8) & 0xf];
3956 *p++ = hexdigits[(ucs >> 4) & 0xf];
3957 *p++ = hexdigits[ucs & 0xf];
3958 continue;
3959 }
3960 /* Fall through: isolated surrogates are copied as-is */
3961 s--;
3962 size++;
3963 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003964#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003965 /* Map 16-bit characters to '\uxxxx' */
3966 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003967 *p++ = '\\';
3968 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003969 *p++ = hexdigits[(ch >> 12) & 0xf];
3970 *p++ = hexdigits[(ch >> 8) & 0xf];
3971 *p++ = hexdigits[(ch >> 4) & 0xf];
3972 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003973 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003974 /* Copy everything else as-is */
3975 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00003976 *p++ = (char) ch;
3977 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003978 size = p - q;
3979
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003980 assert(size > 0);
3981 if (_PyBytes_Resize(&repr, size) < 0)
3982 return NULL;
3983 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003984}
3985
3986PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3987{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003988 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003989 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00003990 PyErr_BadArgument();
3991 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003992 }
Walter Dörwald711005d2007-05-12 12:03:26 +00003993 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3994 PyUnicode_GET_SIZE(unicode));
3995
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003996 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003997}
3998
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003999/* --- Unicode Internal Codec ------------------------------------------- */
4000
4001PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004002 Py_ssize_t size,
4003 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004004{
4005 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004006 Py_ssize_t startinpos;
4007 Py_ssize_t endinpos;
4008 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004009 PyUnicodeObject *v;
4010 Py_UNICODE *p;
4011 const char *end;
4012 const char *reason;
4013 PyObject *errorHandler = NULL;
4014 PyObject *exc = NULL;
4015
Neal Norwitzd43069c2006-01-08 01:12:10 +00004016#ifdef Py_UNICODE_WIDE
4017 Py_UNICODE unimax = PyUnicode_GetMax();
4018#endif
4019
Thomas Wouters89f507f2006-12-13 04:49:30 +00004020 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004021 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
4022 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004023 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004024 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004025 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004026 p = PyUnicode_AS_UNICODE(v);
4027 end = s + size;
4028
4029 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00004030 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004031 /* We have to sanity check the raw data, otherwise doom looms for
4032 some malformed UCS-4 data. */
4033 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00004034#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004035 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00004036#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004037 end-s < Py_UNICODE_SIZE
4038 )
Benjamin Peterson29060642009-01-31 22:14:21 +00004039 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004040 startinpos = s - starts;
4041 if (end-s < Py_UNICODE_SIZE) {
4042 endinpos = end-starts;
4043 reason = "truncated input";
4044 }
4045 else {
4046 endinpos = s - starts + Py_UNICODE_SIZE;
4047 reason = "illegal code point (> 0x10FFFF)";
4048 }
4049 outpos = p - PyUnicode_AS_UNICODE(v);
4050 if (unicode_decode_call_errorhandler(
4051 errors, &errorHandler,
4052 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00004053 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00004054 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004055 goto onError;
4056 }
4057 }
4058 else {
4059 p++;
4060 s += Py_UNICODE_SIZE;
4061 }
4062 }
4063
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004064 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004065 goto onError;
4066 Py_XDECREF(errorHandler);
4067 Py_XDECREF(exc);
4068 return (PyObject *)v;
4069
Benjamin Peterson29060642009-01-31 22:14:21 +00004070 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004071 Py_XDECREF(v);
4072 Py_XDECREF(errorHandler);
4073 Py_XDECREF(exc);
4074 return NULL;
4075}
4076
Guido van Rossumd57fd912000-03-10 22:53:23 +00004077/* --- Latin-1 Codec ------------------------------------------------------ */
4078
4079PyObject *PyUnicode_DecodeLatin1(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004080 Py_ssize_t size,
4081 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004082{
4083 PyUnicodeObject *v;
4084 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004085 const char *e, *unrolled_end;
Tim Petersced69f82003-09-16 20:30:58 +00004086
Guido van Rossumd57fd912000-03-10 22:53:23 +00004087 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004088 if (size == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004089 Py_UNICODE r = *(unsigned char*)s;
4090 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004091 }
4092
Guido van Rossumd57fd912000-03-10 22:53:23 +00004093 v = _PyUnicode_New(size);
4094 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004095 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004096 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004097 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004098 p = PyUnicode_AS_UNICODE(v);
Antoine Pitrouab868312009-01-10 15:40:25 +00004099 e = s + size;
4100 /* Unrolling the copy makes it much faster by reducing the looping
4101 overhead. This is similar to what many memcpy() implementations do. */
4102 unrolled_end = e - 4;
4103 while (s < unrolled_end) {
4104 p[0] = (unsigned char) s[0];
4105 p[1] = (unsigned char) s[1];
4106 p[2] = (unsigned char) s[2];
4107 p[3] = (unsigned char) s[3];
4108 s += 4;
4109 p += 4;
4110 }
4111 while (s < e)
4112 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004113 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004114
Benjamin Peterson29060642009-01-31 22:14:21 +00004115 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004116 Py_XDECREF(v);
4117 return NULL;
4118}
4119
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004120/* create or adjust a UnicodeEncodeError */
4121static void make_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004122 const char *encoding,
4123 const Py_UNICODE *unicode, Py_ssize_t size,
4124 Py_ssize_t startpos, Py_ssize_t endpos,
4125 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004126{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004127 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004128 *exceptionObject = PyUnicodeEncodeError_Create(
4129 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004130 }
4131 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004132 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
4133 goto onError;
4134 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
4135 goto onError;
4136 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
4137 goto onError;
4138 return;
4139 onError:
4140 Py_DECREF(*exceptionObject);
4141 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004142 }
4143}
4144
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004145/* raises a UnicodeEncodeError */
4146static void raise_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004147 const char *encoding,
4148 const Py_UNICODE *unicode, Py_ssize_t size,
4149 Py_ssize_t startpos, Py_ssize_t endpos,
4150 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004151{
4152 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004153 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004154 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004155 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004156}
4157
4158/* error handling callback helper:
4159 build arguments, call the callback and check the arguments,
4160 put the result into newpos and return the replacement string, which
4161 has to be freed by the caller */
4162static PyObject *unicode_encode_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00004163 PyObject **errorHandler,
4164 const char *encoding, const char *reason,
4165 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4166 Py_ssize_t startpos, Py_ssize_t endpos,
4167 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004168{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004169 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004170
4171 PyObject *restuple;
4172 PyObject *resunicode;
4173
4174 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004175 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004176 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004177 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004178 }
4179
4180 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004181 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004182 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004183 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004184
4185 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00004186 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004187 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004188 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004189 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004190 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004191 Py_DECREF(restuple);
4192 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004193 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004194 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00004195 &resunicode, newpos)) {
4196 Py_DECREF(restuple);
4197 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004198 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004199 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
4200 PyErr_SetString(PyExc_TypeError, &argparse[3]);
4201 Py_DECREF(restuple);
4202 return NULL;
4203 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004204 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004205 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004206 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004207 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4208 Py_DECREF(restuple);
4209 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004210 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004211 Py_INCREF(resunicode);
4212 Py_DECREF(restuple);
4213 return resunicode;
4214}
4215
4216static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004217 Py_ssize_t size,
4218 const char *errors,
4219 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004220{
4221 /* output object */
4222 PyObject *res;
4223 /* pointers to the beginning and end+1 of input */
4224 const Py_UNICODE *startp = p;
4225 const Py_UNICODE *endp = p + size;
4226 /* pointer to the beginning of the unencodable characters */
4227 /* const Py_UNICODE *badp = NULL; */
4228 /* pointer into the output */
4229 char *str;
4230 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004231 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004232 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
4233 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004234 PyObject *errorHandler = NULL;
4235 PyObject *exc = NULL;
4236 /* the following variable is used for caching string comparisons
4237 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4238 int known_errorHandler = -1;
4239
4240 /* allocate enough for a simple encoding without
4241 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004242 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00004243 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004244 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004245 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004246 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004247 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004248 ressize = size;
4249
4250 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004251 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004252
Benjamin Peterson29060642009-01-31 22:14:21 +00004253 /* can we encode this? */
4254 if (c<limit) {
4255 /* no overflow check, because we know that the space is enough */
4256 *str++ = (char)c;
4257 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004258 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004259 else {
4260 Py_ssize_t unicodepos = p-startp;
4261 Py_ssize_t requiredsize;
4262 PyObject *repunicode;
4263 Py_ssize_t repsize;
4264 Py_ssize_t newpos;
4265 Py_ssize_t respos;
4266 Py_UNICODE *uni2;
4267 /* startpos for collecting unencodable chars */
4268 const Py_UNICODE *collstart = p;
4269 const Py_UNICODE *collend = p;
4270 /* find all unecodable characters */
4271 while ((collend < endp) && ((*collend)>=limit))
4272 ++collend;
4273 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
4274 if (known_errorHandler==-1) {
4275 if ((errors==NULL) || (!strcmp(errors, "strict")))
4276 known_errorHandler = 1;
4277 else if (!strcmp(errors, "replace"))
4278 known_errorHandler = 2;
4279 else if (!strcmp(errors, "ignore"))
4280 known_errorHandler = 3;
4281 else if (!strcmp(errors, "xmlcharrefreplace"))
4282 known_errorHandler = 4;
4283 else
4284 known_errorHandler = 0;
4285 }
4286 switch (known_errorHandler) {
4287 case 1: /* strict */
4288 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
4289 goto onError;
4290 case 2: /* replace */
4291 while (collstart++<collend)
4292 *str++ = '?'; /* fall through */
4293 case 3: /* ignore */
4294 p = collend;
4295 break;
4296 case 4: /* xmlcharrefreplace */
4297 respos = str - PyBytes_AS_STRING(res);
4298 /* determine replacement size (temporarily (mis)uses p) */
4299 for (p = collstart, repsize = 0; p < collend; ++p) {
4300 if (*p<10)
4301 repsize += 2+1+1;
4302 else if (*p<100)
4303 repsize += 2+2+1;
4304 else if (*p<1000)
4305 repsize += 2+3+1;
4306 else if (*p<10000)
4307 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004308#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004309 else
4310 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004311#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004312 else if (*p<100000)
4313 repsize += 2+5+1;
4314 else if (*p<1000000)
4315 repsize += 2+6+1;
4316 else
4317 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004318#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004319 }
4320 requiredsize = respos+repsize+(endp-collend);
4321 if (requiredsize > ressize) {
4322 if (requiredsize<2*ressize)
4323 requiredsize = 2*ressize;
4324 if (_PyBytes_Resize(&res, requiredsize))
4325 goto onError;
4326 str = PyBytes_AS_STRING(res) + respos;
4327 ressize = requiredsize;
4328 }
4329 /* generate replacement (temporarily (mis)uses p) */
4330 for (p = collstart; p < collend; ++p) {
4331 str += sprintf(str, "&#%d;", (int)*p);
4332 }
4333 p = collend;
4334 break;
4335 default:
4336 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4337 encoding, reason, startp, size, &exc,
4338 collstart-startp, collend-startp, &newpos);
4339 if (repunicode == NULL)
4340 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004341 if (PyBytes_Check(repunicode)) {
4342 /* Directly copy bytes result to output. */
4343 repsize = PyBytes_Size(repunicode);
4344 if (repsize > 1) {
4345 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004346 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004347 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
4348 Py_DECREF(repunicode);
4349 goto onError;
4350 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004351 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004352 ressize += repsize-1;
4353 }
4354 memcpy(str, PyBytes_AsString(repunicode), repsize);
4355 str += repsize;
4356 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004357 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004358 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004359 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004360 /* need more space? (at least enough for what we
4361 have+the replacement+the rest of the string, so
4362 we won't have to check space for encodable characters) */
4363 respos = str - PyBytes_AS_STRING(res);
4364 repsize = PyUnicode_GET_SIZE(repunicode);
4365 requiredsize = respos+repsize+(endp-collend);
4366 if (requiredsize > ressize) {
4367 if (requiredsize<2*ressize)
4368 requiredsize = 2*ressize;
4369 if (_PyBytes_Resize(&res, requiredsize)) {
4370 Py_DECREF(repunicode);
4371 goto onError;
4372 }
4373 str = PyBytes_AS_STRING(res) + respos;
4374 ressize = requiredsize;
4375 }
4376 /* check if there is anything unencodable in the replacement
4377 and copy it to the output */
4378 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
4379 c = *uni2;
4380 if (c >= limit) {
4381 raise_encode_exception(&exc, encoding, startp, size,
4382 unicodepos, unicodepos+1, reason);
4383 Py_DECREF(repunicode);
4384 goto onError;
4385 }
4386 *str = (char)c;
4387 }
4388 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004389 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004390 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004391 }
4392 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004393 /* Resize if we allocated to much */
4394 size = str - PyBytes_AS_STRING(res);
4395 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00004396 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004397 if (_PyBytes_Resize(&res, size) < 0)
4398 goto onError;
4399 }
4400
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004401 Py_XDECREF(errorHandler);
4402 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004403 return res;
4404
4405 onError:
4406 Py_XDECREF(res);
4407 Py_XDECREF(errorHandler);
4408 Py_XDECREF(exc);
4409 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004410}
4411
Guido van Rossumd57fd912000-03-10 22:53:23 +00004412PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004413 Py_ssize_t size,
4414 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004415{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004416 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004417}
4418
4419PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
4420{
4421 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004422 PyErr_BadArgument();
4423 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004424 }
4425 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004426 PyUnicode_GET_SIZE(unicode),
4427 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004428}
4429
4430/* --- 7-bit ASCII Codec -------------------------------------------------- */
4431
Guido van Rossumd57fd912000-03-10 22:53:23 +00004432PyObject *PyUnicode_DecodeASCII(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004433 Py_ssize_t size,
4434 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004435{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004436 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004437 PyUnicodeObject *v;
4438 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004439 Py_ssize_t startinpos;
4440 Py_ssize_t endinpos;
4441 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004442 const char *e;
4443 PyObject *errorHandler = NULL;
4444 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004445
Guido van Rossumd57fd912000-03-10 22:53:23 +00004446 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004447 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004448 Py_UNICODE r = *(unsigned char*)s;
4449 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004450 }
Tim Petersced69f82003-09-16 20:30:58 +00004451
Guido van Rossumd57fd912000-03-10 22:53:23 +00004452 v = _PyUnicode_New(size);
4453 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004454 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004455 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004456 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004457 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004458 e = s + size;
4459 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004460 register unsigned char c = (unsigned char)*s;
4461 if (c < 128) {
4462 *p++ = c;
4463 ++s;
4464 }
4465 else {
4466 startinpos = s-starts;
4467 endinpos = startinpos + 1;
4468 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
4469 if (unicode_decode_call_errorhandler(
4470 errors, &errorHandler,
4471 "ascii", "ordinal not in range(128)",
4472 &starts, &e, &startinpos, &endinpos, &exc, &s,
4473 &v, &outpos, &p))
4474 goto onError;
4475 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004476 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00004477 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004478 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4479 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004480 Py_XDECREF(errorHandler);
4481 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004482 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004483
Benjamin Peterson29060642009-01-31 22:14:21 +00004484 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004485 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004486 Py_XDECREF(errorHandler);
4487 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004488 return NULL;
4489}
4490
Guido van Rossumd57fd912000-03-10 22:53:23 +00004491PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004492 Py_ssize_t size,
4493 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004494{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004495 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004496}
4497
4498PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
4499{
4500 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004501 PyErr_BadArgument();
4502 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004503 }
4504 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004505 PyUnicode_GET_SIZE(unicode),
4506 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004507}
4508
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004509#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004510
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004511/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004512
/* The MBCS helpers below take `int` lengths.  When size_t is wider than
   int, the public entry points must feed them the input in pieces of at
   most INT_MAX units; NEED_RETRY enables that chunking code. */
#if SIZEOF_INT < SIZEOF_SIZE_T
#define NEED_RETRY
#endif
4516
/* XXX This code is limited to "true" double-byte encodings, as
   a) it assumes an incomplete character consists of a single byte, and
   b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
      encodings, see IsDBCSLeadByteEx documentation. */

/* Return 1 if the byte at s[offset] is a DBCS lead byte that begins a
   character, 0 otherwise.  The preceding character (found with
   CharPrev) is examined so that a trail byte with a lead-byte value is
   not mistaken for a lead byte. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *here = s + offset;
    const char *before;

    if (!IsDBCSLeadByte(*here))
        return 0;

    before = CharPrev(s, here);
    if (before == here)
        return 1;               /* CharPrev did not move */
    if (!IsDBCSLeadByte(*before))
        return 1;               /* previous character is not a lead byte */
    return here - before == 2;  /* previous character spans two bytes */
}
4532
4533/*
4534 * Decode MBCS string into unicode object. If 'final' is set, converts
4535 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
4536 */
4537static int decode_mbcs(PyUnicodeObject **v,
Benjamin Peterson29060642009-01-31 22:14:21 +00004538 const char *s, /* MBCS string */
4539 int size, /* sizeof MBCS string */
4540 int final)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004541{
4542 Py_UNICODE *p;
4543 Py_ssize_t n = 0;
4544 int usize = 0;
4545
4546 assert(size >= 0);
4547
4548 /* Skip trailing lead-byte unless 'final' is set */
4549 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00004550 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004551
4552 /* First get the size of the result */
4553 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004554 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
4555 if (usize == 0) {
4556 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4557 return -1;
4558 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004559 }
4560
4561 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004562 /* Create unicode object */
4563 *v = _PyUnicode_New(usize);
4564 if (*v == NULL)
4565 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004566 }
4567 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004568 /* Extend unicode object */
4569 n = PyUnicode_GET_SIZE(*v);
4570 if (_PyUnicode_Resize(v, n + usize) < 0)
4571 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004572 }
4573
4574 /* Do the conversion */
4575 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004576 p = PyUnicode_AS_UNICODE(*v) + n;
4577 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
4578 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4579 return -1;
4580 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004581 }
4582
4583 return size;
4584}
4585
4586PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004587 Py_ssize_t size,
4588 const char *errors,
4589 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004590{
4591 PyUnicodeObject *v = NULL;
4592 int done;
4593
4594 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004595 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004596
4597#ifdef NEED_RETRY
4598 retry:
4599 if (size > INT_MAX)
Benjamin Peterson29060642009-01-31 22:14:21 +00004600 done = decode_mbcs(&v, s, INT_MAX, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004601 else
4602#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004603 done = decode_mbcs(&v, s, (int)size, !consumed);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004604
4605 if (done < 0) {
4606 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00004607 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004608 }
4609
4610 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004611 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004612
4613#ifdef NEED_RETRY
4614 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004615 s += done;
4616 size -= done;
4617 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004618 }
4619#endif
4620
4621 return (PyObject *)v;
4622}
4623
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004624PyObject *PyUnicode_DecodeMBCS(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004625 Py_ssize_t size,
4626 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004627{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004628 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
4629}
4630
4631/*
4632 * Convert unicode into string object (MBCS).
4633 * Returns 0 if succeed, -1 otherwise.
4634 */
4635static int encode_mbcs(PyObject **repr,
Benjamin Peterson29060642009-01-31 22:14:21 +00004636 const Py_UNICODE *p, /* unicode */
4637 int size) /* size of unicode */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004638{
4639 int mbcssize = 0;
4640 Py_ssize_t n = 0;
4641
4642 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004643
4644 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004645 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004646 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
4647 if (mbcssize == 0) {
4648 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4649 return -1;
4650 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004651 }
4652
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004653 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004654 /* Create string object */
4655 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
4656 if (*repr == NULL)
4657 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004658 }
4659 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004660 /* Extend string object */
4661 n = PyBytes_Size(*repr);
4662 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
4663 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004664 }
4665
4666 /* Do the conversion */
4667 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004668 char *s = PyBytes_AS_STRING(*repr) + n;
4669 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
4670 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4671 return -1;
4672 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004673 }
4674
4675 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004676}
4677
4678PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004679 Py_ssize_t size,
4680 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004681{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004682 PyObject *repr = NULL;
4683 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00004684
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004685#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00004686 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004687 if (size > INT_MAX)
Benjamin Peterson29060642009-01-31 22:14:21 +00004688 ret = encode_mbcs(&repr, p, INT_MAX);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004689 else
4690#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004691 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004692
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004693 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004694 Py_XDECREF(repr);
4695 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004696 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004697
4698#ifdef NEED_RETRY
4699 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004700 p += INT_MAX;
4701 size -= INT_MAX;
4702 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004703 }
4704#endif
4705
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004706 return repr;
4707}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004708
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004709PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4710{
4711 if (!PyUnicode_Check(unicode)) {
4712 PyErr_BadArgument();
4713 return NULL;
4714 }
4715 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004716 PyUnicode_GET_SIZE(unicode),
4717 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004718}
4719
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004720#undef NEED_RETRY
4721
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004722#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004723
Guido van Rossumd57fd912000-03-10 22:53:23 +00004724/* --- Character Mapping Codec -------------------------------------------- */
4725
Guido van Rossumd57fd912000-03-10 22:53:23 +00004726PyObject *PyUnicode_DecodeCharmap(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004727 Py_ssize_t size,
4728 PyObject *mapping,
4729 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004730{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004731 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004732 Py_ssize_t startinpos;
4733 Py_ssize_t endinpos;
4734 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004735 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004736 PyUnicodeObject *v;
4737 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004738 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004739 PyObject *errorHandler = NULL;
4740 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004741 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004742 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004743
Guido van Rossumd57fd912000-03-10 22:53:23 +00004744 /* Default to Latin-1 */
4745 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004746 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004747
4748 v = _PyUnicode_New(size);
4749 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004750 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004751 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004752 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004753 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004754 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004755 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004756 mapstring = PyUnicode_AS_UNICODE(mapping);
4757 maplen = PyUnicode_GET_SIZE(mapping);
4758 while (s < e) {
4759 unsigned char ch = *s;
4760 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004761
Benjamin Peterson29060642009-01-31 22:14:21 +00004762 if (ch < maplen)
4763 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004764
Benjamin Peterson29060642009-01-31 22:14:21 +00004765 if (x == 0xfffe) {
4766 /* undefined mapping */
4767 outpos = p-PyUnicode_AS_UNICODE(v);
4768 startinpos = s-starts;
4769 endinpos = startinpos+1;
4770 if (unicode_decode_call_errorhandler(
4771 errors, &errorHandler,
4772 "charmap", "character maps to <undefined>",
4773 &starts, &e, &startinpos, &endinpos, &exc, &s,
4774 &v, &outpos, &p)) {
4775 goto onError;
4776 }
4777 continue;
4778 }
4779 *p++ = x;
4780 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004781 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004782 }
4783 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004784 while (s < e) {
4785 unsigned char ch = *s;
4786 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004787
Benjamin Peterson29060642009-01-31 22:14:21 +00004788 /* Get mapping (char ordinal -> integer, Unicode char or None) */
4789 w = PyLong_FromLong((long)ch);
4790 if (w == NULL)
4791 goto onError;
4792 x = PyObject_GetItem(mapping, w);
4793 Py_DECREF(w);
4794 if (x == NULL) {
4795 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4796 /* No mapping found means: mapping is undefined. */
4797 PyErr_Clear();
4798 x = Py_None;
4799 Py_INCREF(x);
4800 } else
4801 goto onError;
4802 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004803
Benjamin Peterson29060642009-01-31 22:14:21 +00004804 /* Apply mapping */
4805 if (PyLong_Check(x)) {
4806 long value = PyLong_AS_LONG(x);
4807 if (value < 0 || value > 65535) {
4808 PyErr_SetString(PyExc_TypeError,
4809 "character mapping must be in range(65536)");
4810 Py_DECREF(x);
4811 goto onError;
4812 }
4813 *p++ = (Py_UNICODE)value;
4814 }
4815 else if (x == Py_None) {
4816 /* undefined mapping */
4817 outpos = p-PyUnicode_AS_UNICODE(v);
4818 startinpos = s-starts;
4819 endinpos = startinpos+1;
4820 if (unicode_decode_call_errorhandler(
4821 errors, &errorHandler,
4822 "charmap", "character maps to <undefined>",
4823 &starts, &e, &startinpos, &endinpos, &exc, &s,
4824 &v, &outpos, &p)) {
4825 Py_DECREF(x);
4826 goto onError;
4827 }
4828 Py_DECREF(x);
4829 continue;
4830 }
4831 else if (PyUnicode_Check(x)) {
4832 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004833
Benjamin Peterson29060642009-01-31 22:14:21 +00004834 if (targetsize == 1)
4835 /* 1-1 mapping */
4836 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004837
Benjamin Peterson29060642009-01-31 22:14:21 +00004838 else if (targetsize > 1) {
4839 /* 1-n mapping */
4840 if (targetsize > extrachars) {
4841 /* resize first */
4842 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4843 Py_ssize_t needed = (targetsize - extrachars) + \
4844 (targetsize << 2);
4845 extrachars += needed;
4846 /* XXX overflow detection missing */
4847 if (_PyUnicode_Resize(&v,
4848 PyUnicode_GET_SIZE(v) + needed) < 0) {
4849 Py_DECREF(x);
4850 goto onError;
4851 }
4852 p = PyUnicode_AS_UNICODE(v) + oldpos;
4853 }
4854 Py_UNICODE_COPY(p,
4855 PyUnicode_AS_UNICODE(x),
4856 targetsize);
4857 p += targetsize;
4858 extrachars -= targetsize;
4859 }
4860 /* 1-0 mapping: skip the character */
4861 }
4862 else {
4863 /* wrong return value */
4864 PyErr_SetString(PyExc_TypeError,
4865 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00004866 Py_DECREF(x);
4867 goto onError;
4868 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004869 Py_DECREF(x);
4870 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004871 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004872 }
4873 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004874 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4875 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004876 Py_XDECREF(errorHandler);
4877 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004878 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004879
Benjamin Peterson29060642009-01-31 22:14:21 +00004880 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004881 Py_XDECREF(errorHandler);
4882 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004883 Py_XDECREF(v);
4884 return NULL;
4885}
4886
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004887/* Charmap encoding: the lookup table */
4888
4889struct encoding_map{
Benjamin Peterson29060642009-01-31 22:14:21 +00004890 PyObject_HEAD
4891 unsigned char level1[32];
4892 int count2, count3;
4893 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004894};
4895
4896static PyObject*
4897encoding_map_size(PyObject *obj, PyObject* args)
4898{
4899 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004900 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00004901 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004902}
4903
4904static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00004905 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00004906 PyDoc_STR("Return the size (in bytes) of this object") },
4907 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004908};
4909
4910static void
4911encoding_map_dealloc(PyObject* o)
4912{
Benjamin Peterson14339b62009-01-31 16:36:08 +00004913 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004914}
4915
4916static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00004917 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004918 "EncodingMap", /*tp_name*/
4919 sizeof(struct encoding_map), /*tp_basicsize*/
4920 0, /*tp_itemsize*/
4921 /* methods */
4922 encoding_map_dealloc, /*tp_dealloc*/
4923 0, /*tp_print*/
4924 0, /*tp_getattr*/
4925 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00004926 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00004927 0, /*tp_repr*/
4928 0, /*tp_as_number*/
4929 0, /*tp_as_sequence*/
4930 0, /*tp_as_mapping*/
4931 0, /*tp_hash*/
4932 0, /*tp_call*/
4933 0, /*tp_str*/
4934 0, /*tp_getattro*/
4935 0, /*tp_setattro*/
4936 0, /*tp_as_buffer*/
4937 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4938 0, /*tp_doc*/
4939 0, /*tp_traverse*/
4940 0, /*tp_clear*/
4941 0, /*tp_richcompare*/
4942 0, /*tp_weaklistoffset*/
4943 0, /*tp_iter*/
4944 0, /*tp_iternext*/
4945 encoding_map_methods, /*tp_methods*/
4946 0, /*tp_members*/
4947 0, /*tp_getset*/
4948 0, /*tp_base*/
4949 0, /*tp_dict*/
4950 0, /*tp_descr_get*/
4951 0, /*tp_descr_set*/
4952 0, /*tp_dictoffset*/
4953 0, /*tp_init*/
4954 0, /*tp_alloc*/
4955 0, /*tp_new*/
4956 0, /*tp_free*/
4957 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004958};
4959
4960PyObject*
4961PyUnicode_BuildEncodingMap(PyObject* string)
4962{
4963 Py_UNICODE *decode;
4964 PyObject *result;
4965 struct encoding_map *mresult;
4966 int i;
4967 int need_dict = 0;
4968 unsigned char level1[32];
4969 unsigned char level2[512];
4970 unsigned char *mlevel1, *mlevel2, *mlevel3;
4971 int count2 = 0, count3 = 0;
4972
4973 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4974 PyErr_BadArgument();
4975 return NULL;
4976 }
4977 decode = PyUnicode_AS_UNICODE(string);
4978 memset(level1, 0xFF, sizeof level1);
4979 memset(level2, 0xFF, sizeof level2);
4980
4981 /* If there isn't a one-to-one mapping of NULL to \0,
4982 or if there are non-BMP characters, we need to use
4983 a mapping dictionary. */
4984 if (decode[0] != 0)
4985 need_dict = 1;
4986 for (i = 1; i < 256; i++) {
4987 int l1, l2;
4988 if (decode[i] == 0
Benjamin Peterson29060642009-01-31 22:14:21 +00004989#ifdef Py_UNICODE_WIDE
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004990 || decode[i] > 0xFFFF
Benjamin Peterson29060642009-01-31 22:14:21 +00004991#endif
4992 ) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004993 need_dict = 1;
4994 break;
4995 }
4996 if (decode[i] == 0xFFFE)
4997 /* unmapped character */
4998 continue;
4999 l1 = decode[i] >> 11;
5000 l2 = decode[i] >> 7;
5001 if (level1[l1] == 0xFF)
5002 level1[l1] = count2++;
5003 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00005004 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005005 }
5006
5007 if (count2 >= 0xFF || count3 >= 0xFF)
5008 need_dict = 1;
5009
5010 if (need_dict) {
5011 PyObject *result = PyDict_New();
5012 PyObject *key, *value;
5013 if (!result)
5014 return NULL;
5015 for (i = 0; i < 256; i++) {
5016 key = value = NULL;
Christian Heimes217cfd12007-12-02 14:31:20 +00005017 key = PyLong_FromLong(decode[i]);
5018 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005019 if (!key || !value)
5020 goto failed1;
5021 if (PyDict_SetItem(result, key, value) == -1)
5022 goto failed1;
5023 Py_DECREF(key);
5024 Py_DECREF(value);
5025 }
5026 return result;
5027 failed1:
5028 Py_XDECREF(key);
5029 Py_XDECREF(value);
5030 Py_DECREF(result);
5031 return NULL;
5032 }
5033
5034 /* Create a three-level trie */
5035 result = PyObject_MALLOC(sizeof(struct encoding_map) +
5036 16*count2 + 128*count3 - 1);
5037 if (!result)
5038 return PyErr_NoMemory();
5039 PyObject_Init(result, &EncodingMapType);
5040 mresult = (struct encoding_map*)result;
5041 mresult->count2 = count2;
5042 mresult->count3 = count3;
5043 mlevel1 = mresult->level1;
5044 mlevel2 = mresult->level23;
5045 mlevel3 = mresult->level23 + 16*count2;
5046 memcpy(mlevel1, level1, 32);
5047 memset(mlevel2, 0xFF, 16*count2);
5048 memset(mlevel3, 0, 128*count3);
5049 count3 = 0;
5050 for (i = 1; i < 256; i++) {
5051 int o1, o2, o3, i2, i3;
5052 if (decode[i] == 0xFFFE)
5053 /* unmapped character */
5054 continue;
5055 o1 = decode[i]>>11;
5056 o2 = (decode[i]>>7) & 0xF;
5057 i2 = 16*mlevel1[o1] + o2;
5058 if (mlevel2[i2] == 0xFF)
5059 mlevel2[i2] = count3++;
5060 o3 = decode[i] & 0x7F;
5061 i3 = 128*mlevel2[i2] + o3;
5062 mlevel3[i3] = i;
5063 }
5064 return result;
5065}
5066
5067static int
5068encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
5069{
5070 struct encoding_map *map = (struct encoding_map*)mapping;
5071 int l1 = c>>11;
5072 int l2 = (c>>7) & 0xF;
5073 int l3 = c & 0x7F;
5074 int i;
5075
5076#ifdef Py_UNICODE_WIDE
5077 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005078 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005079 }
5080#endif
5081 if (c == 0)
5082 return 0;
5083 /* level 1*/
5084 i = map->level1[l1];
5085 if (i == 0xFF) {
5086 return -1;
5087 }
5088 /* level 2*/
5089 i = map->level23[16*i+l2];
5090 if (i == 0xFF) {
5091 return -1;
5092 }
5093 /* level 3 */
5094 i = map->level23[16*map->count2 + 128*i + l3];
5095 if (i == 0) {
5096 return -1;
5097 }
5098 return i;
5099}
5100
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005101/* Lookup the character ch in the mapping. If the character
5102 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00005103 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005104static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005105{
Christian Heimes217cfd12007-12-02 14:31:20 +00005106 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005107 PyObject *x;
5108
5109 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005110 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005111 x = PyObject_GetItem(mapping, w);
5112 Py_DECREF(w);
5113 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005114 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5115 /* No mapping found means: mapping is undefined. */
5116 PyErr_Clear();
5117 x = Py_None;
5118 Py_INCREF(x);
5119 return x;
5120 } else
5121 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005122 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00005123 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005124 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00005125 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005126 long value = PyLong_AS_LONG(x);
5127 if (value < 0 || value > 255) {
5128 PyErr_SetString(PyExc_TypeError,
5129 "character mapping must be in range(256)");
5130 Py_DECREF(x);
5131 return NULL;
5132 }
5133 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005134 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005135 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00005136 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005137 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005138 /* wrong return value */
5139 PyErr_Format(PyExc_TypeError,
5140 "character mapping must return integer, bytes or None, not %.400s",
5141 x->ob_type->tp_name);
5142 Py_DECREF(x);
5143 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005144 }
5145}
5146
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005147static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00005148charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005149{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005150 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
5151 /* exponentially overallocate to minimize reallocations */
5152 if (requiredsize < 2*outsize)
5153 requiredsize = 2*outsize;
5154 if (_PyBytes_Resize(outobj, requiredsize))
5155 return -1;
5156 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005157}
5158
/* Outcome of charmapencode_output() */
typedef enum charmapencode_result {
    enc_SUCCESS,        /* the character was written to the output */
    enc_FAILED,         /* the character has no mapping */
    enc_EXCEPTION       /* an exception was set (lookup or resize failed) */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005162/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00005163 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005164 space is available. Return a new reference to the object that
5165 was put in the output buffer, or Py_None, if the mapping was undefined
5166 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00005167 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005168static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005169charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00005170 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005171{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005172 PyObject *rep;
5173 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00005174 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005175
Christian Heimes90aa7642007-12-19 02:45:37 +00005176 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005177 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00005178 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005179 if (res == -1)
5180 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00005181 if (outsize<requiredsize)
5182 if (charmapencode_resize(outobj, outpos, requiredsize))
5183 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00005184 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005185 outstart[(*outpos)++] = (char)res;
5186 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005187 }
5188
5189 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005190 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005191 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005192 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005193 Py_DECREF(rep);
5194 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005195 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005196 if (PyLong_Check(rep)) {
5197 Py_ssize_t requiredsize = *outpos+1;
5198 if (outsize<requiredsize)
5199 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5200 Py_DECREF(rep);
5201 return enc_EXCEPTION;
5202 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005203 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005204 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005205 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005206 else {
5207 const char *repchars = PyBytes_AS_STRING(rep);
5208 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
5209 Py_ssize_t requiredsize = *outpos+repsize;
5210 if (outsize<requiredsize)
5211 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5212 Py_DECREF(rep);
5213 return enc_EXCEPTION;
5214 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005215 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005216 memcpy(outstart + *outpos, repchars, repsize);
5217 *outpos += repsize;
5218 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005219 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005220 Py_DECREF(rep);
5221 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005222}
5223
5224/* handle an error in PyUnicode_EncodeCharmap
5225 Return 0 on success, -1 on error */
5226static
5227int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00005228 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005229 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00005230 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00005231 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005232{
5233 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005234 Py_ssize_t repsize;
5235 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005236 Py_UNICODE *uni2;
5237 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005238 Py_ssize_t collstartpos = *inpos;
5239 Py_ssize_t collendpos = *inpos+1;
5240 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005241 char *encoding = "charmap";
5242 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005243 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005244
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005245 /* find all unencodable characters */
5246 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005247 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00005248 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005249 int res = encoding_map_lookup(p[collendpos], mapping);
5250 if (res != -1)
5251 break;
5252 ++collendpos;
5253 continue;
5254 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005255
Benjamin Peterson29060642009-01-31 22:14:21 +00005256 rep = charmapencode_lookup(p[collendpos], mapping);
5257 if (rep==NULL)
5258 return -1;
5259 else if (rep!=Py_None) {
5260 Py_DECREF(rep);
5261 break;
5262 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005263 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00005264 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005265 }
5266 /* cache callback name lookup
5267 * (if not done yet, i.e. it's the first error) */
5268 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005269 if ((errors==NULL) || (!strcmp(errors, "strict")))
5270 *known_errorHandler = 1;
5271 else if (!strcmp(errors, "replace"))
5272 *known_errorHandler = 2;
5273 else if (!strcmp(errors, "ignore"))
5274 *known_errorHandler = 3;
5275 else if (!strcmp(errors, "xmlcharrefreplace"))
5276 *known_errorHandler = 4;
5277 else
5278 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005279 }
5280 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005281 case 1: /* strict */
5282 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5283 return -1;
5284 case 2: /* replace */
5285 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005286 x = charmapencode_output('?', mapping, res, respos);
5287 if (x==enc_EXCEPTION) {
5288 return -1;
5289 }
5290 else if (x==enc_FAILED) {
5291 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5292 return -1;
5293 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005294 }
5295 /* fall through */
5296 case 3: /* ignore */
5297 *inpos = collendpos;
5298 break;
5299 case 4: /* xmlcharrefreplace */
5300 /* generate replacement (temporarily (mis)uses p) */
5301 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005302 char buffer[2+29+1+1];
5303 char *cp;
5304 sprintf(buffer, "&#%d;", (int)p[collpos]);
5305 for (cp = buffer; *cp; ++cp) {
5306 x = charmapencode_output(*cp, mapping, res, respos);
5307 if (x==enc_EXCEPTION)
5308 return -1;
5309 else if (x==enc_FAILED) {
5310 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5311 return -1;
5312 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005313 }
5314 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005315 *inpos = collendpos;
5316 break;
5317 default:
5318 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00005319 encoding, reason, p, size, exceptionObject,
5320 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005321 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005322 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005323 if (PyBytes_Check(repunicode)) {
5324 /* Directly copy bytes result to output. */
5325 Py_ssize_t outsize = PyBytes_Size(*res);
5326 Py_ssize_t requiredsize;
5327 repsize = PyBytes_Size(repunicode);
5328 requiredsize = *respos + repsize;
5329 if (requiredsize > outsize)
5330 /* Make room for all additional bytes. */
5331 if (charmapencode_resize(res, respos, requiredsize)) {
5332 Py_DECREF(repunicode);
5333 return -1;
5334 }
5335 memcpy(PyBytes_AsString(*res) + *respos,
5336 PyBytes_AsString(repunicode), repsize);
5337 *respos += repsize;
5338 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005339 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005340 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005341 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005342 /* generate replacement */
5343 repsize = PyUnicode_GET_SIZE(repunicode);
5344 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005345 x = charmapencode_output(*uni2, mapping, res, respos);
5346 if (x==enc_EXCEPTION) {
5347 return -1;
5348 }
5349 else if (x==enc_FAILED) {
5350 Py_DECREF(repunicode);
5351 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5352 return -1;
5353 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005354 }
5355 *inpos = newpos;
5356 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005357 }
5358 return 0;
5359}
5360
Guido van Rossumd57fd912000-03-10 22:53:23 +00005361PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005362 Py_ssize_t size,
5363 PyObject *mapping,
5364 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005365{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005366 /* output object */
5367 PyObject *res = NULL;
5368 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005369 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005370 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005371 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005372 PyObject *errorHandler = NULL;
5373 PyObject *exc = NULL;
5374 /* the following variable is used for caching string comparisons
5375 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5376 * 3=ignore, 4=xmlcharrefreplace */
5377 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005378
5379 /* Default to Latin-1 */
5380 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005381 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005382
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005383 /* allocate enough for a simple encoding without
5384 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00005385 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005386 if (res == NULL)
5387 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005388 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005389 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005390
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005391 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005392 /* try to encode it */
5393 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
5394 if (x==enc_EXCEPTION) /* error */
5395 goto onError;
5396 if (x==enc_FAILED) { /* unencodable character */
5397 if (charmap_encoding_error(p, size, &inpos, mapping,
5398 &exc,
5399 &known_errorHandler, &errorHandler, errors,
5400 &res, &respos)) {
5401 goto onError;
5402 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005403 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005404 else
5405 /* done with this character => adjust input position */
5406 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005407 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005408
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005409 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00005410 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005411 if (_PyBytes_Resize(&res, respos) < 0)
5412 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005413
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005414 Py_XDECREF(exc);
5415 Py_XDECREF(errorHandler);
5416 return res;
5417
Benjamin Peterson29060642009-01-31 22:14:21 +00005418 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005419 Py_XDECREF(res);
5420 Py_XDECREF(exc);
5421 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005422 return NULL;
5423}
5424
5425PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00005426 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005427{
5428 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005429 PyErr_BadArgument();
5430 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005431 }
5432 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005433 PyUnicode_GET_SIZE(unicode),
5434 mapping,
5435 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005436}
5437
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005438/* create or adjust a UnicodeTranslateError */
5439static void make_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005440 const Py_UNICODE *unicode, Py_ssize_t size,
5441 Py_ssize_t startpos, Py_ssize_t endpos,
5442 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005443{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005444 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005445 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00005446 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005447 }
5448 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005449 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
5450 goto onError;
5451 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
5452 goto onError;
5453 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
5454 goto onError;
5455 return;
5456 onError:
5457 Py_DECREF(*exceptionObject);
5458 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005459 }
5460}
5461
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005462/* raises a UnicodeTranslateError */
5463static void raise_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005464 const Py_UNICODE *unicode, Py_ssize_t size,
5465 Py_ssize_t startpos, Py_ssize_t endpos,
5466 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005467{
5468 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005469 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005470 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005471 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005472}
5473
5474/* error handling callback helper:
5475 build arguments, call the callback and check the arguments,
5476 put the result into newpos and return the replacement string, which
5477 has to be freed by the caller */
5478static PyObject *unicode_translate_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00005479 PyObject **errorHandler,
5480 const char *reason,
5481 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5482 Py_ssize_t startpos, Py_ssize_t endpos,
5483 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005484{
Benjamin Peterson142957c2008-07-04 19:55:29 +00005485 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005486
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005487 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005488 PyObject *restuple;
5489 PyObject *resunicode;
5490
5491 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005492 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005493 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005494 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005495 }
5496
5497 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005498 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005499 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005500 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005501
5502 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00005503 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005504 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005505 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005506 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00005507 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00005508 Py_DECREF(restuple);
5509 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005510 }
5511 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00005512 &resunicode, &i_newpos)) {
5513 Py_DECREF(restuple);
5514 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005515 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00005516 if (i_newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005517 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005518 else
5519 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005520 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005521 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
5522 Py_DECREF(restuple);
5523 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005524 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005525 Py_INCREF(resunicode);
5526 Py_DECREF(restuple);
5527 return resunicode;
5528}
5529
5530/* Lookup the character ch in the mapping and put the result in result,
5531 which must be decrefed by the caller.
5532 Return 0 on success, -1 on error */
5533static
5534int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
5535{
Christian Heimes217cfd12007-12-02 14:31:20 +00005536 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005537 PyObject *x;
5538
5539 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005540 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005541 x = PyObject_GetItem(mapping, w);
5542 Py_DECREF(w);
5543 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005544 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5545 /* No mapping found means: use 1:1 mapping. */
5546 PyErr_Clear();
5547 *result = NULL;
5548 return 0;
5549 } else
5550 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005551 }
5552 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005553 *result = x;
5554 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005555 }
Christian Heimes217cfd12007-12-02 14:31:20 +00005556 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005557 long value = PyLong_AS_LONG(x);
5558 long max = PyUnicode_GetMax();
5559 if (value < 0 || value > max) {
5560 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00005561 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00005562 Py_DECREF(x);
5563 return -1;
5564 }
5565 *result = x;
5566 return 0;
5567 }
5568 else if (PyUnicode_Check(x)) {
5569 *result = x;
5570 return 0;
5571 }
5572 else {
5573 /* wrong return value */
5574 PyErr_SetString(PyExc_TypeError,
5575 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00005576 Py_DECREF(x);
5577 return -1;
5578 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005579}
5580/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00005581 if not reallocate and adjust various state variables.
5582 Return 0 on success, -1 on error */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005583static
Walter Dörwald4894c302003-10-24 14:25:28 +00005584int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Peterson29060642009-01-31 22:14:21 +00005585 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005586{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005587 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00005588 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005589 /* remember old output position */
5590 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
5591 /* exponentially overallocate to minimize reallocations */
5592 if (requiredsize < 2 * oldsize)
5593 requiredsize = 2 * oldsize;
5594 if (PyUnicode_Resize(outobj, requiredsize) < 0)
5595 return -1;
5596 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005597 }
5598 return 0;
5599}
5600/* lookup the character, put the result in the output string and adjust
5601 various state variables. Return a new reference to the object that
5602 was put in the output buffer in *result, or Py_None, if the mapping was
5603 undefined (in which case no character was written).
5604 The called must decref result.
5605 Return 0 on success, -1 on error. */
5606static
Walter Dörwald4894c302003-10-24 14:25:28 +00005607int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Benjamin Peterson29060642009-01-31 22:14:21 +00005608 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
5609 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005610{
Walter Dörwald4894c302003-10-24 14:25:28 +00005611 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00005612 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005613 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005614 /* not found => default to 1:1 mapping */
5615 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005616 }
5617 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005618 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00005619 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005620 /* no overflow check, because we know that the space is enough */
5621 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005622 }
5623 else if (PyUnicode_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005624 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
5625 if (repsize==1) {
5626 /* no overflow check, because we know that the space is enough */
5627 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
5628 }
5629 else if (repsize!=0) {
5630 /* more than one character */
5631 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
5632 (insize - (curinp-startinp)) +
5633 repsize - 1;
5634 if (charmaptranslate_makespace(outobj, outp, requiredsize))
5635 return -1;
5636 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
5637 *outp += repsize;
5638 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005639 }
5640 else
Benjamin Peterson29060642009-01-31 22:14:21 +00005641 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005642 return 0;
5643}
5644
5645PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005646 Py_ssize_t size,
5647 PyObject *mapping,
5648 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005649{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005650 /* output object */
5651 PyObject *res = NULL;
5652 /* pointers to the beginning and end+1 of input */
5653 const Py_UNICODE *startp = p;
5654 const Py_UNICODE *endp = p + size;
5655 /* pointer into the output */
5656 Py_UNICODE *str;
5657 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005658 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005659 char *reason = "character maps to <undefined>";
5660 PyObject *errorHandler = NULL;
5661 PyObject *exc = NULL;
5662 /* the following variable is used for caching string comparisons
5663 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5664 * 3=ignore, 4=xmlcharrefreplace */
5665 int known_errorHandler = -1;
5666
Guido van Rossumd57fd912000-03-10 22:53:23 +00005667 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005668 PyErr_BadArgument();
5669 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005670 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005671
5672 /* allocate enough for a simple 1:1 translation without
5673 replacements, if we need more, we'll resize */
5674 res = PyUnicode_FromUnicode(NULL, size);
5675 if (res == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005676 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005677 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005678 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005679 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005680
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005681 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005682 /* try to encode it */
5683 PyObject *x = NULL;
5684 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
5685 Py_XDECREF(x);
5686 goto onError;
5687 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005688 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00005689 if (x!=Py_None) /* it worked => adjust input pointer */
5690 ++p;
5691 else { /* untranslatable character */
5692 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
5693 Py_ssize_t repsize;
5694 Py_ssize_t newpos;
5695 Py_UNICODE *uni2;
5696 /* startpos for collecting untranslatable chars */
5697 const Py_UNICODE *collstart = p;
5698 const Py_UNICODE *collend = p+1;
5699 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005700
Benjamin Peterson29060642009-01-31 22:14:21 +00005701 /* find all untranslatable characters */
5702 while (collend < endp) {
5703 if (charmaptranslate_lookup(*collend, mapping, &x))
5704 goto onError;
5705 Py_XDECREF(x);
5706 if (x!=Py_None)
5707 break;
5708 ++collend;
5709 }
5710 /* cache callback name lookup
5711 * (if not done yet, i.e. it's the first error) */
5712 if (known_errorHandler==-1) {
5713 if ((errors==NULL) || (!strcmp(errors, "strict")))
5714 known_errorHandler = 1;
5715 else if (!strcmp(errors, "replace"))
5716 known_errorHandler = 2;
5717 else if (!strcmp(errors, "ignore"))
5718 known_errorHandler = 3;
5719 else if (!strcmp(errors, "xmlcharrefreplace"))
5720 known_errorHandler = 4;
5721 else
5722 known_errorHandler = 0;
5723 }
5724 switch (known_errorHandler) {
5725 case 1: /* strict */
5726 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005727 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005728 case 2: /* replace */
5729 /* No need to check for space, this is a 1:1 replacement */
5730 for (coll = collstart; coll<collend; ++coll)
5731 *str++ = '?';
5732 /* fall through */
5733 case 3: /* ignore */
5734 p = collend;
5735 break;
5736 case 4: /* xmlcharrefreplace */
5737 /* generate replacement (temporarily (mis)uses p) */
5738 for (p = collstart; p < collend; ++p) {
5739 char buffer[2+29+1+1];
5740 char *cp;
5741 sprintf(buffer, "&#%d;", (int)*p);
5742 if (charmaptranslate_makespace(&res, &str,
5743 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5744 goto onError;
5745 for (cp = buffer; *cp; ++cp)
5746 *str++ = *cp;
5747 }
5748 p = collend;
5749 break;
5750 default:
5751 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5752 reason, startp, size, &exc,
5753 collstart-startp, collend-startp, &newpos);
5754 if (repunicode == NULL)
5755 goto onError;
5756 /* generate replacement */
5757 repsize = PyUnicode_GET_SIZE(repunicode);
5758 if (charmaptranslate_makespace(&res, &str,
5759 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5760 Py_DECREF(repunicode);
5761 goto onError;
5762 }
5763 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5764 *str++ = *uni2;
5765 p = startp + newpos;
5766 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005767 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005768 }
5769 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005770 /* Resize if we allocated to much */
5771 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00005772 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005773 if (PyUnicode_Resize(&res, respos) < 0)
5774 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005775 }
5776 Py_XDECREF(exc);
5777 Py_XDECREF(errorHandler);
5778 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005779
Benjamin Peterson29060642009-01-31 22:14:21 +00005780 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005781 Py_XDECREF(res);
5782 Py_XDECREF(exc);
5783 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005784 return NULL;
5785}
5786
5787PyObject *PyUnicode_Translate(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00005788 PyObject *mapping,
5789 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005790{
5791 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005792
Guido van Rossumd57fd912000-03-10 22:53:23 +00005793 str = PyUnicode_FromObject(str);
5794 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005795 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005796 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Peterson29060642009-01-31 22:14:21 +00005797 PyUnicode_GET_SIZE(str),
5798 mapping,
5799 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005800 Py_DECREF(str);
5801 return result;
Tim Petersced69f82003-09-16 20:30:58 +00005802
Benjamin Peterson29060642009-01-31 22:14:21 +00005803 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005804 Py_XDECREF(str);
5805 return NULL;
5806}
Tim Petersced69f82003-09-16 20:30:58 +00005807
Guido van Rossum9e896b32000-04-05 20:11:21 +00005808/* --- Decimal Encoder ---------------------------------------------------- */
5809
5810int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005811 Py_ssize_t length,
5812 char *output,
5813 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00005814{
5815 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005816 PyObject *errorHandler = NULL;
5817 PyObject *exc = NULL;
5818 const char *encoding = "decimal";
5819 const char *reason = "invalid decimal Unicode string";
5820 /* the following variable is used for caching string comparisons
5821 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5822 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005823
5824 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005825 PyErr_BadArgument();
5826 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005827 }
5828
5829 p = s;
5830 end = s + length;
5831 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005832 register Py_UNICODE ch = *p;
5833 int decimal;
5834 PyObject *repunicode;
5835 Py_ssize_t repsize;
5836 Py_ssize_t newpos;
5837 Py_UNICODE *uni2;
5838 Py_UNICODE *collstart;
5839 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005840
Benjamin Peterson29060642009-01-31 22:14:21 +00005841 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005842 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00005843 ++p;
5844 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005845 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005846 decimal = Py_UNICODE_TODECIMAL(ch);
5847 if (decimal >= 0) {
5848 *output++ = '0' + decimal;
5849 ++p;
5850 continue;
5851 }
5852 if (0 < ch && ch < 256) {
5853 *output++ = (char)ch;
5854 ++p;
5855 continue;
5856 }
5857 /* All other characters are considered unencodable */
5858 collstart = p;
5859 collend = p+1;
5860 while (collend < end) {
5861 if ((0 < *collend && *collend < 256) ||
5862 !Py_UNICODE_ISSPACE(*collend) ||
5863 Py_UNICODE_TODECIMAL(*collend))
5864 break;
5865 }
5866 /* cache callback name lookup
5867 * (if not done yet, i.e. it's the first error) */
5868 if (known_errorHandler==-1) {
5869 if ((errors==NULL) || (!strcmp(errors, "strict")))
5870 known_errorHandler = 1;
5871 else if (!strcmp(errors, "replace"))
5872 known_errorHandler = 2;
5873 else if (!strcmp(errors, "ignore"))
5874 known_errorHandler = 3;
5875 else if (!strcmp(errors, "xmlcharrefreplace"))
5876 known_errorHandler = 4;
5877 else
5878 known_errorHandler = 0;
5879 }
5880 switch (known_errorHandler) {
5881 case 1: /* strict */
5882 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5883 goto onError;
5884 case 2: /* replace */
5885 for (p = collstart; p < collend; ++p)
5886 *output++ = '?';
5887 /* fall through */
5888 case 3: /* ignore */
5889 p = collend;
5890 break;
5891 case 4: /* xmlcharrefreplace */
5892 /* generate replacement (temporarily (mis)uses p) */
5893 for (p = collstart; p < collend; ++p)
5894 output += sprintf(output, "&#%d;", (int)*p);
5895 p = collend;
5896 break;
5897 default:
5898 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5899 encoding, reason, s, length, &exc,
5900 collstart-s, collend-s, &newpos);
5901 if (repunicode == NULL)
5902 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005903 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00005904 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005905 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
5906 Py_DECREF(repunicode);
5907 goto onError;
5908 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005909 /* generate replacement */
5910 repsize = PyUnicode_GET_SIZE(repunicode);
5911 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5912 Py_UNICODE ch = *uni2;
5913 if (Py_UNICODE_ISSPACE(ch))
5914 *output++ = ' ';
5915 else {
5916 decimal = Py_UNICODE_TODECIMAL(ch);
5917 if (decimal >= 0)
5918 *output++ = '0' + decimal;
5919 else if (0 < ch && ch < 256)
5920 *output++ = (char)ch;
5921 else {
5922 Py_DECREF(repunicode);
5923 raise_encode_exception(&exc, encoding,
5924 s, length, collstart-s, collend-s, reason);
5925 goto onError;
5926 }
5927 }
5928 }
5929 p = s + newpos;
5930 Py_DECREF(repunicode);
5931 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00005932 }
5933 /* 0-terminate the output string */
5934 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005935 Py_XDECREF(exc);
5936 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005937 return 0;
5938
Benjamin Peterson29060642009-01-31 22:14:21 +00005939 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005940 Py_XDECREF(exc);
5941 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005942 return -1;
5943}
5944
Guido van Rossumd57fd912000-03-10 22:53:23 +00005945/* --- Helpers ------------------------------------------------------------ */
5946
Eric Smith8c663262007-08-25 02:26:07 +00005947#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005948#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00005949
Thomas Wouters477c8d52006-05-27 19:21:47 +00005950#include "stringlib/count.h"
5951#include "stringlib/find.h"
5952#include "stringlib/partition.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00005953#include "stringlib/split.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005954
Eric Smith5807c412008-05-11 21:00:57 +00005955#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
Eric Smitha3b1ac82009-04-03 14:45:06 +00005956#define _Py_InsertThousandsGroupingLocale _PyUnicode_InsertThousandsGroupingLocale
Eric Smith5807c412008-05-11 21:00:57 +00005957#include "stringlib/localeutil.h"
5958
/* Helper macro to fix up start/end slice values in place.

   - An `end` greater than `len` is clamped to `len`.
   - A negative `end` or `start` is taken relative to `len`, and clamped to 0
     if it is still negative afterwards.
   - `start` is NOT clamped against `len` or `end`, so callers must cope with
     start > end after adjustment.

   `start` and `end` must be modifiable lvalues.  Every argument may be
   evaluated more than once, so none of them may have side effects.  The
   body is wrapped in do { } while (0) so that the macro expands to exactly
   one statement and stays correct as the body of an unbraced if/else. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00005973
Martin v. Löwis18e16552006-02-15 17:27:45 +00005974Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005975 PyObject *substr,
5976 Py_ssize_t start,
5977 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005978{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005979 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005980 PyUnicodeObject* str_obj;
5981 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005982
Thomas Wouters477c8d52006-05-27 19:21:47 +00005983 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5984 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00005985 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005986 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5987 if (!sub_obj) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005988 Py_DECREF(str_obj);
5989 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005990 }
Tim Petersced69f82003-09-16 20:30:58 +00005991
Antoine Pitrouf2c54842010-01-13 08:07:53 +00005992 ADJUST_INDICES(start, end, str_obj->length);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005993 result = stringlib_count(
Antoine Pitrouf2c54842010-01-13 08:07:53 +00005994 str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
5995 PY_SSIZE_T_MAX
Thomas Wouters477c8d52006-05-27 19:21:47 +00005996 );
5997
5998 Py_DECREF(sub_obj);
5999 Py_DECREF(str_obj);
6000
Guido van Rossumd57fd912000-03-10 22:53:23 +00006001 return result;
6002}
6003
Martin v. Löwis18e16552006-02-15 17:27:45 +00006004Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00006005 PyObject *sub,
6006 Py_ssize_t start,
6007 Py_ssize_t end,
6008 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006009{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006010 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006011
Guido van Rossumd57fd912000-03-10 22:53:23 +00006012 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006013 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00006014 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006015 sub = PyUnicode_FromObject(sub);
6016 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006017 Py_DECREF(str);
6018 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006019 }
Tim Petersced69f82003-09-16 20:30:58 +00006020
Thomas Wouters477c8d52006-05-27 19:21:47 +00006021 if (direction > 0)
6022 result = stringlib_find_slice(
6023 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6024 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6025 start, end
6026 );
6027 else
6028 result = stringlib_rfind_slice(
6029 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6030 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6031 start, end
6032 );
6033
Guido van Rossumd57fd912000-03-10 22:53:23 +00006034 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006035 Py_DECREF(sub);
6036
Guido van Rossumd57fd912000-03-10 22:53:23 +00006037 return result;
6038}
6039
Tim Petersced69f82003-09-16 20:30:58 +00006040static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006041int tailmatch(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006042 PyUnicodeObject *substring,
6043 Py_ssize_t start,
6044 Py_ssize_t end,
6045 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006046{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006047 if (substring->length == 0)
6048 return 1;
6049
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006050 ADJUST_INDICES(start, end, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006051 end -= substring->length;
6052 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00006053 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006054
6055 if (direction > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006056 if (Py_UNICODE_MATCH(self, end, substring))
6057 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006058 } else {
6059 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00006060 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006061 }
6062
6063 return 0;
6064}
6065
Martin v. Löwis18e16552006-02-15 17:27:45 +00006066Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006067 PyObject *substr,
6068 Py_ssize_t start,
6069 Py_ssize_t end,
6070 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006071{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006072 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006073
Guido van Rossumd57fd912000-03-10 22:53:23 +00006074 str = PyUnicode_FromObject(str);
6075 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006076 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006077 substr = PyUnicode_FromObject(substr);
6078 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006079 Py_DECREF(str);
6080 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006081 }
Tim Petersced69f82003-09-16 20:30:58 +00006082
Guido van Rossumd57fd912000-03-10 22:53:23 +00006083 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006084 (PyUnicodeObject *)substr,
6085 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006086 Py_DECREF(str);
6087 Py_DECREF(substr);
6088 return result;
6089}
6090
Guido van Rossumd57fd912000-03-10 22:53:23 +00006091/* Apply fixfct filter to the Unicode object self and return a
6092 reference to the modified object */
6093
Tim Petersced69f82003-09-16 20:30:58 +00006094static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006095PyObject *fixup(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006096 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006097{
6098
6099 PyUnicodeObject *u;
6100
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006101 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006102 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006103 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006104
6105 Py_UNICODE_COPY(u->str, self->str, self->length);
6106
Tim Peters7a29bd52001-09-12 03:03:31 +00006107 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006108 /* fixfct should return TRUE if it modified the buffer. If
6109 FALSE, return a reference to the original buffer instead
6110 (to save space, not time) */
6111 Py_INCREF(self);
6112 Py_DECREF(u);
6113 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006114 }
6115 return (PyObject*) u;
6116}
6117
Tim Petersced69f82003-09-16 20:30:58 +00006118static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006119int fixupper(PyUnicodeObject *self)
6120{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006121 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006122 Py_UNICODE *s = self->str;
6123 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006124
Guido van Rossumd57fd912000-03-10 22:53:23 +00006125 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006126 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006127
Benjamin Peterson29060642009-01-31 22:14:21 +00006128 ch = Py_UNICODE_TOUPPER(*s);
6129 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006130 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006131 *s = ch;
6132 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006133 s++;
6134 }
6135
6136 return status;
6137}
6138
Tim Petersced69f82003-09-16 20:30:58 +00006139static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006140int fixlower(PyUnicodeObject *self)
6141{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006142 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006143 Py_UNICODE *s = self->str;
6144 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006145
Guido van Rossumd57fd912000-03-10 22:53:23 +00006146 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006147 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006148
Benjamin Peterson29060642009-01-31 22:14:21 +00006149 ch = Py_UNICODE_TOLOWER(*s);
6150 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006151 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006152 *s = ch;
6153 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006154 s++;
6155 }
6156
6157 return status;
6158}
6159
Tim Petersced69f82003-09-16 20:30:58 +00006160static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006161int fixswapcase(PyUnicodeObject *self)
6162{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006163 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006164 Py_UNICODE *s = self->str;
6165 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006166
Guido van Rossumd57fd912000-03-10 22:53:23 +00006167 while (len-- > 0) {
6168 if (Py_UNICODE_ISUPPER(*s)) {
6169 *s = Py_UNICODE_TOLOWER(*s);
6170 status = 1;
6171 } else if (Py_UNICODE_ISLOWER(*s)) {
6172 *s = Py_UNICODE_TOUPPER(*s);
6173 status = 1;
6174 }
6175 s++;
6176 }
6177
6178 return status;
6179}
6180
Tim Petersced69f82003-09-16 20:30:58 +00006181static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006182int fixcapitalize(PyUnicodeObject *self)
6183{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006184 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006185 Py_UNICODE *s = self->str;
6186 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006187
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006188 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006189 return 0;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006190 if (Py_UNICODE_ISLOWER(*s)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006191 *s = Py_UNICODE_TOUPPER(*s);
6192 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006193 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006194 s++;
6195 while (--len > 0) {
6196 if (Py_UNICODE_ISUPPER(*s)) {
6197 *s = Py_UNICODE_TOLOWER(*s);
6198 status = 1;
6199 }
6200 s++;
6201 }
6202 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006203}
6204
6205static
6206int fixtitle(PyUnicodeObject *self)
6207{
6208 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6209 register Py_UNICODE *e;
6210 int previous_is_cased;
6211
6212 /* Shortcut for single character strings */
6213 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006214 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
6215 if (*p != ch) {
6216 *p = ch;
6217 return 1;
6218 }
6219 else
6220 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006221 }
Tim Petersced69f82003-09-16 20:30:58 +00006222
Guido van Rossumd57fd912000-03-10 22:53:23 +00006223 e = p + PyUnicode_GET_SIZE(self);
6224 previous_is_cased = 0;
6225 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006226 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006227
Benjamin Peterson29060642009-01-31 22:14:21 +00006228 if (previous_is_cased)
6229 *p = Py_UNICODE_TOLOWER(ch);
6230 else
6231 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00006232
Benjamin Peterson29060642009-01-31 22:14:21 +00006233 if (Py_UNICODE_ISLOWER(ch) ||
6234 Py_UNICODE_ISUPPER(ch) ||
6235 Py_UNICODE_ISTITLE(ch))
6236 previous_is_cased = 1;
6237 else
6238 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006239 }
6240 return 1;
6241}
6242
Tim Peters8ce9f162004-08-27 01:49:32 +00006243PyObject *
6244PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006245{
Skip Montanaro6543b452004-09-16 03:28:13 +00006246 const Py_UNICODE blank = ' ';
6247 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006248 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006249 PyUnicodeObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00006250 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
6251 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006252 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
6253 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00006254 PyObject *item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006255 Py_ssize_t sz, i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006256
Tim Peters05eba1f2004-08-27 21:32:02 +00006257 fseq = PySequence_Fast(seq, "");
6258 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006259 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00006260 }
6261
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006262 /* NOTE: the following code can't call back into Python code,
6263 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00006264 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006265
Tim Peters05eba1f2004-08-27 21:32:02 +00006266 seqlen = PySequence_Fast_GET_SIZE(fseq);
6267 /* If empty sequence, return u"". */
6268 if (seqlen == 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006269 res = _PyUnicode_New(0); /* empty sequence; return u"" */
6270 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00006271 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006272 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00006273 /* If singleton sequence with an exact Unicode, return that. */
6274 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006275 item = items[0];
6276 if (PyUnicode_CheckExact(item)) {
6277 Py_INCREF(item);
6278 res = (PyUnicodeObject *)item;
6279 goto Done;
6280 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006281 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006282 else {
6283 /* Set up sep and seplen */
6284 if (separator == NULL) {
6285 sep = &blank;
6286 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006287 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006288 else {
6289 if (!PyUnicode_Check(separator)) {
6290 PyErr_Format(PyExc_TypeError,
6291 "separator: expected str instance,"
6292 " %.80s found",
6293 Py_TYPE(separator)->tp_name);
6294 goto onError;
6295 }
6296 sep = PyUnicode_AS_UNICODE(separator);
6297 seplen = PyUnicode_GET_SIZE(separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00006298 }
6299 }
6300
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006301 /* There are at least two things to join, or else we have a subclass
6302 * of str in the sequence.
6303 * Do a pre-pass to figure out the total amount of space we'll
6304 * need (sz), and see whether all argument are strings.
6305 */
6306 sz = 0;
6307 for (i = 0; i < seqlen; i++) {
6308 const Py_ssize_t old_sz = sz;
6309 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00006310 if (!PyUnicode_Check(item)) {
6311 PyErr_Format(PyExc_TypeError,
6312 "sequence item %zd: expected str instance,"
6313 " %.80s found",
6314 i, Py_TYPE(item)->tp_name);
6315 goto onError;
6316 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006317 sz += PyUnicode_GET_SIZE(item);
6318 if (i != 0)
6319 sz += seplen;
6320 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
6321 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006322 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006323 goto onError;
6324 }
6325 }
Tim Petersced69f82003-09-16 20:30:58 +00006326
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006327 res = _PyUnicode_New(sz);
6328 if (res == NULL)
6329 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00006330
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006331 /* Catenate everything. */
6332 res_p = PyUnicode_AS_UNICODE(res);
6333 for (i = 0; i < seqlen; ++i) {
6334 Py_ssize_t itemlen;
6335 item = items[i];
6336 itemlen = PyUnicode_GET_SIZE(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00006337 /* Copy item, and maybe the separator. */
6338 if (i) {
6339 Py_UNICODE_COPY(res_p, sep, seplen);
6340 res_p += seplen;
6341 }
6342 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
6343 res_p += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00006344 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006345
Benjamin Peterson29060642009-01-31 22:14:21 +00006346 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00006347 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006348 return (PyObject *)res;
6349
Benjamin Peterson29060642009-01-31 22:14:21 +00006350 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00006351 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00006352 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006353 return NULL;
6354}
6355
Tim Petersced69f82003-09-16 20:30:58 +00006356static
6357PyUnicodeObject *pad(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006358 Py_ssize_t left,
6359 Py_ssize_t right,
6360 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006361{
6362 PyUnicodeObject *u;
6363
6364 if (left < 0)
6365 left = 0;
6366 if (right < 0)
6367 right = 0;
6368
Tim Peters7a29bd52001-09-12 03:03:31 +00006369 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006370 Py_INCREF(self);
6371 return self;
6372 }
6373
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006374 if (left > PY_SSIZE_T_MAX - self->length ||
6375 right > PY_SSIZE_T_MAX - (left + self->length)) {
6376 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
6377 return NULL;
6378 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006379 u = _PyUnicode_New(left + self->length + right);
6380 if (u) {
6381 if (left)
6382 Py_UNICODE_FILL(u->str, fill, left);
6383 Py_UNICODE_COPY(u->str + left, self->str, self->length);
6384 if (right)
6385 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
6386 }
6387
6388 return u;
6389}
6390
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006391PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006392{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006393 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006394
6395 string = PyUnicode_FromObject(string);
6396 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006397 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006398
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006399 list = stringlib_splitlines(
6400 (PyObject*) string, PyUnicode_AS_UNICODE(string),
6401 PyUnicode_GET_SIZE(string), keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006402
6403 Py_DECREF(string);
6404 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006405}
6406
Tim Petersced69f82003-09-16 20:30:58 +00006407static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006408PyObject *split(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006409 PyUnicodeObject *substring,
6410 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006411{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006412 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006413 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006414
Guido van Rossumd57fd912000-03-10 22:53:23 +00006415 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006416 return stringlib_split_whitespace(
6417 (PyObject*) self, self->str, self->length, maxcount
6418 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006419
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006420 return stringlib_split(
6421 (PyObject*) self, self->str, self->length,
6422 substring->str, substring->length,
6423 maxcount
6424 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006425}
6426
Tim Petersced69f82003-09-16 20:30:58 +00006427static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006428PyObject *rsplit(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006429 PyUnicodeObject *substring,
6430 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006431{
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006432 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006433 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006434
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006435 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006436 return stringlib_rsplit_whitespace(
6437 (PyObject*) self, self->str, self->length, maxcount
6438 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006439
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006440 return stringlib_rsplit(
6441 (PyObject*) self, self->str, self->length,
6442 substring->str, substring->length,
6443 maxcount
6444 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006445}
6446
6447static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006448PyObject *replace(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006449 PyUnicodeObject *str1,
6450 PyUnicodeObject *str2,
6451 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006452{
6453 PyUnicodeObject *u;
6454
6455 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006456 maxcount = PY_SSIZE_T_MAX;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006457 else if (maxcount == 0 || self->length == 0)
6458 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006459
Thomas Wouters477c8d52006-05-27 19:21:47 +00006460 if (str1->length == str2->length) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00006461 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006462 /* same length */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006463 if (str1->length == 0)
6464 goto nothing;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006465 if (str1->length == 1) {
6466 /* replace characters */
6467 Py_UNICODE u1, u2;
6468 if (!findchar(self->str, self->length, str1->str[0]))
6469 goto nothing;
6470 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6471 if (!u)
6472 return NULL;
6473 Py_UNICODE_COPY(u->str, self->str, self->length);
6474 u1 = str1->str[0];
6475 u2 = str2->str[0];
6476 for (i = 0; i < u->length; i++)
6477 if (u->str[i] == u1) {
6478 if (--maxcount < 0)
6479 break;
6480 u->str[i] = u2;
6481 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006482 } else {
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006483 i = stringlib_find(
6484 self->str, self->length, str1->str, str1->length, 0
Guido van Rossumd57fd912000-03-10 22:53:23 +00006485 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00006486 if (i < 0)
6487 goto nothing;
6488 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6489 if (!u)
6490 return NULL;
6491 Py_UNICODE_COPY(u->str, self->str, self->length);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006492
6493 /* change everything in-place, starting with this one */
6494 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6495 i += str1->length;
6496
6497 while ( --maxcount > 0) {
6498 i = stringlib_find(self->str+i, self->length-i,
6499 str1->str, str1->length,
6500 i);
6501 if (i == -1)
6502 break;
6503 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6504 i += str1->length;
6505 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006506 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006507 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006508
6509 Py_ssize_t n, i, j, e;
6510 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006511 Py_UNICODE *p;
6512
6513 /* replace strings */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006514 n = stringlib_count(self->str, self->length, str1->str, str1->length,
6515 maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006516 if (n == 0)
6517 goto nothing;
6518 /* new_size = self->length + n * (str2->length - str1->length)); */
6519 delta = (str2->length - str1->length);
6520 if (delta == 0) {
6521 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006522 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006523 product = n * (str2->length - str1->length);
6524 if ((product / (str2->length - str1->length)) != n) {
6525 PyErr_SetString(PyExc_OverflowError,
6526 "replace string is too long");
6527 return NULL;
6528 }
6529 new_size = self->length + product;
6530 if (new_size < 0) {
6531 PyErr_SetString(PyExc_OverflowError,
6532 "replace string is too long");
6533 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006534 }
6535 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006536 u = _PyUnicode_New(new_size);
6537 if (!u)
6538 return NULL;
6539 i = 0;
6540 p = u->str;
6541 e = self->length - str1->length;
6542 if (str1->length > 0) {
6543 while (n-- > 0) {
6544 /* look for next match */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006545 j = stringlib_find(self->str+i, self->length-i,
6546 str1->str, str1->length,
6547 i);
6548 if (j == -1)
6549 break;
6550 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006551 /* copy unchanged part [i:j] */
6552 Py_UNICODE_COPY(p, self->str+i, j-i);
6553 p += j - i;
6554 }
6555 /* copy substitution string */
6556 if (str2->length > 0) {
6557 Py_UNICODE_COPY(p, str2->str, str2->length);
6558 p += str2->length;
6559 }
6560 i = j + str1->length;
6561 }
6562 if (i < self->length)
6563 /* copy tail [i:] */
6564 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6565 } else {
6566 /* interleave */
6567 while (n > 0) {
6568 Py_UNICODE_COPY(p, str2->str, str2->length);
6569 p += str2->length;
6570 if (--n <= 0)
6571 break;
6572 *p++ = self->str[i++];
6573 }
6574 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6575 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006576 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006577 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006578
Benjamin Peterson29060642009-01-31 22:14:21 +00006579 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00006580 /* nothing to replace; return original string (when possible) */
6581 if (PyUnicode_CheckExact(self)) {
6582 Py_INCREF(self);
6583 return (PyObject *) self;
6584 }
6585 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006586}
6587
6588/* --- Unicode Object Methods --------------------------------------------- */
6589
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006590PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006591 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006592\n\
6593Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006594characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006595
6596static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006597unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006598{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006599 return fixup(self, fixtitle);
6600}
6601
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006602PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006603 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006604\n\
6605Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006606have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006607
6608static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006609unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006610{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006611 return fixup(self, fixcapitalize);
6612}
6613
#if 0
/* Disabled code: everything up to the matching #endif is compiled out. */
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Split self on whitespace, run each word through fixup() with the
   fixcapitalize filter, then join the words with the default separator of
   PyUnicode_Join().  Returns NULL if any step fails. */
static PyObject *
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list slot in place */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *capitalized;

        capitalized = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                            fixcapitalize);
        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

    Py_DECREF(words);
    return result;
}
#endif
6651
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006652/* Argument converter. Coerces to a single unicode character */
6653
6654static int
6655convert_uc(PyObject *obj, void *addr)
6656{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006657 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6658 PyObject *uniobj;
6659 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006660
Benjamin Peterson14339b62009-01-31 16:36:08 +00006661 uniobj = PyUnicode_FromObject(obj);
6662 if (uniobj == NULL) {
6663 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006664 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006665 return 0;
6666 }
6667 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6668 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006669 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006670 Py_DECREF(uniobj);
6671 return 0;
6672 }
6673 unistr = PyUnicode_AS_UNICODE(uniobj);
6674 *fillcharloc = unistr[0];
6675 Py_DECREF(uniobj);
6676 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006677}
6678
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006679PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006680 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006681\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00006682Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006683done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006684
6685static PyObject *
6686unicode_center(PyUnicodeObject *self, PyObject *args)
6687{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006688 Py_ssize_t marg, left;
6689 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006690 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006691
Thomas Woutersde017742006-02-16 19:34:37 +00006692 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006693 return NULL;
6694
Tim Peters7a29bd52001-09-12 03:03:31 +00006695 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006696 Py_INCREF(self);
6697 return (PyObject*) self;
6698 }
6699
6700 marg = width - self->length;
6701 left = marg / 2 + (marg & width & 1);
6702
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006703 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006704}
6705
Marc-André Lemburge5034372000-08-08 08:04:29 +00006706#if 0
6707
6708/* This code should go into some future Unicode collation support
6709 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00006710 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006711
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006712/* speedy UTF-16 code point order comparison */
6713/* gleaned from: */
6714/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6715
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006716static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006717{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006718 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006719 0, 0, 0, 0, 0, 0, 0, 0,
6720 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006721 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006722};
6723
Guido van Rossumd57fd912000-03-10 22:53:23 +00006724static int
6725unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6726{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006727 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006728
Guido van Rossumd57fd912000-03-10 22:53:23 +00006729 Py_UNICODE *s1 = str1->str;
6730 Py_UNICODE *s2 = str2->str;
6731
6732 len1 = str1->length;
6733 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006734
Guido van Rossumd57fd912000-03-10 22:53:23 +00006735 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006736 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006737
6738 c1 = *s1++;
6739 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006740
Benjamin Peterson29060642009-01-31 22:14:21 +00006741 if (c1 > (1<<11) * 26)
6742 c1 += utf16Fixup[c1>>11];
6743 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006744 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006745 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006746
6747 if (c1 != c2)
6748 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006749
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006750 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006751 }
6752
6753 return (len1 < len2) ? -1 : (len1 != len2);
6754}
6755
Marc-André Lemburge5034372000-08-08 08:04:29 +00006756#else
6757
6758static int
6759unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6760{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006761 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006762
6763 Py_UNICODE *s1 = str1->str;
6764 Py_UNICODE *s2 = str2->str;
6765
6766 len1 = str1->length;
6767 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006768
Marc-André Lemburge5034372000-08-08 08:04:29 +00006769 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006770 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006771
Fredrik Lundh45714e92001-06-26 16:39:36 +00006772 c1 = *s1++;
6773 c2 = *s2++;
6774
6775 if (c1 != c2)
6776 return (c1 < c2) ? -1 : 1;
6777
Marc-André Lemburge5034372000-08-08 08:04:29 +00006778 len1--; len2--;
6779 }
6780
6781 return (len1 < len2) ? -1 : (len1 != len2);
6782}
6783
6784#endif
6785
Guido van Rossumd57fd912000-03-10 22:53:23 +00006786int PyUnicode_Compare(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00006787 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006788{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006789 if (PyUnicode_Check(left) && PyUnicode_Check(right))
6790 return unicode_compare((PyUnicodeObject *)left,
6791 (PyUnicodeObject *)right);
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006792 PyErr_Format(PyExc_TypeError,
6793 "Can't compare %.100s and %.100s",
6794 left->ob_type->tp_name,
6795 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006796 return -1;
6797}
6798
Martin v. Löwis5b222132007-06-10 09:51:05 +00006799int
6800PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
6801{
6802 int i;
6803 Py_UNICODE *id;
6804 assert(PyUnicode_Check(uni));
6805 id = PyUnicode_AS_UNICODE(uni);
6806 /* Compare Unicode string and source character set string */
6807 for (i = 0; id[i] && str[i]; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00006808 if (id[i] != str[i])
6809 return ((int)id[i] < (int)str[i]) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00006810 /* This check keeps Python strings that end in '\0' from comparing equal
6811 to C strings identical up to that point. */
Benjamin Petersona23831f2010-04-25 21:54:00 +00006812 if (PyUnicode_GET_SIZE(uni) != i || id[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00006813 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00006814 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00006815 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00006816 return 0;
6817}
6818
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006819
Benjamin Peterson29060642009-01-31 22:14:21 +00006820#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00006821 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006822
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006823PyObject *PyUnicode_RichCompare(PyObject *left,
6824 PyObject *right,
6825 int op)
6826{
6827 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006828
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006829 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
6830 PyObject *v;
6831 if (((PyUnicodeObject *) left)->length !=
6832 ((PyUnicodeObject *) right)->length) {
6833 if (op == Py_EQ) {
6834 Py_INCREF(Py_False);
6835 return Py_False;
6836 }
6837 if (op == Py_NE) {
6838 Py_INCREF(Py_True);
6839 return Py_True;
6840 }
6841 }
6842 if (left == right)
6843 result = 0;
6844 else
6845 result = unicode_compare((PyUnicodeObject *)left,
6846 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006847
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006848 /* Convert the return value to a Boolean */
6849 switch (op) {
6850 case Py_EQ:
6851 v = TEST_COND(result == 0);
6852 break;
6853 case Py_NE:
6854 v = TEST_COND(result != 0);
6855 break;
6856 case Py_LE:
6857 v = TEST_COND(result <= 0);
6858 break;
6859 case Py_GE:
6860 v = TEST_COND(result >= 0);
6861 break;
6862 case Py_LT:
6863 v = TEST_COND(result == -1);
6864 break;
6865 case Py_GT:
6866 v = TEST_COND(result == 1);
6867 break;
6868 default:
6869 PyErr_BadArgument();
6870 return NULL;
6871 }
6872 Py_INCREF(v);
6873 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006874 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006875
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006876 Py_INCREF(Py_NotImplemented);
6877 return Py_NotImplemented;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006878}
6879
Guido van Rossum403d68b2000-03-13 15:55:09 +00006880int PyUnicode_Contains(PyObject *container,
Benjamin Peterson29060642009-01-31 22:14:21 +00006881 PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00006882{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006883 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006884 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006885
6886 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006887 sub = PyUnicode_FromObject(element);
6888 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006889 PyErr_Format(PyExc_TypeError,
6890 "'in <string>' requires string as left operand, not %s",
6891 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006892 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006893 }
6894
Thomas Wouters477c8d52006-05-27 19:21:47 +00006895 str = PyUnicode_FromObject(container);
6896 if (!str) {
6897 Py_DECREF(sub);
6898 return -1;
6899 }
6900
6901 result = stringlib_contains_obj(str, sub);
6902
6903 Py_DECREF(str);
6904 Py_DECREF(sub);
6905
Guido van Rossum403d68b2000-03-13 15:55:09 +00006906 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006907}
6908
Guido van Rossumd57fd912000-03-10 22:53:23 +00006909/* Concat to string or Unicode object giving a new Unicode object. */
6910
6911PyObject *PyUnicode_Concat(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00006912 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006913{
6914 PyUnicodeObject *u = NULL, *v = NULL, *w;
6915
6916 /* Coerce the two arguments */
6917 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6918 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006919 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006920 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6921 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006922 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006923
6924 /* Shortcuts */
6925 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006926 Py_DECREF(v);
6927 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006928 }
6929 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006930 Py_DECREF(u);
6931 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006932 }
6933
6934 /* Concat the two Unicode strings */
6935 w = _PyUnicode_New(u->length + v->length);
6936 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006937 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006938 Py_UNICODE_COPY(w->str, u->str, u->length);
6939 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6940
6941 Py_DECREF(u);
6942 Py_DECREF(v);
6943 return (PyObject *)w;
6944
Benjamin Peterson29060642009-01-31 22:14:21 +00006945 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006946 Py_XDECREF(u);
6947 Py_XDECREF(v);
6948 return NULL;
6949}
6950
Walter Dörwald1ab83302007-05-18 17:15:44 +00006951void
6952PyUnicode_Append(PyObject **pleft, PyObject *right)
6953{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006954 PyObject *new;
6955 if (*pleft == NULL)
6956 return;
6957 if (right == NULL || !PyUnicode_Check(*pleft)) {
6958 Py_DECREF(*pleft);
6959 *pleft = NULL;
6960 return;
6961 }
6962 new = PyUnicode_Concat(*pleft, right);
6963 Py_DECREF(*pleft);
6964 *pleft = new;
Walter Dörwald1ab83302007-05-18 17:15:44 +00006965}
6966
6967void
6968PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
6969{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006970 PyUnicode_Append(pleft, right);
6971 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00006972}
6973
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006974PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006975 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006976\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00006977Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00006978string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006979interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006980
6981static PyObject *
6982unicode_count(PyUnicodeObject *self, PyObject *args)
6983{
6984 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006985 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006986 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006987 PyObject *result;
6988
Guido van Rossumb8872e62000-05-09 14:14:27 +00006989 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
Benjamin Peterson29060642009-01-31 22:14:21 +00006990 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006991 return NULL;
6992
6993 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006994 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006995 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006996 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006997
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006998 ADJUST_INDICES(start, end, self->length);
Christian Heimes217cfd12007-12-02 14:31:20 +00006999 result = PyLong_FromSsize_t(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007000 stringlib_count(self->str + start, end - start,
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007001 substring->str, substring->length,
7002 PY_SSIZE_T_MAX)
Thomas Wouters477c8d52006-05-27 19:21:47 +00007003 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007004
7005 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007006
Guido van Rossumd57fd912000-03-10 22:53:23 +00007007 return result;
7008}
7009
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007010PyDoc_STRVAR(encode__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007011 "S.encode([encoding[, errors]]) -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007012\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00007013Encode S using the codec registered for encoding. encoding defaults\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007014to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00007015handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007016a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
7017'xmlcharrefreplace' as well as any other name registered with\n\
7018codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007019
7020static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +00007021unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007022{
Benjamin Peterson308d6372009-09-18 21:42:35 +00007023 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00007024 char *encoding = NULL;
7025 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007026 PyObject *v;
Guido van Rossum35d94282007-08-27 18:20:11 +00007027
Benjamin Peterson308d6372009-09-18 21:42:35 +00007028 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
7029 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007030 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00007031 v = PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007032 if (v == NULL)
7033 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00007034 if (!PyBytes_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007035 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00007036 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007037 "(type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00007038 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007039 Py_DECREF(v);
7040 return NULL;
7041 }
7042 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007043
Benjamin Peterson29060642009-01-31 22:14:21 +00007044 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007045 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007046}
7047
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007048PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007049 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007050\n\
7051Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007052If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007053
7054static PyObject*
7055unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
7056{
7057 Py_UNICODE *e;
7058 Py_UNICODE *p;
7059 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007060 Py_UNICODE *qe;
7061 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007062 PyUnicodeObject *u;
7063 int tabsize = 8;
7064
7065 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007066 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007067
Thomas Wouters7e474022000-07-16 12:04:32 +00007068 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007069 i = 0; /* chars up to and including most recent \n or \r */
7070 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
7071 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007072 for (p = self->str; p < e; p++)
7073 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007074 if (tabsize > 0) {
7075 incr = tabsize - (j % tabsize); /* cannot overflow */
7076 if (j > PY_SSIZE_T_MAX - incr)
7077 goto overflow1;
7078 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007079 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007080 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007081 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007082 if (j > PY_SSIZE_T_MAX - 1)
7083 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007084 j++;
7085 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007086 if (i > PY_SSIZE_T_MAX - j)
7087 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007088 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007089 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007090 }
7091 }
7092
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007093 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +00007094 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00007095
Guido van Rossumd57fd912000-03-10 22:53:23 +00007096 /* Second pass: create output string and fill it */
7097 u = _PyUnicode_New(i + j);
7098 if (!u)
7099 return NULL;
7100
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007101 j = 0; /* same as in first pass */
7102 q = u->str; /* next output char */
7103 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007104
7105 for (p = self->str; p < e; p++)
7106 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007107 if (tabsize > 0) {
7108 i = tabsize - (j % tabsize);
7109 j += i;
7110 while (i--) {
7111 if (q >= qe)
7112 goto overflow2;
7113 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007114 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007115 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007116 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007117 else {
7118 if (q >= qe)
7119 goto overflow2;
7120 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007121 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007122 if (*p == '\n' || *p == '\r')
7123 j = 0;
7124 }
7125
7126 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007127
7128 overflow2:
7129 Py_DECREF(u);
7130 overflow1:
7131 PyErr_SetString(PyExc_OverflowError, "new string is too long");
7132 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007133}
7134
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007135PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007136 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007137\n\
7138Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007139such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007140arguments start and end are interpreted as in slice notation.\n\
7141\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007142Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007143
7144static PyObject *
7145unicode_find(PyUnicodeObject *self, PyObject *args)
7146{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007147 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007148 Py_ssize_t start;
7149 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007150 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007151
Christian Heimes9cd17752007-11-18 19:35:23 +00007152 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007153 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007154
Thomas Wouters477c8d52006-05-27 19:21:47 +00007155 result = stringlib_find_slice(
7156 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7157 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7158 start, end
7159 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007160
7161 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007162
Christian Heimes217cfd12007-12-02 14:31:20 +00007163 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007164}
7165
7166static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007167unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007168{
7169 if (index < 0 || index >= self->length) {
7170 PyErr_SetString(PyExc_IndexError, "string index out of range");
7171 return NULL;
7172 }
7173
7174 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
7175}
7176
Guido van Rossumc2504932007-09-18 19:42:40 +00007177/* Believe it or not, this produces the same value for ASCII strings
7178 as string_hash(). */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007179static long
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00007180unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007181{
Guido van Rossumc2504932007-09-18 19:42:40 +00007182 Py_ssize_t len;
7183 Py_UNICODE *p;
7184 long x;
7185
7186 if (self->hash != -1)
7187 return self->hash;
Christian Heimes90aa7642007-12-19 02:45:37 +00007188 len = Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007189 p = self->str;
7190 x = *p << 7;
7191 while (--len >= 0)
7192 x = (1000003*x) ^ *p++;
Christian Heimes90aa7642007-12-19 02:45:37 +00007193 x ^= Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007194 if (x == -1)
7195 x = -2;
7196 self->hash = x;
7197 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007198}
7199
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007200PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007201 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007202\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007203Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007204
7205static PyObject *
7206unicode_index(PyUnicodeObject *self, PyObject *args)
7207{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007208 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007209 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007210 Py_ssize_t start;
7211 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007212
Christian Heimes9cd17752007-11-18 19:35:23 +00007213 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007214 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007215
Thomas Wouters477c8d52006-05-27 19:21:47 +00007216 result = stringlib_find_slice(
7217 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7218 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7219 start, end
7220 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007221
7222 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007223
Guido van Rossumd57fd912000-03-10 22:53:23 +00007224 if (result < 0) {
7225 PyErr_SetString(PyExc_ValueError, "substring not found");
7226 return NULL;
7227 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007228
Christian Heimes217cfd12007-12-02 14:31:20 +00007229 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007230}
7231
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007232PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007233 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007234\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007235Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007236at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007237
7238static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007239unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007240{
7241 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7242 register const Py_UNICODE *e;
7243 int cased;
7244
Guido van Rossumd57fd912000-03-10 22:53:23 +00007245 /* Shortcut for single character strings */
7246 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007247 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007248
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007249 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007250 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007251 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007252
Guido van Rossumd57fd912000-03-10 22:53:23 +00007253 e = p + PyUnicode_GET_SIZE(self);
7254 cased = 0;
7255 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007256 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007257
Benjamin Peterson29060642009-01-31 22:14:21 +00007258 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
7259 return PyBool_FromLong(0);
7260 else if (!cased && Py_UNICODE_ISLOWER(ch))
7261 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007262 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007263 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007264}
7265
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007266PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007267 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007268\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007269Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007270at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007271
7272static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007273unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007274{
7275 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7276 register const Py_UNICODE *e;
7277 int cased;
7278
Guido van Rossumd57fd912000-03-10 22:53:23 +00007279 /* Shortcut for single character strings */
7280 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007281 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007282
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007283 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007284 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007285 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007286
Guido van Rossumd57fd912000-03-10 22:53:23 +00007287 e = p + PyUnicode_GET_SIZE(self);
7288 cased = 0;
7289 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007290 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007291
Benjamin Peterson29060642009-01-31 22:14:21 +00007292 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
7293 return PyBool_FromLong(0);
7294 else if (!cased && Py_UNICODE_ISUPPER(ch))
7295 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007296 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007297 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007298}
7299
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007300PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007301 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007302\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007303Return True if S is a titlecased string and there is at least one\n\
7304character in S, i.e. upper- and titlecase characters may only\n\
7305follow uncased characters and lowercase characters only cased ones.\n\
7306Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007307
7308static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007309unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007310{
7311 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7312 register const Py_UNICODE *e;
7313 int cased, previous_is_cased;
7314
Guido van Rossumd57fd912000-03-10 22:53:23 +00007315 /* Shortcut for single character strings */
7316 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007317 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
7318 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007319
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007320 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007321 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007322 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007323
Guido van Rossumd57fd912000-03-10 22:53:23 +00007324 e = p + PyUnicode_GET_SIZE(self);
7325 cased = 0;
7326 previous_is_cased = 0;
7327 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007328 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007329
Benjamin Peterson29060642009-01-31 22:14:21 +00007330 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
7331 if (previous_is_cased)
7332 return PyBool_FromLong(0);
7333 previous_is_cased = 1;
7334 cased = 1;
7335 }
7336 else if (Py_UNICODE_ISLOWER(ch)) {
7337 if (!previous_is_cased)
7338 return PyBool_FromLong(0);
7339 previous_is_cased = 1;
7340 cased = 1;
7341 }
7342 else
7343 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007344 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007345 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007346}
7347
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007348PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007349 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007350\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007351Return True if all characters in S are whitespace\n\
7352and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007353
7354static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007355unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007356{
7357 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7358 register const Py_UNICODE *e;
7359
Guido van Rossumd57fd912000-03-10 22:53:23 +00007360 /* Shortcut for single character strings */
7361 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007362 Py_UNICODE_ISSPACE(*p))
7363 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007364
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007365 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007366 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007367 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007368
Guido van Rossumd57fd912000-03-10 22:53:23 +00007369 e = p + PyUnicode_GET_SIZE(self);
7370 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007371 if (!Py_UNICODE_ISSPACE(*p))
7372 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007373 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007374 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007375}
7376
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007377PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007378 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007379\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007380Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007381and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007382
7383static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007384unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007385{
7386 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7387 register const Py_UNICODE *e;
7388
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007389 /* Shortcut for single character strings */
7390 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007391 Py_UNICODE_ISALPHA(*p))
7392 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007393
7394 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007395 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007396 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007397
7398 e = p + PyUnicode_GET_SIZE(self);
7399 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007400 if (!Py_UNICODE_ISALPHA(*p))
7401 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007402 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007403 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007404}
7405
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007406PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007407 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007408\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007409Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007410and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007411
7412static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007413unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007414{
7415 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7416 register const Py_UNICODE *e;
7417
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007418 /* Shortcut for single character strings */
7419 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007420 Py_UNICODE_ISALNUM(*p))
7421 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007422
7423 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007424 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007425 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007426
7427 e = p + PyUnicode_GET_SIZE(self);
7428 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007429 if (!Py_UNICODE_ISALNUM(*p))
7430 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007431 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007432 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007433}
7434
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007435PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007436 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007437\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007438Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007439False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007440
7441static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007442unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007443{
7444 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7445 register const Py_UNICODE *e;
7446
Guido van Rossumd57fd912000-03-10 22:53:23 +00007447 /* Shortcut for single character strings */
7448 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007449 Py_UNICODE_ISDECIMAL(*p))
7450 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007451
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007452 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007453 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007454 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007455
Guido van Rossumd57fd912000-03-10 22:53:23 +00007456 e = p + PyUnicode_GET_SIZE(self);
7457 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007458 if (!Py_UNICODE_ISDECIMAL(*p))
7459 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007460 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007461 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007462}
7463
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007464PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007465 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007466\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007467Return True if all characters in S are digits\n\
7468and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007469
7470static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007471unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007472{
7473 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7474 register const Py_UNICODE *e;
7475
Guido van Rossumd57fd912000-03-10 22:53:23 +00007476 /* Shortcut for single character strings */
7477 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007478 Py_UNICODE_ISDIGIT(*p))
7479 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007480
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007481 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007482 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007483 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007484
Guido van Rossumd57fd912000-03-10 22:53:23 +00007485 e = p + PyUnicode_GET_SIZE(self);
7486 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007487 if (!Py_UNICODE_ISDIGIT(*p))
7488 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007489 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007490 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007491}
7492
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007493PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007494 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007495\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007496Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007497False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007498
7499static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007500unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007501{
7502 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7503 register const Py_UNICODE *e;
7504
Guido van Rossumd57fd912000-03-10 22:53:23 +00007505 /* Shortcut for single character strings */
7506 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007507 Py_UNICODE_ISNUMERIC(*p))
7508 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007509
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007510 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007511 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007512 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007513
Guido van Rossumd57fd912000-03-10 22:53:23 +00007514 e = p + PyUnicode_GET_SIZE(self);
7515 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007516 if (!Py_UNICODE_ISNUMERIC(*p))
7517 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007518 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007519 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007520}
7521
Martin v. Löwis47383402007-08-15 07:32:56 +00007522int
7523PyUnicode_IsIdentifier(PyObject *self)
7524{
7525 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
7526 register const Py_UNICODE *e;
7527
7528 /* Special case for empty strings */
7529 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007530 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007531
7532 /* PEP 3131 says that the first character must be in
7533 XID_Start and subsequent characters in XID_Continue,
7534 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +00007535 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +00007536 letters, digits, underscore). However, given the current
7537 definition of XID_Start and XID_Continue, it is sufficient
7538 to check just for these, except that _ must be allowed
7539 as starting an identifier. */
7540 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
7541 return 0;
7542
7543 e = p + PyUnicode_GET_SIZE(self);
7544 for (p++; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007545 if (!_PyUnicode_IsXidContinue(*p))
7546 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007547 }
7548 return 1;
7549}
7550
7551PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007552 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +00007553\n\
7554Return True if S is a valid identifier according\n\
7555to the language definition.");
7556
7557static PyObject*
7558unicode_isidentifier(PyObject *self)
7559{
7560 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
7561}
7562
Georg Brandl559e5d72008-06-11 18:37:52 +00007563PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007564 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +00007565\n\
7566Return True if all characters in S are considered\n\
7567printable in repr() or S is empty, False otherwise.");
7568
7569static PyObject*
7570unicode_isprintable(PyObject *self)
7571{
7572 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7573 register const Py_UNICODE *e;
7574
7575 /* Shortcut for single character strings */
7576 if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) {
7577 Py_RETURN_TRUE;
7578 }
7579
7580 e = p + PyUnicode_GET_SIZE(self);
7581 for (; p < e; p++) {
7582 if (!Py_UNICODE_ISPRINTABLE(*p)) {
7583 Py_RETURN_FALSE;
7584 }
7585 }
7586 Py_RETURN_TRUE;
7587}
7588
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007589PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +00007590 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007591\n\
7592Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +00007593iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007594
7595static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007596unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007597{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007598 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007599}
7600
Martin v. Löwis18e16552006-02-15 17:27:45 +00007601static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007602unicode_length(PyUnicodeObject *self)
7603{
7604 return self->length;
7605}
7606
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007607PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007608 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007609\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00007610Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007611done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007612
7613static PyObject *
7614unicode_ljust(PyUnicodeObject *self, PyObject *args)
7615{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007616 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007617 Py_UNICODE fillchar = ' ';
7618
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007619 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007620 return NULL;
7621
Tim Peters7a29bd52001-09-12 03:03:31 +00007622 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007623 Py_INCREF(self);
7624 return (PyObject*) self;
7625 }
7626
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007627 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007628}
7629
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007630PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007631 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007632\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007633Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007634
7635static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007636unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007637{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007638 return fixup(self, fixlower);
7639}
7640
/* Strip modes understood by the strip helpers.  The values double as
   indexes into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format strings, indexed by strip mode. */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for strip mode i: the format string with its leading
   three characters ("|O:") skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
7649
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007650/* externally visible for str.strip(unicode) */
7651PyObject *
7652_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7653{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007654 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7655 Py_ssize_t len = PyUnicode_GET_SIZE(self);
7656 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
7657 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7658 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007659
Benjamin Peterson29060642009-01-31 22:14:21 +00007660 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007661
Benjamin Peterson14339b62009-01-31 16:36:08 +00007662 i = 0;
7663 if (striptype != RIGHTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007664 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7665 i++;
7666 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007667 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007668
Benjamin Peterson14339b62009-01-31 16:36:08 +00007669 j = len;
7670 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007671 do {
7672 j--;
7673 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7674 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007675 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007676
Benjamin Peterson14339b62009-01-31 16:36:08 +00007677 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007678 Py_INCREF(self);
7679 return (PyObject*)self;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007680 }
7681 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007682 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007683}
7684
Guido van Rossumd57fd912000-03-10 22:53:23 +00007685
7686static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007687do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007688{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007689 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7690 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007691
Benjamin Peterson14339b62009-01-31 16:36:08 +00007692 i = 0;
7693 if (striptype != RIGHTSTRIP) {
7694 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7695 i++;
7696 }
7697 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007698
Benjamin Peterson14339b62009-01-31 16:36:08 +00007699 j = len;
7700 if (striptype != LEFTSTRIP) {
7701 do {
7702 j--;
7703 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7704 j++;
7705 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007706
Benjamin Peterson14339b62009-01-31 16:36:08 +00007707 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7708 Py_INCREF(self);
7709 return (PyObject*)self;
7710 }
7711 else
7712 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007713}
7714
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007715
7716static PyObject *
7717do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7718{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007719 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007720
Benjamin Peterson14339b62009-01-31 16:36:08 +00007721 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7722 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007723
Benjamin Peterson14339b62009-01-31 16:36:08 +00007724 if (sep != NULL && sep != Py_None) {
7725 if (PyUnicode_Check(sep))
7726 return _PyUnicode_XStrip(self, striptype, sep);
7727 else {
7728 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00007729 "%s arg must be None or str",
7730 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +00007731 return NULL;
7732 }
7733 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007734
Benjamin Peterson14339b62009-01-31 16:36:08 +00007735 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007736}
7737
7738
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007739PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007740 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007741\n\
7742Return a copy of the string S with leading and trailing\n\
7743whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007744If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007745
7746static PyObject *
7747unicode_strip(PyUnicodeObject *self, PyObject *args)
7748{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007749 if (PyTuple_GET_SIZE(args) == 0)
7750 return do_strip(self, BOTHSTRIP); /* Common case */
7751 else
7752 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007753}
7754
7755
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007756PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007757 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007758\n\
7759Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007760If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007761
7762static PyObject *
7763unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7764{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007765 if (PyTuple_GET_SIZE(args) == 0)
7766 return do_strip(self, LEFTSTRIP); /* Common case */
7767 else
7768 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007769}
7770
7771
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007772PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007773 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007774\n\
7775Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007776If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007777
7778static PyObject *
7779unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7780{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007781 if (PyTuple_GET_SIZE(args) == 0)
7782 return do_strip(self, RIGHTSTRIP); /* Common case */
7783 else
7784 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007785}
7786
7787
Guido van Rossumd57fd912000-03-10 22:53:23 +00007788static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007789unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007790{
7791 PyUnicodeObject *u;
7792 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007793 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007794 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007795
Georg Brandl222de0f2009-04-12 12:01:50 +00007796 if (len < 1) {
7797 Py_INCREF(unicode_empty);
7798 return (PyObject *)unicode_empty;
7799 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007800
Tim Peters7a29bd52001-09-12 03:03:31 +00007801 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007802 /* no repeat, return original string */
7803 Py_INCREF(str);
7804 return (PyObject*) str;
7805 }
Tim Peters8f422462000-09-09 06:13:41 +00007806
7807 /* ensure # of chars needed doesn't overflow int and # of bytes
7808 * needed doesn't overflow size_t
7809 */
7810 nchars = len * str->length;
Georg Brandl222de0f2009-04-12 12:01:50 +00007811 if (nchars / len != str->length) {
Tim Peters8f422462000-09-09 06:13:41 +00007812 PyErr_SetString(PyExc_OverflowError,
7813 "repeated string is too long");
7814 return NULL;
7815 }
7816 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7817 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7818 PyErr_SetString(PyExc_OverflowError,
7819 "repeated string is too long");
7820 return NULL;
7821 }
7822 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007823 if (!u)
7824 return NULL;
7825
7826 p = u->str;
7827
Georg Brandl222de0f2009-04-12 12:01:50 +00007828 if (str->length == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007829 Py_UNICODE_FILL(p, str->str[0], len);
7830 } else {
Georg Brandl222de0f2009-04-12 12:01:50 +00007831 Py_ssize_t done = str->length; /* number of characters copied this far */
7832 Py_UNICODE_COPY(p, str->str, str->length);
Benjamin Peterson29060642009-01-31 22:14:21 +00007833 while (done < nchars) {
Christian Heimescc47b052008-03-25 14:56:36 +00007834 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007835 Py_UNICODE_COPY(p+done, p, n);
7836 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00007837 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007838 }
7839
7840 return (PyObject*) u;
7841}
7842
7843PyObject *PyUnicode_Replace(PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007844 PyObject *subobj,
7845 PyObject *replobj,
7846 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007847{
7848 PyObject *self;
7849 PyObject *str1;
7850 PyObject *str2;
7851 PyObject *result;
7852
7853 self = PyUnicode_FromObject(obj);
7854 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007855 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007856 str1 = PyUnicode_FromObject(subobj);
7857 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007858 Py_DECREF(self);
7859 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007860 }
7861 str2 = PyUnicode_FromObject(replobj);
7862 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007863 Py_DECREF(self);
7864 Py_DECREF(str1);
7865 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007866 }
Tim Petersced69f82003-09-16 20:30:58 +00007867 result = replace((PyUnicodeObject *)self,
Benjamin Peterson29060642009-01-31 22:14:21 +00007868 (PyUnicodeObject *)str1,
7869 (PyUnicodeObject *)str2,
7870 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007871 Py_DECREF(self);
7872 Py_DECREF(str1);
7873 Py_DECREF(str2);
7874 return result;
7875}
7876
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007877PyDoc_STRVAR(replace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007878 "S.replace (old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007879\n\
7880Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +00007881old replaced by new. If the optional argument count is\n\
7882given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007883
7884static PyObject*
7885unicode_replace(PyUnicodeObject *self, PyObject *args)
7886{
7887 PyUnicodeObject *str1;
7888 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007889 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007890 PyObject *result;
7891
Martin v. Löwis18e16552006-02-15 17:27:45 +00007892 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007893 return NULL;
7894 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7895 if (str1 == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007896 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007897 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007898 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007899 Py_DECREF(str1);
7900 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007901 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007902
7903 result = replace(self, str1, str2, maxcount);
7904
7905 Py_DECREF(str1);
7906 Py_DECREF(str2);
7907 return result;
7908}
7909
7910static
7911PyObject *unicode_repr(PyObject *unicode)
7912{
Walter Dörwald79e913e2007-05-12 11:08:06 +00007913 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007914 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007915 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
7916 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
7917
7918 /* XXX(nnorwitz): rather than over-allocating, it would be
7919 better to choose a different scheme. Perhaps scan the
7920 first N-chars of the string and allocate based on that size.
7921 */
7922 /* Initial allocation is based on the longest-possible unichr
7923 escape.
7924
7925 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
7926 unichr, so in this case it's the longest unichr escape. In
7927 narrow (UTF-16) builds this is five chars per source unichr
7928 since there are two unichrs in the surrogate pair, so in narrow
7929 (UTF-16) builds it's not the longest unichr escape.
7930
7931 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
7932 so in the narrow (UTF-16) build case it's the longest unichr
7933 escape.
7934 */
7935
Walter Dörwald1ab83302007-05-18 17:15:44 +00007936 repr = PyUnicode_FromUnicode(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00007937 2 /* quotes */
Walter Dörwald79e913e2007-05-12 11:08:06 +00007938#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00007939 + 10*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00007940#else
Benjamin Peterson29060642009-01-31 22:14:21 +00007941 + 6*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00007942#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00007943 + 1);
Walter Dörwald79e913e2007-05-12 11:08:06 +00007944 if (repr == NULL)
7945 return NULL;
7946
Walter Dörwald1ab83302007-05-18 17:15:44 +00007947 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00007948
7949 /* Add quote */
7950 *p++ = (findchar(s, size, '\'') &&
7951 !findchar(s, size, '"')) ? '"' : '\'';
7952 while (size-- > 0) {
7953 Py_UNICODE ch = *s++;
7954
7955 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007956 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007957 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007958 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007959 continue;
7960 }
7961
Benjamin Peterson29060642009-01-31 22:14:21 +00007962 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +00007963 if (ch == '\t') {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007964 *p++ = '\\';
7965 *p++ = 't';
7966 }
7967 else if (ch == '\n') {
7968 *p++ = '\\';
7969 *p++ = 'n';
7970 }
7971 else if (ch == '\r') {
7972 *p++ = '\\';
7973 *p++ = 'r';
7974 }
7975
7976 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +00007977 else if (ch < ' ' || ch == 0x7F) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007978 *p++ = '\\';
7979 *p++ = 'x';
7980 *p++ = hexdigits[(ch >> 4) & 0x000F];
7981 *p++ = hexdigits[ch & 0x000F];
7982 }
7983
Georg Brandl559e5d72008-06-11 18:37:52 +00007984 /* Copy ASCII characters as-is */
7985 else if (ch < 0x7F) {
7986 *p++ = ch;
7987 }
7988
Benjamin Peterson29060642009-01-31 22:14:21 +00007989 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +00007990 else {
7991 Py_UCS4 ucs = ch;
7992
7993#ifndef Py_UNICODE_WIDE
7994 Py_UNICODE ch2 = 0;
7995 /* Get code point from surrogate pair */
7996 if (size > 0) {
7997 ch2 = *s;
7998 if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
Benjamin Peterson29060642009-01-31 22:14:21 +00007999 && ch2 <= 0xDFFF) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008000 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
Benjamin Peterson29060642009-01-31 22:14:21 +00008001 + 0x00010000;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008002 s++;
Georg Brandl559e5d72008-06-11 18:37:52 +00008003 size--;
8004 }
8005 }
8006#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00008007 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +00008008 (categories Z* and C* except ASCII space)
8009 */
8010 if (!Py_UNICODE_ISPRINTABLE(ucs)) {
8011 /* Map 8-bit characters to '\xhh' */
8012 if (ucs <= 0xff) {
8013 *p++ = '\\';
8014 *p++ = 'x';
8015 *p++ = hexdigits[(ch >> 4) & 0x000F];
8016 *p++ = hexdigits[ch & 0x000F];
8017 }
8018 /* Map 21-bit characters to '\U00xxxxxx' */
8019 else if (ucs >= 0x10000) {
8020 *p++ = '\\';
8021 *p++ = 'U';
8022 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
8023 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
8024 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
8025 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
8026 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
8027 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
8028 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
8029 *p++ = hexdigits[ucs & 0x0000000F];
8030 }
8031 /* Map 16-bit characters to '\uxxxx' */
8032 else {
8033 *p++ = '\\';
8034 *p++ = 'u';
8035 *p++ = hexdigits[(ucs >> 12) & 0x000F];
8036 *p++ = hexdigits[(ucs >> 8) & 0x000F];
8037 *p++ = hexdigits[(ucs >> 4) & 0x000F];
8038 *p++ = hexdigits[ucs & 0x000F];
8039 }
8040 }
8041 /* Copy characters as-is */
8042 else {
8043 *p++ = ch;
8044#ifndef Py_UNICODE_WIDE
8045 if (ucs >= 0x10000)
8046 *p++ = ch2;
8047#endif
8048 }
8049 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00008050 }
8051 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008052 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00008053
8054 *p = '\0';
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00008055 PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00008056 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008057}
8058
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008059PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008060 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008061\n\
8062Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00008063such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008064arguments start and end are interpreted as in slice notation.\n\
8065\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008066Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008067
8068static PyObject *
8069unicode_rfind(PyUnicodeObject *self, PyObject *args)
8070{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008071 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008072 Py_ssize_t start;
8073 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008074 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008075
Christian Heimes9cd17752007-11-18 19:35:23 +00008076 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008077 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008078
Thomas Wouters477c8d52006-05-27 19:21:47 +00008079 result = stringlib_rfind_slice(
8080 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8081 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8082 start, end
8083 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008084
8085 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008086
Christian Heimes217cfd12007-12-02 14:31:20 +00008087 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008088}
8089
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008090PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008091 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008092\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008093Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008094
8095static PyObject *
8096unicode_rindex(PyUnicodeObject *self, PyObject *args)
8097{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008098 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008099 Py_ssize_t start;
8100 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008101 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008102
Christian Heimes9cd17752007-11-18 19:35:23 +00008103 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008104 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008105
Thomas Wouters477c8d52006-05-27 19:21:47 +00008106 result = stringlib_rfind_slice(
8107 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8108 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8109 start, end
8110 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008111
8112 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008113
Guido van Rossumd57fd912000-03-10 22:53:23 +00008114 if (result < 0) {
8115 PyErr_SetString(PyExc_ValueError, "substring not found");
8116 return NULL;
8117 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008118 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008119}
8120
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008121PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008122 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008123\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008124Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008125done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008126
8127static PyObject *
8128unicode_rjust(PyUnicodeObject *self, PyObject *args)
8129{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008130 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008131 Py_UNICODE fillchar = ' ';
8132
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008133 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008134 return NULL;
8135
Tim Peters7a29bd52001-09-12 03:03:31 +00008136 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008137 Py_INCREF(self);
8138 return (PyObject*) self;
8139 }
8140
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008141 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008142}
8143
Guido van Rossumd57fd912000-03-10 22:53:23 +00008144PyObject *PyUnicode_Split(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008145 PyObject *sep,
8146 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008147{
8148 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008149
Guido van Rossumd57fd912000-03-10 22:53:23 +00008150 s = PyUnicode_FromObject(s);
8151 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008152 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008153 if (sep != NULL) {
8154 sep = PyUnicode_FromObject(sep);
8155 if (sep == NULL) {
8156 Py_DECREF(s);
8157 return NULL;
8158 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008159 }
8160
8161 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8162
8163 Py_DECREF(s);
8164 Py_XDECREF(sep);
8165 return result;
8166}
8167
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008168PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008169 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008170\n\
8171Return a list of the words in S, using sep as the\n\
8172delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +00008173splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +00008174whitespace string is a separator and empty strings are\n\
8175removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008176
8177static PyObject*
8178unicode_split(PyUnicodeObject *self, PyObject *args)
8179{
8180 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008181 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008182
Martin v. Löwis18e16552006-02-15 17:27:45 +00008183 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008184 return NULL;
8185
8186 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008187 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008188 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008189 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008190 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008191 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008192}
8193
Thomas Wouters477c8d52006-05-27 19:21:47 +00008194PyObject *
8195PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
8196{
8197 PyObject* str_obj;
8198 PyObject* sep_obj;
8199 PyObject* out;
8200
8201 str_obj = PyUnicode_FromObject(str_in);
8202 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008203 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008204 sep_obj = PyUnicode_FromObject(sep_in);
8205 if (!sep_obj) {
8206 Py_DECREF(str_obj);
8207 return NULL;
8208 }
8209
8210 out = stringlib_partition(
8211 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8212 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8213 );
8214
8215 Py_DECREF(sep_obj);
8216 Py_DECREF(str_obj);
8217
8218 return out;
8219}
8220
8221
8222PyObject *
8223PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
8224{
8225 PyObject* str_obj;
8226 PyObject* sep_obj;
8227 PyObject* out;
8228
8229 str_obj = PyUnicode_FromObject(str_in);
8230 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008231 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008232 sep_obj = PyUnicode_FromObject(sep_in);
8233 if (!sep_obj) {
8234 Py_DECREF(str_obj);
8235 return NULL;
8236 }
8237
8238 out = stringlib_rpartition(
8239 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8240 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8241 );
8242
8243 Py_DECREF(sep_obj);
8244 Py_DECREF(str_obj);
8245
8246 return out;
8247}
8248
8249PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008250 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008251\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008252Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008253the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008254found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008255
8256static PyObject*
8257unicode_partition(PyUnicodeObject *self, PyObject *separator)
8258{
8259 return PyUnicode_Partition((PyObject *)self, separator);
8260}
8261
8262PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +00008263 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008264\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008265Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008266the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008267separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008268
8269static PyObject*
8270unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
8271{
8272 return PyUnicode_RPartition((PyObject *)self, separator);
8273}
8274
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008275PyObject *PyUnicode_RSplit(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008276 PyObject *sep,
8277 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008278{
8279 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008280
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008281 s = PyUnicode_FromObject(s);
8282 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008283 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008284 if (sep != NULL) {
8285 sep = PyUnicode_FromObject(sep);
8286 if (sep == NULL) {
8287 Py_DECREF(s);
8288 return NULL;
8289 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008290 }
8291
8292 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8293
8294 Py_DECREF(s);
8295 Py_XDECREF(sep);
8296 return result;
8297}
8298
8299PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008300 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008301\n\
8302Return a list of the words in S, using sep as the\n\
8303delimiter string, starting at the end of the string and\n\
8304working to the front. If maxsplit is given, at most maxsplit\n\
8305splits are done. If sep is not specified, any whitespace string\n\
8306is a separator.");
8307
8308static PyObject*
8309unicode_rsplit(PyUnicodeObject *self, PyObject *args)
8310{
8311 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008312 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008313
Martin v. Löwis18e16552006-02-15 17:27:45 +00008314 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008315 return NULL;
8316
8317 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008318 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008319 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008320 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008321 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008322 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008323}
8324
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008325PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008326 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008327\n\
8328Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00008329Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008330is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008331
8332static PyObject*
8333unicode_splitlines(PyUnicodeObject *self, PyObject *args)
8334{
Guido van Rossum86662912000-04-11 15:38:46 +00008335 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008336
Guido van Rossum86662912000-04-11 15:38:46 +00008337 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008338 return NULL;
8339
Guido van Rossum86662912000-04-11 15:38:46 +00008340 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008341}
8342
8343static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00008344PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008345{
Walter Dörwald346737f2007-05-31 10:44:43 +00008346 if (PyUnicode_CheckExact(self)) {
8347 Py_INCREF(self);
8348 return self;
8349 } else
8350 /* Subtype -- return genuine unicode string with the same value. */
8351 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
8352 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008353}
8354
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008355PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008356 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008357\n\
8358Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008359and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008360
8361static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008362unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008363{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008364 return fixup(self, fixswapcase);
8365}
8366
Georg Brandlceee0772007-11-27 23:48:05 +00008367PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008368 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008369\n\
8370Return a translation table usable for str.translate().\n\
8371If there is only one argument, it must be a dictionary mapping Unicode\n\
8372ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008373Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008374If there are two arguments, they must be strings of equal length, and\n\
8375in the resulting dictionary, each character in x will be mapped to the\n\
8376character at the same position in y. If there is a third argument, it\n\
8377must be a string, whose characters will be mapped to None in the result.");
8378
8379static PyObject*
8380unicode_maketrans(PyUnicodeObject *null, PyObject *args)
8381{
8382 PyObject *x, *y = NULL, *z = NULL;
8383 PyObject *new = NULL, *key, *value;
8384 Py_ssize_t i = 0;
8385 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008386
Georg Brandlceee0772007-11-27 23:48:05 +00008387 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
8388 return NULL;
8389 new = PyDict_New();
8390 if (!new)
8391 return NULL;
8392 if (y != NULL) {
8393 /* x must be a string too, of equal length */
8394 Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
8395 if (!PyUnicode_Check(x)) {
8396 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
8397 "be a string if there is a second argument");
8398 goto err;
8399 }
8400 if (PyUnicode_GET_SIZE(x) != ylen) {
8401 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
8402 "arguments must have equal length");
8403 goto err;
8404 }
8405 /* create entries for translating chars in x to those in y */
8406 for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008407 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
8408 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008409 if (!key || !value)
8410 goto err;
8411 res = PyDict_SetItem(new, key, value);
8412 Py_DECREF(key);
8413 Py_DECREF(value);
8414 if (res < 0)
8415 goto err;
8416 }
8417 /* create entries for deleting chars in z */
8418 if (z != NULL) {
8419 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008420 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008421 if (!key)
8422 goto err;
8423 res = PyDict_SetItem(new, key, Py_None);
8424 Py_DECREF(key);
8425 if (res < 0)
8426 goto err;
8427 }
8428 }
8429 } else {
8430 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +00008431 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008432 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
8433 "to maketrans it must be a dict");
8434 goto err;
8435 }
8436 /* copy entries into the new dict, converting string keys to int keys */
8437 while (PyDict_Next(x, &i, &key, &value)) {
8438 if (PyUnicode_Check(key)) {
8439 /* convert string keys to integer keys */
8440 PyObject *newkey;
8441 if (PyUnicode_GET_SIZE(key) != 1) {
8442 PyErr_SetString(PyExc_ValueError, "string keys in translate "
8443 "table must be of length 1");
8444 goto err;
8445 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008446 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
Georg Brandlceee0772007-11-27 23:48:05 +00008447 if (!newkey)
8448 goto err;
8449 res = PyDict_SetItem(new, newkey, value);
8450 Py_DECREF(newkey);
8451 if (res < 0)
8452 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +00008453 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008454 /* just keep integer keys */
8455 if (PyDict_SetItem(new, key, value) < 0)
8456 goto err;
8457 } else {
8458 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8459 "be strings or integers");
8460 goto err;
8461 }
8462 }
8463 }
8464 return new;
8465 err:
8466 Py_DECREF(new);
8467 return NULL;
8468}
8469
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008470PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008471 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008472\n\
8473Return a copy of the string S, where all characters have been mapped\n\
8474through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008475Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00008476Unmapped characters are left untouched. Characters mapped to None\n\
8477are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008478
8479static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008480unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008481{
Georg Brandlceee0772007-11-27 23:48:05 +00008482 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008483}
8484
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008485PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008486 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008487\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008488Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008489
8490static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008491unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008492{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008493 return fixup(self, fixupper);
8494}
8495
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008496PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008497 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008498\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +00008499Pad a numeric string S with zeros on the left, to fill a field\n\
8500of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008501
8502static PyObject *
8503unicode_zfill(PyUnicodeObject *self, PyObject *args)
8504{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008505 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008506 PyUnicodeObject *u;
8507
Martin v. Löwis18e16552006-02-15 17:27:45 +00008508 Py_ssize_t width;
8509 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008510 return NULL;
8511
8512 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00008513 if (PyUnicode_CheckExact(self)) {
8514 Py_INCREF(self);
8515 return (PyObject*) self;
8516 }
8517 else
8518 return PyUnicode_FromUnicode(
8519 PyUnicode_AS_UNICODE(self),
8520 PyUnicode_GET_SIZE(self)
Benjamin Peterson29060642009-01-31 22:14:21 +00008521 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008522 }
8523
8524 fill = width - self->length;
8525
8526 u = pad(self, fill, 0, '0');
8527
Walter Dörwald068325e2002-04-15 13:36:47 +00008528 if (u == NULL)
8529 return NULL;
8530
Guido van Rossumd57fd912000-03-10 22:53:23 +00008531 if (u->str[fill] == '+' || u->str[fill] == '-') {
8532 /* move sign to beginning of string */
8533 u->str[0] = u->str[fill];
8534 u->str[fill] = '0';
8535 }
8536
8537 return (PyObject*) u;
8538}
Guido van Rossumd57fd912000-03-10 22:53:23 +00008539
#if 0
/* Compiled out: returns the current value of `numfree` as a Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long count = numfree;

    return PyLong_FromLong(count);
}
#endif
8547
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008548PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008549 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008550\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008551Return True if S starts with the specified prefix, False otherwise.\n\
8552With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008553With optional end, stop comparing S at that position.\n\
8554prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008555
8556static PyObject *
8557unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008558 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008559{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008560 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008561 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008562 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008563 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008564 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008565
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008566 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008567 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8568 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008569 if (PyTuple_Check(subobj)) {
8570 Py_ssize_t i;
8571 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8572 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00008573 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008574 if (substring == NULL)
8575 return NULL;
8576 result = tailmatch(self, substring, start, end, -1);
8577 Py_DECREF(substring);
8578 if (result) {
8579 Py_RETURN_TRUE;
8580 }
8581 }
8582 /* nothing matched */
8583 Py_RETURN_FALSE;
8584 }
8585 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008586 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008587 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008588 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008589 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008590 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008591}
8592
8593
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008594PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008595 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008596\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008597Return True if S ends with the specified suffix, False otherwise.\n\
8598With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008599With optional end, stop comparing S at that position.\n\
8600suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008601
8602static PyObject *
8603unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008604 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008605{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008606 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008607 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008608 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008609 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008610 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008611
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008612 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008613 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8614 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008615 if (PyTuple_Check(subobj)) {
8616 Py_ssize_t i;
8617 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8618 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00008619 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008620 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008621 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008622 result = tailmatch(self, substring, start, end, +1);
8623 Py_DECREF(substring);
8624 if (result) {
8625 Py_RETURN_TRUE;
8626 }
8627 }
8628 Py_RETURN_FALSE;
8629 }
8630 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008631 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008632 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008633
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008634 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008635 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008636 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008637}
8638
Eric Smith8c663262007-08-25 02:26:07 +00008639#include "stringlib/string_format.h"
8640
8641PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008642 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008643\n\
8644");
8645
Eric Smith4a7d76d2008-05-30 18:10:19 +00008646static PyObject *
8647unicode__format__(PyObject* self, PyObject* args)
8648{
8649 PyObject *format_spec;
8650
8651 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
8652 return NULL;
8653
8654 return _PyUnicode_FormatAdvanced(self,
8655 PyUnicode_AS_UNICODE(format_spec),
8656 PyUnicode_GET_SIZE(format_spec));
8657}
8658
Eric Smith8c663262007-08-25 02:26:07 +00008659PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008660 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008661\n\
8662");
8663
8664static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008665unicode__sizeof__(PyUnicodeObject *v)
8666{
Robert Schuppeniesfbe94c52008-07-14 10:13:31 +00008667 return PyLong_FromSsize_t(sizeof(PyUnicodeObject) +
8668 sizeof(Py_UNICODE) * (v->length + 1));
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008669}
8670
8671PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008672 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008673
8674static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008675unicode_getnewargs(PyUnicodeObject *v)
8676{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008677 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008678}
8679
8680
Guido van Rossumd57fd912000-03-10 22:53:23 +00008681static PyMethodDef unicode_methods[] = {
8682
8683 /* Order is according to common usage: often used methods should
8684 appear first, since lookup is done sequentially. */
8685
Benjamin Peterson308d6372009-09-18 21:42:35 +00008686 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008687 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
8688 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008689 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008690 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
8691 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
8692 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
8693 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
8694 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
8695 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
8696 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008697 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008698 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
8699 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
8700 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008701 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008702 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
8703 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
8704 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008705 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008706 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008707 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008708 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008709 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
8710 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
8711 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
8712 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
8713 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
8714 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
8715 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
8716 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
8717 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
8718 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
8719 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
8720 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
8721 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
8722 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00008723 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +00008724 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008725 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +00008726 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +00008727 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Eric Smithf6db4092007-08-27 23:52:26 +00008728 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
8729 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Georg Brandlceee0772007-11-27 23:48:05 +00008730 {"maketrans", (PyCFunction) unicode_maketrans,
8731 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008732 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00008733#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008734 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008735#endif
8736
8737#if 0
8738 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008739 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008740#endif
8741
Benjamin Peterson14339b62009-01-31 16:36:08 +00008742 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008743 {NULL, NULL}
8744};
8745
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008746static PyObject *
8747unicode_mod(PyObject *v, PyObject *w)
8748{
Benjamin Peterson29060642009-01-31 22:14:21 +00008749 if (!PyUnicode_Check(v)) {
8750 Py_INCREF(Py_NotImplemented);
8751 return Py_NotImplemented;
8752 }
8753 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008754}
8755
8756static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008757 0, /*nb_add*/
8758 0, /*nb_subtract*/
8759 0, /*nb_multiply*/
8760 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008761};
8762
Guido van Rossumd57fd912000-03-10 22:53:23 +00008763static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008764 (lenfunc) unicode_length, /* sq_length */
8765 PyUnicode_Concat, /* sq_concat */
8766 (ssizeargfunc) unicode_repeat, /* sq_repeat */
8767 (ssizeargfunc) unicode_getitem, /* sq_item */
8768 0, /* sq_slice */
8769 0, /* sq_ass_item */
8770 0, /* sq_ass_slice */
8771 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008772};
8773
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008774static PyObject*
8775unicode_subscript(PyUnicodeObject* self, PyObject* item)
8776{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008777 if (PyIndex_Check(item)) {
8778 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008779 if (i == -1 && PyErr_Occurred())
8780 return NULL;
8781 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008782 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008783 return unicode_getitem(self, i);
8784 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008785 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008786 Py_UNICODE* source_buf;
8787 Py_UNICODE* result_buf;
8788 PyObject* result;
8789
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008790 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Benjamin Peterson29060642009-01-31 22:14:21 +00008791 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008792 return NULL;
8793 }
8794
8795 if (slicelength <= 0) {
8796 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00008797 } else if (start == 0 && step == 1 && slicelength == self->length &&
8798 PyUnicode_CheckExact(self)) {
8799 Py_INCREF(self);
8800 return (PyObject *)self;
8801 } else if (step == 1) {
8802 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008803 } else {
8804 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +00008805 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8806 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +00008807
Benjamin Peterson29060642009-01-31 22:14:21 +00008808 if (result_buf == NULL)
8809 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008810
8811 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8812 result_buf[i] = source_buf[cur];
8813 }
Tim Petersced69f82003-09-16 20:30:58 +00008814
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008815 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +00008816 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008817 return result;
8818 }
8819 } else {
8820 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8821 return NULL;
8822 }
8823}
8824
8825static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008826 (lenfunc)unicode_length, /* mp_length */
8827 (binaryfunc)unicode_subscript, /* mp_subscript */
8828 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008829};
8830
Guido van Rossumd57fd912000-03-10 22:53:23 +00008831
Guido van Rossumd57fd912000-03-10 22:53:23 +00008832/* Helpers for PyUnicode_Format() */
8833
8834static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008835getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008836{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008837 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008838 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008839 (*p_argidx)++;
8840 if (arglen < 0)
8841 return args;
8842 else
8843 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008844 }
8845 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008846 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008847 return NULL;
8848}
8849
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008850/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008851
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008852static PyObject *
8853formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008854{
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008855 char *p;
8856 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008857 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008858
Guido van Rossumd57fd912000-03-10 22:53:23 +00008859 x = PyFloat_AsDouble(v);
8860 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008861 return NULL;
8862
Guido van Rossumd57fd912000-03-10 22:53:23 +00008863 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008864 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +00008865
Eric Smith0923d1d2009-04-16 20:16:10 +00008866 p = PyOS_double_to_string(x, type, prec,
8867 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008868 if (p == NULL)
8869 return NULL;
8870 result = PyUnicode_FromStringAndSize(p, strlen(p));
Eric Smith0923d1d2009-04-16 20:16:10 +00008871 PyMem_Free(p);
8872 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008873}
8874
Tim Peters38fd5b62000-09-21 05:43:11 +00008875static PyObject*
8876formatlong(PyObject *val, int flags, int prec, int type)
8877{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008878 char *buf;
8879 int len;
8880 PyObject *str; /* temporary string object. */
8881 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008882
Benjamin Peterson14339b62009-01-31 16:36:08 +00008883 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
8884 if (!str)
8885 return NULL;
8886 result = PyUnicode_FromStringAndSize(buf, len);
8887 Py_DECREF(str);
8888 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008889}
8890
Guido van Rossumd57fd912000-03-10 22:53:23 +00008891static int
8892formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008893 size_t buflen,
8894 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008895{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008896 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008897 if (PyUnicode_Check(v)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008898 if (PyUnicode_GET_SIZE(v) == 1) {
8899 buf[0] = PyUnicode_AS_UNICODE(v)[0];
8900 buf[1] = '\0';
8901 return 1;
8902 }
8903#ifndef Py_UNICODE_WIDE
8904 if (PyUnicode_GET_SIZE(v) == 2) {
8905 /* Decode a valid surrogate pair */
8906 int c0 = PyUnicode_AS_UNICODE(v)[0];
8907 int c1 = PyUnicode_AS_UNICODE(v)[1];
8908 if (0xD800 <= c0 && c0 <= 0xDBFF &&
8909 0xDC00 <= c1 && c1 <= 0xDFFF) {
8910 buf[0] = c0;
8911 buf[1] = c1;
8912 buf[2] = '\0';
8913 return 2;
8914 }
8915 }
8916#endif
8917 goto onError;
8918 }
8919 else {
8920 /* Integer input truncated to a character */
8921 long x;
8922 x = PyLong_AsLong(v);
8923 if (x == -1 && PyErr_Occurred())
8924 goto onError;
8925
8926 if (x < 0 || x > 0x10ffff) {
8927 PyErr_SetString(PyExc_OverflowError,
8928 "%c arg not in range(0x110000)");
8929 return -1;
8930 }
8931
8932#ifndef Py_UNICODE_WIDE
8933 if (x > 0xffff) {
8934 x -= 0x10000;
8935 buf[0] = (Py_UNICODE)(0xD800 | (x >> 10));
8936 buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF));
8937 return 2;
8938 }
8939#endif
8940 buf[0] = (Py_UNICODE) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008941 buf[1] = '\0';
8942 return 1;
8943 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008944
Benjamin Peterson29060642009-01-31 22:14:21 +00008945 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008946 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008947 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008948 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008949}
8950
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008951/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008952 FORMATBUFLEN is the length of the buffer in which chars are formatted.
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008953*/
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008954#define FORMATBUFLEN (size_t)10
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008955
Guido van Rossumd57fd912000-03-10 22:53:23 +00008956PyObject *PyUnicode_Format(PyObject *format,
Benjamin Peterson29060642009-01-31 22:14:21 +00008957 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008958{
8959 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008960 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008961 int args_owned = 0;
8962 PyUnicodeObject *result = NULL;
8963 PyObject *dict = NULL;
8964 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008965
Guido van Rossumd57fd912000-03-10 22:53:23 +00008966 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008967 PyErr_BadInternalCall();
8968 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008969 }
8970 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008971 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008972 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008973 fmt = PyUnicode_AS_UNICODE(uformat);
8974 fmtcnt = PyUnicode_GET_SIZE(uformat);
8975
8976 reslen = rescnt = fmtcnt + 100;
8977 result = _PyUnicode_New(reslen);
8978 if (result == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008979 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008980 res = PyUnicode_AS_UNICODE(result);
8981
8982 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008983 arglen = PyTuple_Size(args);
8984 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008985 }
8986 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008987 arglen = -1;
8988 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008989 }
Christian Heimes90aa7642007-12-19 02:45:37 +00008990 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +00008991 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +00008992 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008993
8994 while (--fmtcnt >= 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008995 if (*fmt != '%') {
8996 if (--rescnt < 0) {
8997 rescnt = fmtcnt + 100;
8998 reslen += rescnt;
8999 if (_PyUnicode_Resize(&result, reslen) < 0)
9000 goto onError;
9001 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
9002 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009003 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009004 *res++ = *fmt++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009005 }
9006 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009007 /* Got a format specifier */
9008 int flags = 0;
9009 Py_ssize_t width = -1;
9010 int prec = -1;
9011 Py_UNICODE c = '\0';
9012 Py_UNICODE fill;
9013 int isnumok;
9014 PyObject *v = NULL;
9015 PyObject *temp = NULL;
9016 Py_UNICODE *pbuf;
9017 Py_UNICODE sign;
9018 Py_ssize_t len;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009019 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009020
Benjamin Peterson29060642009-01-31 22:14:21 +00009021 fmt++;
9022 if (*fmt == '(') {
9023 Py_UNICODE *keystart;
9024 Py_ssize_t keylen;
9025 PyObject *key;
9026 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +00009027
Benjamin Peterson29060642009-01-31 22:14:21 +00009028 if (dict == NULL) {
9029 PyErr_SetString(PyExc_TypeError,
9030 "format requires a mapping");
9031 goto onError;
9032 }
9033 ++fmt;
9034 --fmtcnt;
9035 keystart = fmt;
9036 /* Skip over balanced parentheses */
9037 while (pcount > 0 && --fmtcnt >= 0) {
9038 if (*fmt == ')')
9039 --pcount;
9040 else if (*fmt == '(')
9041 ++pcount;
9042 fmt++;
9043 }
9044 keylen = fmt - keystart - 1;
9045 if (fmtcnt < 0 || pcount > 0) {
9046 PyErr_SetString(PyExc_ValueError,
9047 "incomplete format key");
9048 goto onError;
9049 }
9050#if 0
9051 /* keys are converted to strings using UTF-8 and
9052 then looked up since Python uses strings to hold
9053 variables names etc. in its namespaces and we
9054 wouldn't want to break common idioms. */
9055 key = PyUnicode_EncodeUTF8(keystart,
9056 keylen,
9057 NULL);
9058#else
9059 key = PyUnicode_FromUnicode(keystart, keylen);
9060#endif
9061 if (key == NULL)
9062 goto onError;
9063 if (args_owned) {
9064 Py_DECREF(args);
9065 args_owned = 0;
9066 }
9067 args = PyObject_GetItem(dict, key);
9068 Py_DECREF(key);
9069 if (args == NULL) {
9070 goto onError;
9071 }
9072 args_owned = 1;
9073 arglen = -1;
9074 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009075 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009076 while (--fmtcnt >= 0) {
9077 switch (c = *fmt++) {
9078 case '-': flags |= F_LJUST; continue;
9079 case '+': flags |= F_SIGN; continue;
9080 case ' ': flags |= F_BLANK; continue;
9081 case '#': flags |= F_ALT; continue;
9082 case '0': flags |= F_ZERO; continue;
9083 }
9084 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009085 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009086 if (c == '*') {
9087 v = getnextarg(args, arglen, &argidx);
9088 if (v == NULL)
9089 goto onError;
9090 if (!PyLong_Check(v)) {
9091 PyErr_SetString(PyExc_TypeError,
9092 "* wants int");
9093 goto onError;
9094 }
9095 width = PyLong_AsLong(v);
9096 if (width == -1 && PyErr_Occurred())
9097 goto onError;
9098 if (width < 0) {
9099 flags |= F_LJUST;
9100 width = -width;
9101 }
9102 if (--fmtcnt >= 0)
9103 c = *fmt++;
9104 }
9105 else if (c >= '0' && c <= '9') {
9106 width = c - '0';
9107 while (--fmtcnt >= 0) {
9108 c = *fmt++;
9109 if (c < '0' || c > '9')
9110 break;
9111 if ((width*10) / 10 != width) {
9112 PyErr_SetString(PyExc_ValueError,
9113 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009114 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00009115 }
9116 width = width*10 + (c - '0');
9117 }
9118 }
9119 if (c == '.') {
9120 prec = 0;
9121 if (--fmtcnt >= 0)
9122 c = *fmt++;
9123 if (c == '*') {
9124 v = getnextarg(args, arglen, &argidx);
9125 if (v == NULL)
9126 goto onError;
9127 if (!PyLong_Check(v)) {
9128 PyErr_SetString(PyExc_TypeError,
9129 "* wants int");
9130 goto onError;
9131 }
9132 prec = PyLong_AsLong(v);
9133 if (prec == -1 && PyErr_Occurred())
9134 goto onError;
9135 if (prec < 0)
9136 prec = 0;
9137 if (--fmtcnt >= 0)
9138 c = *fmt++;
9139 }
9140 else if (c >= '0' && c <= '9') {
9141 prec = c - '0';
9142 while (--fmtcnt >= 0) {
9143 c = Py_CHARMASK(*fmt++);
9144 if (c < '0' || c > '9')
9145 break;
9146 if ((prec*10) / 10 != prec) {
9147 PyErr_SetString(PyExc_ValueError,
9148 "prec too big");
9149 goto onError;
9150 }
9151 prec = prec*10 + (c - '0');
9152 }
9153 }
9154 } /* prec */
9155 if (fmtcnt >= 0) {
9156 if (c == 'h' || c == 'l' || c == 'L') {
9157 if (--fmtcnt >= 0)
9158 c = *fmt++;
9159 }
9160 }
9161 if (fmtcnt < 0) {
9162 PyErr_SetString(PyExc_ValueError,
9163 "incomplete format");
9164 goto onError;
9165 }
9166 if (c != '%') {
9167 v = getnextarg(args, arglen, &argidx);
9168 if (v == NULL)
9169 goto onError;
9170 }
9171 sign = 0;
9172 fill = ' ';
9173 switch (c) {
9174
9175 case '%':
9176 pbuf = formatbuf;
9177 /* presume that buffer length is at least 1 */
9178 pbuf[0] = '%';
9179 len = 1;
9180 break;
9181
9182 case 's':
9183 case 'r':
9184 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +00009185 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009186 temp = v;
9187 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009188 }
9189 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009190 if (c == 's')
9191 temp = PyObject_Str(v);
9192 else if (c == 'r')
9193 temp = PyObject_Repr(v);
9194 else
9195 temp = PyObject_ASCII(v);
9196 if (temp == NULL)
9197 goto onError;
9198 if (PyUnicode_Check(temp))
9199 /* nothing to do */;
9200 else {
9201 Py_DECREF(temp);
9202 PyErr_SetString(PyExc_TypeError,
9203 "%s argument has non-string str()");
9204 goto onError;
9205 }
9206 }
9207 pbuf = PyUnicode_AS_UNICODE(temp);
9208 len = PyUnicode_GET_SIZE(temp);
9209 if (prec >= 0 && len > prec)
9210 len = prec;
9211 break;
9212
9213 case 'i':
9214 case 'd':
9215 case 'u':
9216 case 'o':
9217 case 'x':
9218 case 'X':
9219 if (c == 'i')
9220 c = 'd';
9221 isnumok = 0;
9222 if (PyNumber_Check(v)) {
9223 PyObject *iobj=NULL;
9224
9225 if (PyLong_Check(v)) {
9226 iobj = v;
9227 Py_INCREF(iobj);
9228 }
9229 else {
9230 iobj = PyNumber_Long(v);
9231 }
9232 if (iobj!=NULL) {
9233 if (PyLong_Check(iobj)) {
9234 isnumok = 1;
9235 temp = formatlong(iobj, flags, prec, c);
9236 Py_DECREF(iobj);
9237 if (!temp)
9238 goto onError;
9239 pbuf = PyUnicode_AS_UNICODE(temp);
9240 len = PyUnicode_GET_SIZE(temp);
9241 sign = 1;
9242 }
9243 else {
9244 Py_DECREF(iobj);
9245 }
9246 }
9247 }
9248 if (!isnumok) {
9249 PyErr_Format(PyExc_TypeError,
9250 "%%%c format: a number is required, "
9251 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
9252 goto onError;
9253 }
9254 if (flags & F_ZERO)
9255 fill = '0';
9256 break;
9257
9258 case 'e':
9259 case 'E':
9260 case 'f':
9261 case 'F':
9262 case 'g':
9263 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009264 temp = formatfloat(v, flags, prec, c);
9265 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +00009266 goto onError;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009267 pbuf = PyUnicode_AS_UNICODE(temp);
9268 len = PyUnicode_GET_SIZE(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009269 sign = 1;
9270 if (flags & F_ZERO)
9271 fill = '0';
9272 break;
9273
9274 case 'c':
9275 pbuf = formatbuf;
9276 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
9277 if (len < 0)
9278 goto onError;
9279 break;
9280
9281 default:
9282 PyErr_Format(PyExc_ValueError,
9283 "unsupported format character '%c' (0x%x) "
9284 "at index %zd",
9285 (31<=c && c<=126) ? (char)c : '?',
9286 (int)c,
9287 (Py_ssize_t)(fmt - 1 -
9288 PyUnicode_AS_UNICODE(uformat)));
9289 goto onError;
9290 }
9291 if (sign) {
9292 if (*pbuf == '-' || *pbuf == '+') {
9293 sign = *pbuf++;
9294 len--;
9295 }
9296 else if (flags & F_SIGN)
9297 sign = '+';
9298 else if (flags & F_BLANK)
9299 sign = ' ';
9300 else
9301 sign = 0;
9302 }
9303 if (width < len)
9304 width = len;
9305 if (rescnt - (sign != 0) < width) {
9306 reslen -= rescnt;
9307 rescnt = width + fmtcnt + 100;
9308 reslen += rescnt;
9309 if (reslen < 0) {
9310 Py_XDECREF(temp);
9311 PyErr_NoMemory();
9312 goto onError;
9313 }
9314 if (_PyUnicode_Resize(&result, reslen) < 0) {
9315 Py_XDECREF(temp);
9316 goto onError;
9317 }
9318 res = PyUnicode_AS_UNICODE(result)
9319 + reslen - rescnt;
9320 }
9321 if (sign) {
9322 if (fill != ' ')
9323 *res++ = sign;
9324 rescnt--;
9325 if (width > len)
9326 width--;
9327 }
9328 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9329 assert(pbuf[0] == '0');
9330 assert(pbuf[1] == c);
9331 if (fill != ' ') {
9332 *res++ = *pbuf++;
9333 *res++ = *pbuf++;
9334 }
9335 rescnt -= 2;
9336 width -= 2;
9337 if (width < 0)
9338 width = 0;
9339 len -= 2;
9340 }
9341 if (width > len && !(flags & F_LJUST)) {
9342 do {
9343 --rescnt;
9344 *res++ = fill;
9345 } while (--width > len);
9346 }
9347 if (fill == ' ') {
9348 if (sign)
9349 *res++ = sign;
9350 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9351 assert(pbuf[0] == '0');
9352 assert(pbuf[1] == c);
9353 *res++ = *pbuf++;
9354 *res++ = *pbuf++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009355 }
9356 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009357 Py_UNICODE_COPY(res, pbuf, len);
9358 res += len;
9359 rescnt -= len;
9360 while (--width >= len) {
9361 --rescnt;
9362 *res++ = ' ';
9363 }
9364 if (dict && (argidx < arglen) && c != '%') {
9365 PyErr_SetString(PyExc_TypeError,
9366 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00009367 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009368 goto onError;
9369 }
9370 Py_XDECREF(temp);
9371 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009372 } /* until end */
9373 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009374 PyErr_SetString(PyExc_TypeError,
9375 "not all arguments converted during string formatting");
9376 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009377 }
9378
Thomas Woutersa96affe2006-03-12 00:29:36 +00009379 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009380 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009381 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009382 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009383 }
9384 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009385 return (PyObject *)result;
9386
Benjamin Peterson29060642009-01-31 22:14:21 +00009387 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009388 Py_XDECREF(result);
9389 Py_DECREF(uformat);
9390 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009391 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009392 }
9393 return NULL;
9394}
9395
Jeremy Hylton938ace62002-07-17 16:30:39 +00009396static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00009397unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9398
Tim Peters6d6c1a32001-08-02 04:15:00 +00009399static PyObject *
9400unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9401{
Benjamin Peterson29060642009-01-31 22:14:21 +00009402 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009403 static char *kwlist[] = {"object", "encoding", "errors", 0};
9404 char *encoding = NULL;
9405 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00009406
Benjamin Peterson14339b62009-01-31 16:36:08 +00009407 if (type != &PyUnicode_Type)
9408 return unicode_subtype_new(type, args, kwds);
9409 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +00009410 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009411 return NULL;
9412 if (x == NULL)
9413 return (PyObject *)_PyUnicode_New(0);
9414 if (encoding == NULL && errors == NULL)
9415 return PyObject_Str(x);
9416 else
Benjamin Peterson29060642009-01-31 22:14:21 +00009417 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00009418}
9419
Guido van Rossume023fe02001-08-30 03:12:59 +00009420static PyObject *
9421unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9422{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009423 PyUnicodeObject *tmp, *pnew;
9424 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00009425
Benjamin Peterson14339b62009-01-31 16:36:08 +00009426 assert(PyType_IsSubtype(type, &PyUnicode_Type));
9427 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9428 if (tmp == NULL)
9429 return NULL;
9430 assert(PyUnicode_Check(tmp));
9431 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
9432 if (pnew == NULL) {
9433 Py_DECREF(tmp);
9434 return NULL;
9435 }
9436 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
9437 if (pnew->str == NULL) {
9438 _Py_ForgetReference((PyObject *)pnew);
9439 PyObject_Del(pnew);
9440 Py_DECREF(tmp);
9441 return PyErr_NoMemory();
9442 }
9443 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9444 pnew->length = n;
9445 pnew->hash = tmp->hash;
9446 Py_DECREF(tmp);
9447 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00009448}
9449
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009450PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +00009451 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00009452\n\
Collin Winterd474ce82007-08-07 19:42:11 +00009453Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00009454encoding defaults to the current default string encoding.\n\
9455errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00009456
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009457static PyObject *unicode_iter(PyObject *seq);
9458
Guido van Rossumd57fd912000-03-10 22:53:23 +00009459PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00009460 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +00009461 "str", /* tp_name */
9462 sizeof(PyUnicodeObject), /* tp_size */
9463 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009464 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009465 (destructor)unicode_dealloc, /* tp_dealloc */
9466 0, /* tp_print */
9467 0, /* tp_getattr */
9468 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +00009469 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009470 unicode_repr, /* tp_repr */
9471 &unicode_as_number, /* tp_as_number */
9472 &unicode_as_sequence, /* tp_as_sequence */
9473 &unicode_as_mapping, /* tp_as_mapping */
9474 (hashfunc) unicode_hash, /* tp_hash*/
9475 0, /* tp_call*/
9476 (reprfunc) unicode_str, /* tp_str */
9477 PyObject_GenericGetAttr, /* tp_getattro */
9478 0, /* tp_setattro */
9479 0, /* tp_as_buffer */
9480 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +00009481 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009482 unicode_doc, /* tp_doc */
9483 0, /* tp_traverse */
9484 0, /* tp_clear */
9485 PyUnicode_RichCompare, /* tp_richcompare */
9486 0, /* tp_weaklistoffset */
9487 unicode_iter, /* tp_iter */
9488 0, /* tp_iternext */
9489 unicode_methods, /* tp_methods */
9490 0, /* tp_members */
9491 0, /* tp_getset */
9492 &PyBaseObject_Type, /* tp_base */
9493 0, /* tp_dict */
9494 0, /* tp_descr_get */
9495 0, /* tp_descr_set */
9496 0, /* tp_dictoffset */
9497 0, /* tp_init */
9498 0, /* tp_alloc */
9499 unicode_new, /* tp_new */
9500 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009501};
9502
9503/* Initialize the Unicode implementation */
9504
Thomas Wouters78890102000-07-22 19:25:51 +00009505void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009506{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009507 int i;
9508
Thomas Wouters477c8d52006-05-27 19:21:47 +00009509 /* XXX - move this array to unicodectype.c ? */
9510 Py_UNICODE linebreak[] = {
9511 0x000A, /* LINE FEED */
9512 0x000D, /* CARRIAGE RETURN */
9513 0x001C, /* FILE SEPARATOR */
9514 0x001D, /* GROUP SEPARATOR */
9515 0x001E, /* RECORD SEPARATOR */
9516 0x0085, /* NEXT LINE */
9517 0x2028, /* LINE SEPARATOR */
9518 0x2029, /* PARAGRAPH SEPARATOR */
9519 };
9520
Fred Drakee4315f52000-05-09 19:53:39 +00009521 /* Init the implementation */
Christian Heimes2202f872008-02-06 14:31:34 +00009522 free_list = NULL;
9523 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009524 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009525 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00009526 return;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009527
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009528 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00009529 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00009530 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009531 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00009532
9533 /* initialize the linebreak bloom filter */
9534 bloom_linebreak = make_bloom_mask(
9535 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9536 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009537
9538 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009539}
9540
9541/* Finalize the Unicode implementation */
9542
Christian Heimesa156e092008-02-16 07:38:31 +00009543int
9544PyUnicode_ClearFreeList(void)
9545{
9546 int freelist_size = numfree;
9547 PyUnicodeObject *u;
9548
9549 for (u = free_list; u != NULL;) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009550 PyUnicodeObject *v = u;
9551 u = *(PyUnicodeObject **)u;
9552 if (v->str)
9553 PyObject_DEL(v->str);
9554 Py_XDECREF(v->defenc);
9555 PyObject_Del(v);
9556 numfree--;
Christian Heimesa156e092008-02-16 07:38:31 +00009557 }
9558 free_list = NULL;
9559 assert(numfree == 0);
9560 return freelist_size;
9561}
9562
Guido van Rossumd57fd912000-03-10 22:53:23 +00009563void
Thomas Wouters78890102000-07-22 19:25:51 +00009564_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009565{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009566 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009567
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00009568 Py_XDECREF(unicode_empty);
9569 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009570
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009571 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009572 if (unicode_latin1[i]) {
9573 Py_DECREF(unicode_latin1[i]);
9574 unicode_latin1[i] = NULL;
9575 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009576 }
Christian Heimesa156e092008-02-16 07:38:31 +00009577 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00009578}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009579
Walter Dörwald16807132007-05-25 13:52:07 +00009580void
9581PyUnicode_InternInPlace(PyObject **p)
9582{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009583 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
9584 PyObject *t;
9585 if (s == NULL || !PyUnicode_Check(s))
9586 Py_FatalError(
9587 "PyUnicode_InternInPlace: unicode strings only please!");
9588 /* If it's a subclass, we don't really know what putting
9589 it in the interned dict might do. */
9590 if (!PyUnicode_CheckExact(s))
9591 return;
9592 if (PyUnicode_CHECK_INTERNED(s))
9593 return;
9594 if (interned == NULL) {
9595 interned = PyDict_New();
9596 if (interned == NULL) {
9597 PyErr_Clear(); /* Don't leave an exception */
9598 return;
9599 }
9600 }
9601 /* It might be that the GetItem call fails even
9602 though the key is present in the dictionary,
9603 namely when this happens during a stack overflow. */
9604 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +00009605 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009606 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +00009607
Benjamin Peterson29060642009-01-31 22:14:21 +00009608 if (t) {
9609 Py_INCREF(t);
9610 Py_DECREF(*p);
9611 *p = t;
9612 return;
9613 }
Walter Dörwald16807132007-05-25 13:52:07 +00009614
Benjamin Peterson14339b62009-01-31 16:36:08 +00009615 PyThreadState_GET()->recursion_critical = 1;
9616 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
9617 PyErr_Clear();
9618 PyThreadState_GET()->recursion_critical = 0;
9619 return;
9620 }
9621 PyThreadState_GET()->recursion_critical = 0;
9622 /* The two references in interned are not counted by refcnt.
9623 The deallocator will take care of this */
9624 Py_REFCNT(s) -= 2;
9625 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +00009626}
9627
9628void
9629PyUnicode_InternImmortal(PyObject **p)
9630{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009631 PyUnicode_InternInPlace(p);
9632 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
9633 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
9634 Py_INCREF(*p);
9635 }
Walter Dörwald16807132007-05-25 13:52:07 +00009636}
9637
9638PyObject *
9639PyUnicode_InternFromString(const char *cp)
9640{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009641 PyObject *s = PyUnicode_FromString(cp);
9642 if (s == NULL)
9643 return NULL;
9644 PyUnicode_InternInPlace(&s);
9645 return s;
Walter Dörwald16807132007-05-25 13:52:07 +00009646}
9647
9648void _Py_ReleaseInternedUnicodeStrings(void)
9649{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009650 PyObject *keys;
9651 PyUnicodeObject *s;
9652 Py_ssize_t i, n;
9653 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009654
Benjamin Peterson14339b62009-01-31 16:36:08 +00009655 if (interned == NULL || !PyDict_Check(interned))
9656 return;
9657 keys = PyDict_Keys(interned);
9658 if (keys == NULL || !PyList_Check(keys)) {
9659 PyErr_Clear();
9660 return;
9661 }
Walter Dörwald16807132007-05-25 13:52:07 +00009662
Benjamin Peterson14339b62009-01-31 16:36:08 +00009663 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
9664 detector, interned unicode strings are not forcibly deallocated;
9665 rather, we give them their stolen references back, and then clear
9666 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +00009667
Benjamin Peterson14339b62009-01-31 16:36:08 +00009668 n = PyList_GET_SIZE(keys);
9669 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +00009670 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009671 for (i = 0; i < n; i++) {
9672 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
9673 switch (s->state) {
9674 case SSTATE_NOT_INTERNED:
9675 /* XXX Shouldn't happen */
9676 break;
9677 case SSTATE_INTERNED_IMMORTAL:
9678 Py_REFCNT(s) += 1;
9679 immortal_size += s->length;
9680 break;
9681 case SSTATE_INTERNED_MORTAL:
9682 Py_REFCNT(s) += 2;
9683 mortal_size += s->length;
9684 break;
9685 default:
9686 Py_FatalError("Inconsistent interned string state.");
9687 }
9688 s->state = SSTATE_NOT_INTERNED;
9689 }
9690 fprintf(stderr, "total size of all interned strings: "
9691 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
9692 "mortal/immortal\n", mortal_size, immortal_size);
9693 Py_DECREF(keys);
9694 PyDict_Clear(interned);
9695 Py_DECREF(interned);
9696 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +00009697}
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009698
9699
9700/********************* Unicode Iterator **************************/
9701
9702typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009703 PyObject_HEAD
9704 Py_ssize_t it_index;
9705 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009706} unicodeiterobject;
9707
9708static void
9709unicodeiter_dealloc(unicodeiterobject *it)
9710{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009711 _PyObject_GC_UNTRACK(it);
9712 Py_XDECREF(it->it_seq);
9713 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009714}
9715
9716static int
9717unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
9718{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009719 Py_VISIT(it->it_seq);
9720 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009721}
9722
9723static PyObject *
9724unicodeiter_next(unicodeiterobject *it)
9725{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009726 PyUnicodeObject *seq;
9727 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009728
Benjamin Peterson14339b62009-01-31 16:36:08 +00009729 assert(it != NULL);
9730 seq = it->it_seq;
9731 if (seq == NULL)
9732 return NULL;
9733 assert(PyUnicode_Check(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009734
Benjamin Peterson14339b62009-01-31 16:36:08 +00009735 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
9736 item = PyUnicode_FromUnicode(
Benjamin Peterson29060642009-01-31 22:14:21 +00009737 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009738 if (item != NULL)
9739 ++it->it_index;
9740 return item;
9741 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009742
Benjamin Peterson14339b62009-01-31 16:36:08 +00009743 Py_DECREF(seq);
9744 it->it_seq = NULL;
9745 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009746}
9747
9748static PyObject *
9749unicodeiter_len(unicodeiterobject *it)
9750{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009751 Py_ssize_t len = 0;
9752 if (it->it_seq)
9753 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
9754 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009755}
9756
9757PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
9758
9759static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009760 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00009761 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +00009762 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009763};
9764
9765PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009766 PyVarObject_HEAD_INIT(&PyType_Type, 0)
9767 "str_iterator", /* tp_name */
9768 sizeof(unicodeiterobject), /* tp_basicsize */
9769 0, /* tp_itemsize */
9770 /* methods */
9771 (destructor)unicodeiter_dealloc, /* tp_dealloc */
9772 0, /* tp_print */
9773 0, /* tp_getattr */
9774 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +00009775 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009776 0, /* tp_repr */
9777 0, /* tp_as_number */
9778 0, /* tp_as_sequence */
9779 0, /* tp_as_mapping */
9780 0, /* tp_hash */
9781 0, /* tp_call */
9782 0, /* tp_str */
9783 PyObject_GenericGetAttr, /* tp_getattro */
9784 0, /* tp_setattro */
9785 0, /* tp_as_buffer */
9786 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
9787 0, /* tp_doc */
9788 (traverseproc)unicodeiter_traverse, /* tp_traverse */
9789 0, /* tp_clear */
9790 0, /* tp_richcompare */
9791 0, /* tp_weaklistoffset */
9792 PyObject_SelfIter, /* tp_iter */
9793 (iternextfunc)unicodeiter_next, /* tp_iternext */
9794 unicodeiter_methods, /* tp_methods */
9795 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009796};
9797
9798static PyObject *
9799unicode_iter(PyObject *seq)
9800{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009801 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009802
Benjamin Peterson14339b62009-01-31 16:36:08 +00009803 if (!PyUnicode_Check(seq)) {
9804 PyErr_BadInternalCall();
9805 return NULL;
9806 }
9807 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
9808 if (it == NULL)
9809 return NULL;
9810 it->it_index = 0;
9811 Py_INCREF(seq);
9812 it->it_seq = (PyUnicodeObject *)seq;
9813 _PyObject_GC_TRACK(it);
9814 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009815}
9816
Martin v. Löwis5b222132007-06-10 09:51:05 +00009817size_t
9818Py_UNICODE_strlen(const Py_UNICODE *u)
9819{
9820 int res = 0;
9821 while(*u++)
9822 res++;
9823 return res;
9824}
9825
9826Py_UNICODE*
9827Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
9828{
9829 Py_UNICODE *u = s1;
9830 while ((*u++ = *s2++));
9831 return s1;
9832}
9833
9834Py_UNICODE*
9835Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
9836{
9837 Py_UNICODE *u = s1;
9838 while ((*u++ = *s2++))
9839 if (n-- == 0)
9840 break;
9841 return s1;
9842}
9843
9844int
9845Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
9846{
9847 while (*s1 && *s2 && *s1 == *s2)
9848 s1++, s2++;
9849 if (*s1 && *s2)
9850 return (*s1 < *s2) ? -1 : +1;
9851 if (*s1)
9852 return 1;
9853 if (*s2)
9854 return -1;
9855 return 0;
9856}
9857
9858Py_UNICODE*
9859Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
9860{
9861 const Py_UNICODE *p;
9862 for (p = s; *p; p++)
9863 if (*p == c)
9864 return (Py_UNICODE*)p;
9865 return NULL;
9866}
9867
9868
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009869#ifdef __cplusplus
9870}
9871#endif