/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Unicode Integration Proposal (see file Misc/unicode.txt).

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Guido van Rossumdaa251c2007-10-25 23:47:33 +000044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Guido van Rossumd57fd912000-03-10 22:53:23 +000046#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000047#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000048
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000049#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000050#include <windows.h>
51#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000052
Guido van Rossumd57fd912000-03-10 22:53:23 +000053/* Limit for the Unicode object free list */
54
Christian Heimes2202f872008-02-06 14:31:34 +000055#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000056
57/* Limit for the Unicode object free list stay alive optimization.
58
59 The implementation will keep allocated Unicode memory intact for
60 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000061 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000062
Christian Heimes2202f872008-02-06 14:31:34 +000063 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000064 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000065 malloc()-overhead) bytes of unused garbage.
66
67 Setting the limit to 0 effectively turns the feature off.
68
Guido van Rossumfd4b9572000-04-10 13:51:10 +000069 Note: This is an experimental feature ! If you get core dumps when
70 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000071
72*/
73
Guido van Rossumfd4b9572000-04-10 13:51:10 +000074#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000075
76/* Endianness switches; defaults to little endian */
77
78#ifdef WORDS_BIGENDIAN
79# define BYTEORDER_IS_BIG_ENDIAN
80#else
81# define BYTEORDER_IS_LITTLE_ENDIAN
82#endif
83
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000084/* --- Globals ------------------------------------------------------------
85
86 The globals are initialized by the _PyUnicode_Init() API and should
87 not be used before calling that API.
88
89*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
Walter Dörwald16807132007-05-25 13:52:07 +000096/* This dictionary holds all interned unicode strings. Note that references
97 to strings in this dictionary are *not* counted in the string's ob_refcnt.
98 When the interned string reaches a refcnt of 0 the string deallocation
99 function will delete the reference from this dictionary.
100
101 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000102 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000103*/
104static PyObject *interned;
105
Guido van Rossumd57fd912000-03-10 22:53:23 +0000106/* Free list for Unicode objects */
Christian Heimes2202f872008-02-06 14:31:34 +0000107static PyUnicodeObject *free_list;
108static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000109
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000110/* The empty Unicode object is shared to improve performance. */
111static PyUnicodeObject *unicode_empty;
112
113/* Single character Unicode strings in the Latin-1 range are being
114 shared as well. */
115static PyUnicodeObject *unicode_latin1[256];
116
Fred Drakee4315f52000-05-09 19:53:39 +0000117/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000118 parameter; it is fixed to "utf-8". Always use the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000119 PyUnicode_GetDefaultEncoding() API to access this global.
120
Alexandre Vassalotti3d2fd7f2007-10-16 00:26:33 +0000121 Don't forget to alter Py_FileSystemDefaultEncoding if you change the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000122 hard coded default!
123*/
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000124static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000125
/* Fast detection of the most frequent whitespace characters.

   Lookup table indexed by a 7-bit character value (0..127); an entry
   is 1 when the character is whitespace and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x0009: * CHARACTER TABULATION */
/*     case 0x000A: * LINE FEED */
/*     case 0x000B: * LINE TABULATION */
/*     case 0x000C: * FORM FEED */
/*     case 0x000D: * CARRIAGE RETURN */
    /* 0x08 - 0x0F */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x001C: * FILE SEPARATOR */
/*     case 0x001D: * GROUP SEPARATOR */
/*     case 0x001E: * RECORD SEPARATOR */
/*     case 0x001F: * UNIT SEPARATOR */
    /* 0x18 - 0x1F */
    0, 0, 0, 0, 1, 1, 1, 1,
/*     case 0x0020: * SPACE */
    /* 0x20 - 0x27 */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
156
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000157static PyObject *unicode_encode_call_errorhandler(const char *errors,
158 PyObject **errorHandler,const char *encoding, const char *reason,
159 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
160 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
161
Victor Stinner31be90b2010-04-22 19:38:16 +0000162static void raise_encode_exception(PyObject **exceptionObject,
163 const char *encoding,
164 const Py_UNICODE *unicode, Py_ssize_t size,
165 Py_ssize_t startpos, Py_ssize_t endpos,
166 const char *reason);
167
/* Fast detection of line break characters.

   Lookup table indexed by a 7-bit character value (0..127); an entry
   is 1 when the character is a line break and 0 otherwise. */
static unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    /* 0x08 - 0x0F */
    0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    /* 0x18 - 0x1F */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
195
196
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000197Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000198PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000199{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000200#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000201 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000202#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000203 /* This is actually an illegal character, so it should
204 not be passed to unichr. */
205 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000206#endif
207}
208
Thomas Wouters477c8d52006-05-27 19:21:47 +0000209/* --- Bloom Filters ----------------------------------------------------- */
210
211/* stuff to implement simple "bloom filters" for Unicode characters.
212 to keep things simple, we use a single bitmask, using the least 5
213 bits from each unicode characters as the bit index. */
214
215/* the linebreak mask is set up by Unicode_Init below */
216
Antoine Pitrouf068f942010-01-13 14:19:12 +0000217#if LONG_BIT >= 128
218#define BLOOM_WIDTH 128
219#elif LONG_BIT >= 64
220#define BLOOM_WIDTH 64
221#elif LONG_BIT >= 32
222#define BLOOM_WIDTH 32
223#else
224#error "LONG_BIT is smaller than 32"
225#endif
226
Thomas Wouters477c8d52006-05-27 19:21:47 +0000227#define BLOOM_MASK unsigned long
228
229static BLOOM_MASK bloom_linebreak;
230
Antoine Pitrouf068f942010-01-13 14:19:12 +0000231#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
232#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000233
Benjamin Peterson29060642009-01-31 22:14:21 +0000234#define BLOOM_LINEBREAK(ch) \
235 ((ch) < 128U ? ascii_linebreak[(ch)] : \
236 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000237
238Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
239{
240 /* calculate simple bloom-style bitmask for a given unicode string */
241
Antoine Pitrouf068f942010-01-13 14:19:12 +0000242 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000243 Py_ssize_t i;
244
245 mask = 0;
246 for (i = 0; i < len; i++)
Antoine Pitrouf2c54842010-01-13 08:07:53 +0000247 BLOOM_ADD(mask, ptr[i]);
Thomas Wouters477c8d52006-05-27 19:21:47 +0000248
249 return mask;
250}
251
252Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
253{
254 Py_ssize_t i;
255
256 for (i = 0; i < setlen; i++)
257 if (set[i] == chr)
258 return 1;
259
260 return 0;
261}
262
/* Membership test for chr in set: the bloom mask rejects most
   non-members cheaply, and unicode_member() confirms the rest.  The
   whole expansion is parenthesized so it can be negated or combined
   with other operators safely; chr is evaluated more than once. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                            \
    (BLOOM((mask), (chr)) && unicode_member((chr), (set), (setlen)))
265
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266/* --- Unicode Object ----------------------------------------------------- */
267
268static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000269int unicode_resize(register PyUnicodeObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +0000270 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000271{
272 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000273
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000274 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000275 if (unicode->length == length)
Benjamin Peterson29060642009-01-31 22:14:21 +0000276 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000278 /* Resizing shared object (unicode_empty or single character
279 objects) in-place is not allowed. Use PyUnicode_Resize()
280 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000281
Benjamin Peterson14339b62009-01-31 16:36:08 +0000282 if (unicode == unicode_empty ||
Benjamin Peterson29060642009-01-31 22:14:21 +0000283 (unicode->length == 1 &&
284 unicode->str[0] < 256U &&
285 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000286 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000287 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000288 return -1;
289 }
290
Thomas Wouters477c8d52006-05-27 19:21:47 +0000291 /* We allocate one more byte to make sure the string is Ux0000 terminated.
292 The overallocation is also used by fastsearch, which assumes that it's
293 safe to look at str[length] (without making any assumptions about what
294 it contains). */
295
Guido van Rossumd57fd912000-03-10 22:53:23 +0000296 oldstr = unicode->str;
Christian Heimesb186d002008-03-18 15:15:01 +0000297 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Peterson29060642009-01-31 22:14:21 +0000298 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000299 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000300 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000301 PyErr_NoMemory();
302 return -1;
303 }
304 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000305 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000306
Benjamin Peterson29060642009-01-31 22:14:21 +0000307 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000308 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000309 if (unicode->defenc) {
310 Py_DECREF(unicode->defenc);
311 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000312 }
313 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000314
Guido van Rossumd57fd912000-03-10 22:53:23 +0000315 return 0;
316}
317
318/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000319 Ux0000 terminated; some code (e.g. new_identifier)
320 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000321
322 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000323 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000324
325*/
326
327static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000328PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000329{
330 register PyUnicodeObject *unicode;
331
Thomas Wouters477c8d52006-05-27 19:21:47 +0000332 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000333 if (length == 0 && unicode_empty != NULL) {
334 Py_INCREF(unicode_empty);
335 return unicode_empty;
336 }
337
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000338 /* Ensure we won't overflow the size. */
339 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
340 return (PyUnicodeObject *)PyErr_NoMemory();
341 }
342
Guido van Rossumd57fd912000-03-10 22:53:23 +0000343 /* Unicode freelist & memory allocation */
Christian Heimes2202f872008-02-06 14:31:34 +0000344 if (free_list) {
345 unicode = free_list;
346 free_list = *(PyUnicodeObject **)unicode;
347 numfree--;
Benjamin Peterson29060642009-01-31 22:14:21 +0000348 if (unicode->str) {
349 /* Keep-Alive optimization: we only upsize the buffer,
350 never downsize it. */
351 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000352 unicode_resize(unicode, length) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000353 PyObject_DEL(unicode->str);
354 unicode->str = NULL;
355 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000356 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000357 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000358 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
359 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000360 }
361 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000362 }
363 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000364 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000365 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000366 if (unicode == NULL)
367 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +0000368 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
369 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000370 }
371
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000372 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000373 PyErr_NoMemory();
374 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000375 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000376 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000377 * the caller fails before initializing str -- unicode_resize()
378 * reads str[0], and the Keep-Alive optimization can keep memory
379 * allocated for str alive across a call to unicode_dealloc(unicode).
380 * We don't want unicode_resize to read uninitialized memory in
381 * that case.
382 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000383 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000384 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000385 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000386 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000387 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000388 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000389 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000390
Benjamin Peterson29060642009-01-31 22:14:21 +0000391 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000392 /* XXX UNREF/NEWREF interface should be more symmetrical */
393 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000394 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000395 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000396 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000397}
398
399static
Guido van Rossum9475a232001-10-05 20:51:39 +0000400void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000401{
Walter Dörwald16807132007-05-25 13:52:07 +0000402 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000403 case SSTATE_NOT_INTERNED:
404 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000405
Benjamin Peterson29060642009-01-31 22:14:21 +0000406 case SSTATE_INTERNED_MORTAL:
407 /* revive dead object temporarily for DelItem */
408 Py_REFCNT(unicode) = 3;
409 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
410 Py_FatalError(
411 "deletion of interned string failed");
412 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000413
Benjamin Peterson29060642009-01-31 22:14:21 +0000414 case SSTATE_INTERNED_IMMORTAL:
415 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000416
Benjamin Peterson29060642009-01-31 22:14:21 +0000417 default:
418 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000419 }
420
Guido van Rossum604ddf82001-12-06 20:03:56 +0000421 if (PyUnicode_CheckExact(unicode) &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000422 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000423 /* Keep-Alive optimization */
Benjamin Peterson29060642009-01-31 22:14:21 +0000424 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
425 PyObject_DEL(unicode->str);
426 unicode->str = NULL;
427 unicode->length = 0;
428 }
429 if (unicode->defenc) {
430 Py_DECREF(unicode->defenc);
431 unicode->defenc = NULL;
432 }
433 /* Add to free list */
Christian Heimes2202f872008-02-06 14:31:34 +0000434 *(PyUnicodeObject **)unicode = free_list;
435 free_list = unicode;
436 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000437 }
438 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000439 PyObject_DEL(unicode->str);
440 Py_XDECREF(unicode->defenc);
441 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000442 }
443}
444
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000445static
446int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000447{
448 register PyUnicodeObject *v;
449
450 /* Argument checks */
451 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000452 PyErr_BadInternalCall();
453 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000454 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000455 v = *unicode;
Christian Heimes90aa7642007-12-19 02:45:37 +0000456 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000457 PyErr_BadInternalCall();
458 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000459 }
460
461 /* Resizing unicode_empty and single character objects is not
462 possible since these are being shared. We simply return a fresh
463 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000464 if (v->length != length &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000465 (v == unicode_empty || v->length == 1)) {
466 PyUnicodeObject *w = _PyUnicode_New(length);
467 if (w == NULL)
468 return -1;
469 Py_UNICODE_COPY(w->str, v->str,
470 length < v->length ? length : v->length);
471 Py_DECREF(*unicode);
472 *unicode = w;
473 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000474 }
475
476 /* Note that we don't have to modify *unicode for unshared Unicode
477 objects, since we can modify them in-place. */
478 return unicode_resize(v, length);
479}
480
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000481int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
482{
483 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
484}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000485
Guido van Rossumd57fd912000-03-10 22:53:23 +0000486PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Benjamin Peterson29060642009-01-31 22:14:21 +0000487 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000488{
489 PyUnicodeObject *unicode;
490
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000491 /* If the Unicode data is known at construction time, we can apply
492 some optimizations which share commonly used objects. */
493 if (u != NULL) {
494
Benjamin Peterson29060642009-01-31 22:14:21 +0000495 /* Optimization for empty strings */
496 if (size == 0 && unicode_empty != NULL) {
497 Py_INCREF(unicode_empty);
498 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000499 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000500
501 /* Single character Unicode objects in the Latin-1 range are
502 shared when using this constructor */
503 if (size == 1 && *u < 256) {
504 unicode = unicode_latin1[*u];
505 if (!unicode) {
506 unicode = _PyUnicode_New(1);
507 if (!unicode)
508 return NULL;
509 unicode->str[0] = *u;
510 unicode_latin1[*u] = unicode;
511 }
512 Py_INCREF(unicode);
513 return (PyObject *)unicode;
514 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000515 }
Tim Petersced69f82003-09-16 20:30:58 +0000516
Guido van Rossumd57fd912000-03-10 22:53:23 +0000517 unicode = _PyUnicode_New(size);
518 if (!unicode)
519 return NULL;
520
521 /* Copy the Unicode data into the new object */
522 if (u != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +0000523 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000524
525 return (PyObject *)unicode;
526}
527
Walter Dörwaldd2034312007-05-18 16:29:38 +0000528PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000529{
530 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +0000531
Benjamin Peterson14339b62009-01-31 16:36:08 +0000532 if (size < 0) {
533 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +0000534 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +0000535 return NULL;
536 }
Christian Heimes33fe8092008-04-13 13:53:33 +0000537
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000538 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000539 some optimizations which share commonly used objects.
540 Also, this means the input must be UTF-8, so fall back to the
541 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000542 if (u != NULL) {
543
Benjamin Peterson29060642009-01-31 22:14:21 +0000544 /* Optimization for empty strings */
545 if (size == 0 && unicode_empty != NULL) {
546 Py_INCREF(unicode_empty);
547 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000548 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000549
550 /* Single characters are shared when using this constructor.
551 Restrict to ASCII, since the input must be UTF-8. */
552 if (size == 1 && Py_CHARMASK(*u) < 128) {
553 unicode = unicode_latin1[Py_CHARMASK(*u)];
554 if (!unicode) {
555 unicode = _PyUnicode_New(1);
556 if (!unicode)
557 return NULL;
558 unicode->str[0] = Py_CHARMASK(*u);
559 unicode_latin1[Py_CHARMASK(*u)] = unicode;
560 }
561 Py_INCREF(unicode);
562 return (PyObject *)unicode;
563 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000564
565 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000566 }
567
Walter Dörwald55507312007-05-18 13:12:10 +0000568 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000569 if (!unicode)
570 return NULL;
571
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000572 return (PyObject *)unicode;
573}
574
Walter Dörwaldd2034312007-05-18 16:29:38 +0000575PyObject *PyUnicode_FromString(const char *u)
576{
577 size_t size = strlen(u);
578 if (size > PY_SSIZE_T_MAX) {
579 PyErr_SetString(PyExc_OverflowError, "input too long");
580 return NULL;
581 }
582
583 return PyUnicode_FromStringAndSize(u, size);
584}
585
Guido van Rossumd57fd912000-03-10 22:53:23 +0000586#ifdef HAVE_WCHAR_H
587
Mark Dickinson081dfee2009-03-18 14:47:41 +0000588#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
589# define CONVERT_WCHAR_TO_SURROGATES
590#endif
591
592#ifdef CONVERT_WCHAR_TO_SURROGATES
593
594/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
595 to convert from UTF32 to UTF16. */
596
597PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
598 Py_ssize_t size)
599{
600 PyUnicodeObject *unicode;
601 register Py_ssize_t i;
602 Py_ssize_t alloc;
603 const wchar_t *orig_w;
604
605 if (w == NULL) {
606 if (size == 0)
607 return PyUnicode_FromStringAndSize(NULL, 0);
608 PyErr_BadInternalCall();
609 return NULL;
610 }
611
612 if (size == -1) {
613 size = wcslen(w);
614 }
615
616 alloc = size;
617 orig_w = w;
618 for (i = size; i > 0; i--) {
619 if (*w > 0xFFFF)
620 alloc++;
621 w++;
622 }
623 w = orig_w;
624 unicode = _PyUnicode_New(alloc);
625 if (!unicode)
626 return NULL;
627
628 /* Copy the wchar_t data into the new object */
629 {
630 register Py_UNICODE *u;
631 u = PyUnicode_AS_UNICODE(unicode);
632 for (i = size; i > 0; i--) {
633 if (*w > 0xFFFF) {
634 wchar_t ordinal = *w++;
635 ordinal -= 0x10000;
636 *u++ = 0xD800 | (ordinal >> 10);
637 *u++ = 0xDC00 | (ordinal & 0x3FF);
638 }
639 else
640 *u++ = *w++;
641 }
642 }
643 return (PyObject *)unicode;
644}
645
646#else
647
Guido van Rossumd57fd912000-03-10 22:53:23 +0000648PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Benjamin Peterson29060642009-01-31 22:14:21 +0000649 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000650{
651 PyUnicodeObject *unicode;
652
653 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +0000654 if (size == 0)
655 return PyUnicode_FromStringAndSize(NULL, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +0000656 PyErr_BadInternalCall();
657 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000658 }
659
Martin v. Löwis790465f2008-04-05 20:41:37 +0000660 if (size == -1) {
661 size = wcslen(w);
662 }
663
Guido van Rossumd57fd912000-03-10 22:53:23 +0000664 unicode = _PyUnicode_New(size);
665 if (!unicode)
666 return NULL;
667
668 /* Copy the wchar_t data into the new object */
669#ifdef HAVE_USABLE_WCHAR_T
670 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000671#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000672 {
Benjamin Peterson29060642009-01-31 22:14:21 +0000673 register Py_UNICODE *u;
674 register Py_ssize_t i;
675 u = PyUnicode_AS_UNICODE(unicode);
676 for (i = size; i > 0; i--)
677 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000678 }
679#endif
680
681 return (PyObject *)unicode;
682}
683
Mark Dickinson081dfee2009-03-18 14:47:41 +0000684#endif /* CONVERT_WCHAR_TO_SURROGATES */
685
686#undef CONVERT_WCHAR_TO_SURROGATES
687
Walter Dörwald346737f2007-05-31 10:44:43 +0000688static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000689makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
690 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +0000691{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000692 *fmt++ = '%';
693 if (width) {
694 if (zeropad)
695 *fmt++ = '0';
696 fmt += sprintf(fmt, "%d", width);
697 }
698 if (precision)
699 fmt += sprintf(fmt, ".%d", precision);
700 if (longflag)
701 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000702 else if (longlongflag) {
703 /* longlongflag should only ever be nonzero on machines with
704 HAVE_LONG_LONG defined */
705#ifdef HAVE_LONG_LONG
706 char *f = PY_FORMAT_LONG_LONG;
707 while (*f)
708 *fmt++ = *f++;
709#else
710 /* we shouldn't ever get here */
711 assert(0);
712 *fmt++ = 'l';
713#endif
714 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000715 else if (size_tflag) {
716 char *f = PY_FORMAT_SIZE_T;
717 while (*f)
718 *fmt++ = *f++;
719 }
720 *fmt++ = c;
721 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +0000722}
723
/* Append the NUL-terminated char string `string` at the output cursor.
   Uses two variables from the expanding scope: `copy` (a scratch
   const char pointer) and `s` (the output cursor, which is advanced past
   the appended characters).  The terminating NUL is not copied. */
#define appendstring(string) \
    do { \
        for (copy = (string); *copy; ) \
            *s++ = *copy++; \
    } while (0)

/* size of fixed-size buffer for formatting single arguments */
#define ITEM_BUFFER_LEN 21
/* maximum number of characters required for output of %ld.  21 characters
   allows for 64-bit integers (in decimal) and an optional sign. */
#define MAX_LONG_CHARS 21
/* maximum number of characters required for output of %lld.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
735
Walter Dörwaldd2034312007-05-18 16:29:38 +0000736PyObject *
737PyUnicode_FromFormatV(const char *format, va_list vargs)
738{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000739 va_list count;
740 Py_ssize_t callcount = 0;
741 PyObject **callresults = NULL;
742 PyObject **callresult = NULL;
743 Py_ssize_t n = 0;
744 int width = 0;
745 int precision = 0;
746 int zeropad;
747 const char* f;
748 Py_UNICODE *s;
749 PyObject *string;
750 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000751 char buffer[ITEM_BUFFER_LEN+1];
Benjamin Peterson14339b62009-01-31 16:36:08 +0000752 /* use abuffer instead of buffer, if we need more space
753 * (which can happen if there's a format specifier with width). */
754 char *abuffer = NULL;
755 char *realbuffer;
756 Py_ssize_t abuffersize = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000757 char fmt[61]; /* should be enough for %0width.precisionlld */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000758 const char *copy;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000759
760#ifdef VA_LIST_IS_ARRAY
Benjamin Peterson14339b62009-01-31 16:36:08 +0000761 Py_MEMCPY(count, vargs, sizeof(va_list));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000762#else
763#ifdef __va_copy
Benjamin Peterson14339b62009-01-31 16:36:08 +0000764 __va_copy(count, vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000765#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000766 count = vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000767#endif
768#endif
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000769 /* step 1: count the number of %S/%R/%A/%s format specifications
770 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
771 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
772 * result in an array) */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000773 for (f = format; *f; f++) {
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000774 if (*f == '%') {
775 if (*(f+1)=='%')
776 continue;
777 if (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A')
778 ++callcount;
779 while (ISDIGIT((unsigned)*f))
780 width = (width*10) + *f++ - '0';
781 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
782 ;
783 if (*f == 's')
784 ++callcount;
785 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000786 }
787 /* step 2: allocate memory for the results of
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000788 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000789 if (callcount) {
790 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
791 if (!callresults) {
792 PyErr_NoMemory();
793 return NULL;
794 }
795 callresult = callresults;
796 }
797 /* step 3: figure out how large a buffer we need */
798 for (f = format; *f; f++) {
799 if (*f == '%') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000800#ifdef HAVE_LONG_LONG
801 int longlongflag = 0;
802#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +0000803 const char* p = f;
804 width = 0;
805 while (ISDIGIT((unsigned)*f))
806 width = (width*10) + *f++ - '0';
807 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
808 ;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000809
Benjamin Peterson14339b62009-01-31 16:36:08 +0000810 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
811 * they don't affect the amount of space we reserve.
812 */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000813 if (*f == 'l') {
814 if (f[1] == 'd' || f[1] == 'u') {
815 ++f;
816 }
817#ifdef HAVE_LONG_LONG
818 else if (f[1] == 'l' &&
819 (f[2] == 'd' || f[2] == 'u')) {
820 longlongflag = 1;
821 f += 2;
822 }
823#endif
824 }
825 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000826 ++f;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000827 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000828
Benjamin Peterson14339b62009-01-31 16:36:08 +0000829 switch (*f) {
830 case 'c':
831 (void)va_arg(count, int);
832 /* fall through... */
833 case '%':
834 n++;
835 break;
836 case 'd': case 'u': case 'i': case 'x':
837 (void) va_arg(count, int);
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000838#ifdef HAVE_LONG_LONG
839 if (longlongflag) {
840 if (width < MAX_LONG_LONG_CHARS)
841 width = MAX_LONG_LONG_CHARS;
842 }
843 else
844#endif
845 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
846 including sign. Decimal takes the most space. This
847 isn't enough for octal. If a width is specified we
848 need more (which we allocate later). */
849 if (width < MAX_LONG_CHARS)
850 width = MAX_LONG_CHARS;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000851 n += width;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000852 /* XXX should allow for large precision here too. */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000853 if (abuffersize < width)
854 abuffersize = width;
855 break;
856 case 's':
857 {
858 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +0000859 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000860 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
861 if (!str)
862 goto fail;
863 n += PyUnicode_GET_SIZE(str);
864 /* Remember the str and switch to the next slot */
865 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000866 break;
867 }
868 case 'U':
869 {
870 PyObject *obj = va_arg(count, PyObject *);
871 assert(obj && PyUnicode_Check(obj));
872 n += PyUnicode_GET_SIZE(obj);
873 break;
874 }
875 case 'V':
876 {
877 PyObject *obj = va_arg(count, PyObject *);
878 const char *str = va_arg(count, const char *);
879 assert(obj || str);
880 assert(!obj || PyUnicode_Check(obj));
881 if (obj)
882 n += PyUnicode_GET_SIZE(obj);
883 else
884 n += strlen(str);
885 break;
886 }
887 case 'S':
888 {
889 PyObject *obj = va_arg(count, PyObject *);
890 PyObject *str;
891 assert(obj);
892 str = PyObject_Str(obj);
893 if (!str)
894 goto fail;
895 n += PyUnicode_GET_SIZE(str);
896 /* Remember the str and switch to the next slot */
897 *callresult++ = str;
898 break;
899 }
900 case 'R':
901 {
902 PyObject *obj = va_arg(count, PyObject *);
903 PyObject *repr;
904 assert(obj);
905 repr = PyObject_Repr(obj);
906 if (!repr)
907 goto fail;
908 n += PyUnicode_GET_SIZE(repr);
909 /* Remember the repr and switch to the next slot */
910 *callresult++ = repr;
911 break;
912 }
913 case 'A':
914 {
915 PyObject *obj = va_arg(count, PyObject *);
916 PyObject *ascii;
917 assert(obj);
918 ascii = PyObject_ASCII(obj);
919 if (!ascii)
920 goto fail;
921 n += PyUnicode_GET_SIZE(ascii);
922 /* Remember the repr and switch to the next slot */
923 *callresult++ = ascii;
924 break;
925 }
926 case 'p':
927 (void) va_arg(count, int);
928 /* maximum 64-bit pointer representation:
929 * 0xffffffffffffffff
930 * so 19 characters is enough.
931 * XXX I count 18 -- what's the extra for?
932 */
933 n += 19;
934 break;
935 default:
936 /* if we stumble upon an unknown
937 formatting code, copy the rest of
938 the format string to the output
939 string. (we cannot just skip the
940 code, since there's no way to know
941 what's in the argument list) */
942 n += strlen(p);
943 goto expand;
944 }
945 } else
946 n++;
947 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000948 expand:
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000949 if (abuffersize > ITEM_BUFFER_LEN) {
950 /* add 1 for sprintf's trailing null byte */
951 abuffer = PyObject_Malloc(abuffersize + 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +0000952 if (!abuffer) {
953 PyErr_NoMemory();
954 goto fail;
955 }
956 realbuffer = abuffer;
957 }
958 else
959 realbuffer = buffer;
960 /* step 4: fill the buffer */
961 /* Since we've analyzed how much space we need for the worst case,
962 we don't have to resize the string.
963 There can be no errors beyond this point. */
964 string = PyUnicode_FromUnicode(NULL, n);
965 if (!string)
966 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000967
Benjamin Peterson14339b62009-01-31 16:36:08 +0000968 s = PyUnicode_AS_UNICODE(string);
969 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000970
Benjamin Peterson14339b62009-01-31 16:36:08 +0000971 for (f = format; *f; f++) {
972 if (*f == '%') {
973 const char* p = f++;
974 int longflag = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000975 int longlongflag = 0;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000976 int size_tflag = 0;
977 zeropad = (*f == '0');
978 /* parse the width.precision part */
979 width = 0;
980 while (ISDIGIT((unsigned)*f))
981 width = (width*10) + *f++ - '0';
982 precision = 0;
983 if (*f == '.') {
984 f++;
985 while (ISDIGIT((unsigned)*f))
986 precision = (precision*10) + *f++ - '0';
987 }
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000988 /* Handle %ld, %lu, %lld and %llu. */
989 if (*f == 'l') {
990 if (f[1] == 'd' || f[1] == 'u') {
991 longflag = 1;
992 ++f;
993 }
994#ifdef HAVE_LONG_LONG
995 else if (f[1] == 'l' &&
996 (f[2] == 'd' || f[2] == 'u')) {
997 longlongflag = 1;
998 f += 2;
999 }
1000#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001001 }
1002 /* handle the size_t flag. */
1003 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
1004 size_tflag = 1;
1005 ++f;
1006 }
Walter Dörwaldd2034312007-05-18 16:29:38 +00001007
Benjamin Peterson14339b62009-01-31 16:36:08 +00001008 switch (*f) {
1009 case 'c':
1010 *s++ = va_arg(vargs, int);
1011 break;
1012 case 'd':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001013 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1014 width, precision, 'd');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001015 if (longflag)
1016 sprintf(realbuffer, fmt, va_arg(vargs, long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001017#ifdef HAVE_LONG_LONG
1018 else if (longlongflag)
1019 sprintf(realbuffer, fmt, va_arg(vargs, PY_LONG_LONG));
1020#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001021 else if (size_tflag)
1022 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
1023 else
1024 sprintf(realbuffer, fmt, va_arg(vargs, int));
1025 appendstring(realbuffer);
1026 break;
1027 case 'u':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001028 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1029 width, precision, 'u');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001030 if (longflag)
1031 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001032#ifdef HAVE_LONG_LONG
1033 else if (longlongflag)
1034 sprintf(realbuffer, fmt, va_arg(vargs,
1035 unsigned PY_LONG_LONG));
1036#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001037 else if (size_tflag)
1038 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
1039 else
1040 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
1041 appendstring(realbuffer);
1042 break;
1043 case 'i':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001044 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'i');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001045 sprintf(realbuffer, fmt, va_arg(vargs, int));
1046 appendstring(realbuffer);
1047 break;
1048 case 'x':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001049 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001050 sprintf(realbuffer, fmt, va_arg(vargs, int));
1051 appendstring(realbuffer);
1052 break;
1053 case 's':
1054 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001055 /* unused, since we already have the result */
1056 (void) va_arg(vargs, char *);
1057 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
1058 PyUnicode_GET_SIZE(*callresult));
1059 s += PyUnicode_GET_SIZE(*callresult);
1060 /* We're done with the unicode()/repr() => forget it */
1061 Py_DECREF(*callresult);
1062 /* switch to next unicode()/repr() result */
1063 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001064 break;
1065 }
1066 case 'U':
1067 {
1068 PyObject *obj = va_arg(vargs, PyObject *);
1069 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1070 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1071 s += size;
1072 break;
1073 }
1074 case 'V':
1075 {
1076 PyObject *obj = va_arg(vargs, PyObject *);
1077 const char *str = va_arg(vargs, const char *);
1078 if (obj) {
1079 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1080 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1081 s += size;
1082 } else {
1083 appendstring(str);
1084 }
1085 break;
1086 }
1087 case 'S':
1088 case 'R':
1089 {
1090 Py_UNICODE *ucopy;
1091 Py_ssize_t usize;
1092 Py_ssize_t upos;
1093 /* unused, since we already have the result */
1094 (void) va_arg(vargs, PyObject *);
1095 ucopy = PyUnicode_AS_UNICODE(*callresult);
1096 usize = PyUnicode_GET_SIZE(*callresult);
1097 for (upos = 0; upos<usize;)
1098 *s++ = ucopy[upos++];
1099 /* We're done with the unicode()/repr() => forget it */
1100 Py_DECREF(*callresult);
1101 /* switch to next unicode()/repr() result */
1102 ++callresult;
1103 break;
1104 }
1105 case 'p':
1106 sprintf(buffer, "%p", va_arg(vargs, void*));
1107 /* %p is ill-defined: ensure leading 0x. */
1108 if (buffer[1] == 'X')
1109 buffer[1] = 'x';
1110 else if (buffer[1] != 'x') {
1111 memmove(buffer+2, buffer, strlen(buffer)+1);
1112 buffer[0] = '0';
1113 buffer[1] = 'x';
1114 }
1115 appendstring(buffer);
1116 break;
1117 case '%':
1118 *s++ = '%';
1119 break;
1120 default:
1121 appendstring(p);
1122 goto end;
1123 }
1124 } else
1125 *s++ = *f;
1126 }
Walter Dörwaldd2034312007-05-18 16:29:38 +00001127
Benjamin Peterson29060642009-01-31 22:14:21 +00001128 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001129 if (callresults)
1130 PyObject_Free(callresults);
1131 if (abuffer)
1132 PyObject_Free(abuffer);
1133 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1134 return string;
Benjamin Peterson29060642009-01-31 22:14:21 +00001135 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001136 if (callresults) {
1137 PyObject **callresult2 = callresults;
1138 while (callresult2 < callresult) {
1139 Py_DECREF(*callresult2);
1140 ++callresult2;
1141 }
1142 PyObject_Free(callresults);
1143 }
1144 if (abuffer)
1145 PyObject_Free(abuffer);
1146 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001147}
1148
1149#undef appendstring
1150
1151PyObject *
1152PyUnicode_FromFormat(const char *format, ...)
1153{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001154 PyObject* ret;
1155 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001156
1157#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00001158 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001159#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001160 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001161#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001162 ret = PyUnicode_FromFormatV(format, vargs);
1163 va_end(vargs);
1164 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001165}
1166
Martin v. Löwis18e16552006-02-15 17:27:45 +00001167Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00001168 wchar_t *w,
1169 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001170{
1171 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001172 PyErr_BadInternalCall();
1173 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001174 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001175
1176 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001177 if (size > PyUnicode_GET_SIZE(unicode))
Benjamin Peterson29060642009-01-31 22:14:21 +00001178 size = PyUnicode_GET_SIZE(unicode) + 1;
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001179
Guido van Rossumd57fd912000-03-10 22:53:23 +00001180#ifdef HAVE_USABLE_WCHAR_T
1181 memcpy(w, unicode->str, size * sizeof(wchar_t));
1182#else
1183 {
Benjamin Peterson29060642009-01-31 22:14:21 +00001184 register Py_UNICODE *u;
1185 register Py_ssize_t i;
1186 u = PyUnicode_AS_UNICODE(unicode);
1187 for (i = size; i > 0; i--)
1188 *w++ = *u++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001189 }
1190#endif
1191
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001192 if (size > PyUnicode_GET_SIZE(unicode))
1193 return PyUnicode_GET_SIZE(unicode);
1194 else
Benjamin Peterson29060642009-01-31 22:14:21 +00001195 return size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001196}
1197
1198#endif
1199
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001200PyObject *PyUnicode_FromOrdinal(int ordinal)
1201{
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001202 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001203
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001204 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001205 PyErr_SetString(PyExc_ValueError,
1206 "chr() arg not in range(0x110000)");
1207 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001208 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001209
1210#ifndef Py_UNICODE_WIDE
1211 if (ordinal > 0xffff) {
1212 ordinal -= 0x10000;
1213 s[0] = 0xD800 | (ordinal >> 10);
1214 s[1] = 0xDC00 | (ordinal & 0x3FF);
1215 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001216 }
1217#endif
1218
Hye-Shik Chang40574832004-04-06 07:24:51 +00001219 s[0] = (Py_UNICODE)ordinal;
1220 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001221}
1222
Guido van Rossumd57fd912000-03-10 22:53:23 +00001223PyObject *PyUnicode_FromObject(register PyObject *obj)
1224{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001225 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00001226 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001227 if (PyUnicode_CheckExact(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001228 Py_INCREF(obj);
1229 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001230 }
1231 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001232 /* For a Unicode subtype that's not a Unicode object,
1233 return a true Unicode object with the same data. */
1234 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1235 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001236 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001237 PyErr_Format(PyExc_TypeError,
1238 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00001239 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001240 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001241}
1242
1243PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00001244 const char *encoding,
1245 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001246{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001247 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001248 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001249 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001250
Guido van Rossumd57fd912000-03-10 22:53:23 +00001251 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001252 PyErr_BadInternalCall();
1253 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001254 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001255
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001256 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001257 PyErr_SetString(PyExc_TypeError,
1258 "decoding str is not supported");
1259 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001260 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001261
1262 /* Coerce object */
Christian Heimes72b710a2008-05-26 13:28:38 +00001263 if (PyBytes_Check(obj)) {
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001264 s = PyBytes_AS_STRING(obj);
1265 len = PyBytes_GET_SIZE(obj);
1266 }
1267 else if (PyByteArray_Check(obj)) {
1268 s = PyByteArray_AS_STRING(obj);
1269 len = PyByteArray_GET_SIZE(obj);
1270 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001271 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001272 /* Overwrite the error message with something more useful in
1273 case of a TypeError. */
1274 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001275 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001276 "coercing to str: need string or buffer, "
1277 "%.80s found",
1278 Py_TYPE(obj)->tp_name);
1279 goto onError;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001280 }
Tim Petersced69f82003-09-16 20:30:58 +00001281
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001282 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001283 if (len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001284 Py_INCREF(unicode_empty);
1285 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001286 }
Tim Petersced69f82003-09-16 20:30:58 +00001287 else
Benjamin Peterson29060642009-01-31 22:14:21 +00001288 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001289
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001290 return v;
1291
Benjamin Peterson29060642009-01-31 22:14:21 +00001292 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001293 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001294}
1295
1296PyObject *PyUnicode_Decode(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001297 Py_ssize_t size,
1298 const char *encoding,
1299 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001300{
1301 PyObject *buffer = NULL, *unicode;
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001302 Py_buffer info;
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001303 char lower[20]; /* Enough for any encoding name we recognize */
1304 char *l;
1305 const char *e;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001306
1307 if (encoding == NULL)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001308 encoding = PyUnicode_GetDefaultEncoding();
1309
1310 /* Convert encoding to lower case and replace '_' with '-' in order to
1311 catch e.g. UTF_8 */
1312 e = encoding;
1313 l = lower;
1314 while (*e && l < &lower[(sizeof lower) - 2]) {
1315 if (ISUPPER(*e)) {
1316 *l++ = TOLOWER(*e++);
1317 }
1318 else if (*e == '_') {
1319 *l++ = '-';
1320 e++;
1321 }
1322 else {
1323 *l++ = *e++;
1324 }
1325 }
1326 *l = '\0';
Fred Drakee4315f52000-05-09 19:53:39 +00001327
1328 /* Shortcuts for common default encodings */
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001329 if (strcmp(lower, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001330 return PyUnicode_DecodeUTF8(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001331 else if ((strcmp(lower, "latin-1") == 0) ||
1332 (strcmp(lower, "iso-8859-1") == 0))
Fred Drakee4315f52000-05-09 19:53:39 +00001333 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001334#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001335 else if (strcmp(lower, "mbcs") == 0)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001336 return PyUnicode_DecodeMBCS(s, size, errors);
1337#endif
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001338 else if (strcmp(lower, "ascii") == 0)
Fred Drakee4315f52000-05-09 19:53:39 +00001339 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001340 else if (strcmp(lower, "utf-16") == 0)
1341 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1342 else if (strcmp(lower, "utf-32") == 0)
1343 return PyUnicode_DecodeUTF32(s, size, errors, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001344
1345 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001346 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00001347 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001348 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00001349 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001350 if (buffer == NULL)
1351 goto onError;
1352 unicode = PyCodec_Decode(buffer, encoding, errors);
1353 if (unicode == NULL)
1354 goto onError;
1355 if (!PyUnicode_Check(unicode)) {
1356 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001357 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00001358 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001359 Py_DECREF(unicode);
1360 goto onError;
1361 }
1362 Py_DECREF(buffer);
1363 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001364
Benjamin Peterson29060642009-01-31 22:14:21 +00001365 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001366 Py_XDECREF(buffer);
1367 return NULL;
1368}
1369
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001370PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1371 const char *encoding,
1372 const char *errors)
1373{
1374 PyObject *v;
1375
1376 if (!PyUnicode_Check(unicode)) {
1377 PyErr_BadArgument();
1378 goto onError;
1379 }
1380
1381 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001382 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001383
1384 /* Decode via the codec registry */
1385 v = PyCodec_Decode(unicode, encoding, errors);
1386 if (v == NULL)
1387 goto onError;
1388 return v;
1389
Benjamin Peterson29060642009-01-31 22:14:21 +00001390 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001391 return NULL;
1392}
1393
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001394PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode,
1395 const char *encoding,
1396 const char *errors)
1397{
1398 PyObject *v;
1399
1400 if (!PyUnicode_Check(unicode)) {
1401 PyErr_BadArgument();
1402 goto onError;
1403 }
1404
1405 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001406 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001407
1408 /* Decode via the codec registry */
1409 v = PyCodec_Decode(unicode, encoding, errors);
1410 if (v == NULL)
1411 goto onError;
1412 if (!PyUnicode_Check(v)) {
1413 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001414 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001415 Py_TYPE(v)->tp_name);
1416 Py_DECREF(v);
1417 goto onError;
1418 }
1419 return v;
1420
Benjamin Peterson29060642009-01-31 22:14:21 +00001421 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001422 return NULL;
1423}
1424
Guido van Rossumd57fd912000-03-10 22:53:23 +00001425PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001426 Py_ssize_t size,
1427 const char *encoding,
1428 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001429{
1430 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001431
Guido van Rossumd57fd912000-03-10 22:53:23 +00001432 unicode = PyUnicode_FromUnicode(s, size);
1433 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001434 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001435 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1436 Py_DECREF(unicode);
1437 return v;
1438}
1439
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001440PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1441 const char *encoding,
1442 const char *errors)
1443{
1444 PyObject *v;
1445
1446 if (!PyUnicode_Check(unicode)) {
1447 PyErr_BadArgument();
1448 goto onError;
1449 }
1450
1451 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001452 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001453
1454 /* Encode via the codec registry */
1455 v = PyCodec_Encode(unicode, encoding, errors);
1456 if (v == NULL)
1457 goto onError;
1458 return v;
1459
Benjamin Peterson29060642009-01-31 22:14:21 +00001460 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001461 return NULL;
1462}
1463
Guido van Rossumd57fd912000-03-10 22:53:23 +00001464PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1465 const char *encoding,
1466 const char *errors)
1467{
1468 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001469
Guido van Rossumd57fd912000-03-10 22:53:23 +00001470 if (!PyUnicode_Check(unicode)) {
1471 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001472 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001473 }
Fred Drakee4315f52000-05-09 19:53:39 +00001474
Tim Petersced69f82003-09-16 20:30:58 +00001475 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001476 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001477
1478 /* Shortcuts for common default encodings */
1479 if (errors == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001480 if (strcmp(encoding, "utf-8") == 0)
1481 return PyUnicode_AsUTF8String(unicode);
1482 else if (strcmp(encoding, "latin-1") == 0)
1483 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001484#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Benjamin Peterson29060642009-01-31 22:14:21 +00001485 else if (strcmp(encoding, "mbcs") == 0)
1486 return PyUnicode_AsMBCSString(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001487#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00001488 else if (strcmp(encoding, "ascii") == 0)
1489 return PyUnicode_AsASCIIString(unicode);
Christian Heimes6a27efa2008-10-30 21:48:26 +00001490 /* During bootstrap, we may need to find the encodings
1491 package, to load the file system encoding, and require the
1492 file system encoding in order to load the encodings
1493 package.
1494
1495 Break out of this dependency by assuming that the path to
1496 the encodings module is ASCII-only. XXX could try wcstombs
1497 instead, if the file system encoding is the locale's
1498 encoding. */
1499 else if (Py_FileSystemDefaultEncoding &&
1500 strcmp(encoding, Py_FileSystemDefaultEncoding) == 0 &&
1501 !PyThreadState_GET()->interp->codecs_initialized)
Benjamin Peterson29060642009-01-31 22:14:21 +00001502 return PyUnicode_AsASCIIString(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001503 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001504
1505 /* Encode via the codec registry */
1506 v = PyCodec_Encode(unicode, encoding, errors);
1507 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001508 return NULL;
1509
1510 /* The normal path */
1511 if (PyBytes_Check(v))
1512 return v;
1513
1514 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001515 if (PyByteArray_Check(v)) {
1516 char msg[100];
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001517 PyObject *b;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001518 PyOS_snprintf(msg, sizeof(msg),
1519 "encoder %s returned buffer instead of bytes",
1520 encoding);
1521 if (PyErr_WarnEx(PyExc_RuntimeWarning, msg, 1) < 0) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001522 Py_DECREF(v);
1523 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001524 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001525
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001526 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1527 Py_DECREF(v);
1528 return b;
1529 }
1530
1531 PyErr_Format(PyExc_TypeError,
1532 "encoder did not return a bytes object (type=%.400s)",
1533 Py_TYPE(v)->tp_name);
1534 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001535 return NULL;
1536}
1537
1538PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode,
1539 const char *encoding,
1540 const char *errors)
1541{
1542 PyObject *v;
1543
1544 if (!PyUnicode_Check(unicode)) {
1545 PyErr_BadArgument();
1546 goto onError;
1547 }
1548
1549 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001550 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001551
1552 /* Encode via the codec registry */
1553 v = PyCodec_Encode(unicode, encoding, errors);
1554 if (v == NULL)
1555 goto onError;
1556 if (!PyUnicode_Check(v)) {
1557 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001558 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001559 Py_TYPE(v)->tp_name);
1560 Py_DECREF(v);
1561 goto onError;
1562 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001563 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001564
Benjamin Peterson29060642009-01-31 22:14:21 +00001565 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001566 return NULL;
1567}
1568
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001569PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00001570 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001571{
1572 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001573 if (v)
1574 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001575 if (errors != NULL)
1576 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum98297ee2007-11-06 21:34:58 +00001577 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Guido van Rossum06610092007-08-16 21:02:22 +00001578 PyUnicode_GET_SIZE(unicode),
1579 NULL);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001580 if (!v)
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001581 return NULL;
Guido van Rossume7a0d392007-07-12 07:53:00 +00001582 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001583 return v;
1584}
1585
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001586PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00001587PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001588 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00001589 return PyUnicode_DecodeFSDefaultAndSize(s, size);
1590}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001591
Christian Heimes5894ba72007-11-04 11:43:14 +00001592PyObject*
1593PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1594{
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001595 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1596 can be undefined. If it is case, decode using UTF-8. The following assumes
1597 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1598 bootstrapping process where the codecs aren't ready yet.
1599 */
1600 if (Py_FileSystemDefaultEncoding) {
1601#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Christian Heimes5894ba72007-11-04 11:43:14 +00001602 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0) {
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00001603 return PyUnicode_DecodeMBCS(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001604 }
1605#elif defined(__APPLE__)
Christian Heimes5894ba72007-11-04 11:43:14 +00001606 if (strcmp(Py_FileSystemDefaultEncoding, "utf-8") == 0) {
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00001607 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001608 }
1609#endif
1610 return PyUnicode_Decode(s, size,
1611 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00001612 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001613 }
1614 else {
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00001615 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001616 }
1617}
1618
Martin v. Löwis011e8422009-05-05 04:43:17 +00001619/* Convert the argument to a bytes object, according to the file
Gregory P. Smithcc47d8c2010-02-27 08:33:11 +00001620 system encoding. The addr param must be a PyObject**.
1621 This is designed to be used with "O&" in PyArg_Parse APIs. */
Martin v. Löwis011e8422009-05-05 04:43:17 +00001622
1623int
1624PyUnicode_FSConverter(PyObject* arg, void* addr)
1625{
1626 PyObject *output = NULL;
1627 Py_ssize_t size;
1628 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001629 if (arg == NULL) {
1630 Py_DECREF(*(PyObject**)addr);
1631 return 1;
1632 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00001633 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00001634 output = arg;
1635 Py_INCREF(output);
1636 }
1637 else {
1638 arg = PyUnicode_FromObject(arg);
1639 if (!arg)
1640 return 0;
Victor Stinner0ea2a462010-04-30 00:22:08 +00001641 output = PyUnicode_AsEncodedObject(arg,
Martin v. Löwis011e8422009-05-05 04:43:17 +00001642 Py_FileSystemDefaultEncoding,
Martin v. Löwis43c57782009-05-10 08:15:24 +00001643 "surrogateescape");
Martin v. Löwis011e8422009-05-05 04:43:17 +00001644 Py_DECREF(arg);
1645 if (!output)
1646 return 0;
1647 if (!PyBytes_Check(output)) {
1648 Py_DECREF(output);
1649 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
1650 return 0;
1651 }
1652 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00001653 size = PyBytes_GET_SIZE(output);
1654 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001655 if (size != strlen(data)) {
1656 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1657 Py_DECREF(output);
1658 return 0;
1659 }
1660 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001661 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001662}
1663
1664
Martin v. Löwis5b222132007-06-10 09:51:05 +00001665char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001666_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001667{
Christian Heimesf3863112007-11-22 07:46:41 +00001668 PyObject *bytes;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001669 if (!PyUnicode_Check(unicode)) {
1670 PyErr_BadArgument();
1671 return NULL;
1672 }
Christian Heimesf3863112007-11-22 07:46:41 +00001673 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1674 if (bytes == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001675 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001676 if (psize != NULL)
Christian Heimes72b710a2008-05-26 13:28:38 +00001677 *psize = PyBytes_GET_SIZE(bytes);
1678 return PyBytes_AS_STRING(bytes);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001679}
1680
1681char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001682_PyUnicode_AsString(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001683{
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001684 return _PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001685}
1686
Guido van Rossumd57fd912000-03-10 22:53:23 +00001687Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1688{
1689 if (!PyUnicode_Check(unicode)) {
1690 PyErr_BadArgument();
1691 goto onError;
1692 }
1693 return PyUnicode_AS_UNICODE(unicode);
1694
Benjamin Peterson29060642009-01-31 22:14:21 +00001695 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001696 return NULL;
1697}
1698
Martin v. Löwis18e16552006-02-15 17:27:45 +00001699Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001700{
1701 if (!PyUnicode_Check(unicode)) {
1702 PyErr_BadArgument();
1703 goto onError;
1704 }
1705 return PyUnicode_GET_SIZE(unicode);
1706
Benjamin Peterson29060642009-01-31 22:14:21 +00001707 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001708 return -1;
1709}
1710
Thomas Wouters78890102000-07-22 19:25:51 +00001711const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001712{
1713 return unicode_default_encoding;
1714}
1715
1716int PyUnicode_SetDefaultEncoding(const char *encoding)
1717{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001718 if (strcmp(encoding, unicode_default_encoding) != 0) {
1719 PyErr_Format(PyExc_ValueError,
1720 "Can only set default encoding to %s",
1721 unicode_default_encoding);
1722 return -1;
1723 }
Fred Drakee4315f52000-05-09 19:53:39 +00001724 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +00001725}
1726
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001727/* error handling callback helper:
1728 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001729 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001730 and adjust various state variables.
1731 return 0 on success, -1 on error
1732*/
1733
1734static
1735int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00001736 const char *encoding, const char *reason,
1737 const char **input, const char **inend, Py_ssize_t *startinpos,
1738 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1739 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001740{
Benjamin Peterson142957c2008-07-04 19:55:29 +00001741 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001742
1743 PyObject *restuple = NULL;
1744 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001745 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001746 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001747 Py_ssize_t requiredsize;
1748 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001749 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001750 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001751 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001752 int res = -1;
1753
1754 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001755 *errorHandler = PyCodec_LookupError(errors);
1756 if (*errorHandler == NULL)
1757 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001758 }
1759
1760 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00001761 *exceptionObject = PyUnicodeDecodeError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00001762 encoding, *input, *inend-*input, *startinpos, *endinpos, reason);
1763 if (*exceptionObject == NULL)
1764 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001765 }
1766 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00001767 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1768 goto onError;
1769 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1770 goto onError;
1771 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1772 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001773 }
1774
1775 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1776 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001777 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001778 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00001779 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00001780 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001781 }
1782 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00001783 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001784
1785 /* Copy back the bytes variables, which might have been modified by the
1786 callback */
1787 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
1788 if (!inputobj)
1789 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00001790 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001791 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00001792 }
Christian Heimes72b710a2008-05-26 13:28:38 +00001793 *input = PyBytes_AS_STRING(inputobj);
1794 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001795 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00001796 /* we can DECREF safely, as the exception has another reference,
1797 so the object won't go away. */
1798 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001799
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001800 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001801 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001802 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001803 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1804 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001805 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001806
1807 /* need more space? (at least enough for what we
1808 have+the replacement+the rest of the string (starting
1809 at the new input position), so we won't have to check space
1810 when there are no errors in the rest of the string) */
1811 repptr = PyUnicode_AS_UNICODE(repunicode);
1812 repsize = PyUnicode_GET_SIZE(repunicode);
1813 requiredsize = *outpos + repsize + insize-newpos;
1814 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001815 if (requiredsize<2*outsize)
1816 requiredsize = 2*outsize;
1817 if (_PyUnicode_Resize(output, requiredsize) < 0)
1818 goto onError;
1819 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001820 }
1821 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001822 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001823 Py_UNICODE_COPY(*outptr, repptr, repsize);
1824 *outptr += repsize;
1825 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001826
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001827 /* we made it! */
1828 res = 0;
1829
Benjamin Peterson29060642009-01-31 22:14:21 +00001830 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001831 Py_XDECREF(restuple);
1832 return res;
1833}
1834
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001835/* --- UTF-7 Codec -------------------------------------------------------- */
1836
Antoine Pitrou244651a2009-05-04 18:56:13 +00001837/* See RFC2152 for details. We encode conservatively and decode liberally. */
1838
/* Helper macros for the base-64 alphabet used inside UTF-7 shift
   sequences.  Each macro evaluates its argument more than once, so
   pass only side-effect free expressions. */

/* True if c is one of the 64 base-64 characters. */

#define IS_BASE64(c)                    \
    ((c) == '+' || (c) == '/' ||        \
     ((c) >= '0' && (c) <= '9') ||      \
     ((c) >= 'a' && (c) <= 'z') ||      \
     ((c) >= 'A' && (c) <= 'Z'))

/* Value (0..63) of the base-64 character c.  Only meaningful when
   IS_BASE64(c) holds; any other character yields 63. */

#define FROM_BASE64(c)                                  \
    ((c) == '+' ? 62 :                                  \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : 63)

/* Base-64 character for the low 6 bits of n. */

#define TO_BASE64(n)                    \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"       \
     "abcdefghijklmnopqrstuvwxyz"       \
     "0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: true if the byte c found in UTF-7 input stands for
 * itself.  Decoding is permissive: every ASCII byte does, except '+',
 * which starts a base-64 section. */

#define DECODE_DIRECT(c)                \
    ((c) != '+' && (c) <= 127)
1869
/* Classification of the 128 ASCII characters for the UTF-7 encoder
 * (see RFC2152).  The value stored for each character is:
 *   0 : "Set D"
 *       alphanumeric and '(),-./:?
 *   1 : "Set O"
 *       !"#$%&*;<=>@[]^_`{|}
 *   2 : "whitespace"
 *       ht nl cr sp
 *   3 : special (must be base64 encoded)
 *       everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

static
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: true if character c may be written to the output as
 * itself.  Set D characters always qualify; Set O characters only when
 * directO is true and whitespace only when directWS is true.  NUL and
 * anything outside ASCII never qualify.  RFC2152 leaves both choices to
 * the application, hence the two flags. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) > 0 && (c) < 128 &&                            \
     (utf7_category[(c)] == 0 ||                        \
      ((directO) && utf7_category[(c)] == 1) ||         \
      ((directWS) && utf7_category[(c)] == 2)))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001915
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001916PyObject *PyUnicode_DecodeUTF7(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001917 Py_ssize_t size,
1918 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001919{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001920 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1921}
1922
Antoine Pitrou244651a2009-05-04 18:56:13 +00001923/* The decoder. The only state we preserve is our read position,
1924 * i.e. how many characters we have consumed. So if we end in the
1925 * middle of a shift sequence we have to back off the read position
1926 * and the output to the beginning of the sequence, otherwise we lose
1927 * all the shift state (seen bits, number of bits seen, high
1928 * surrogate). */
1929
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001930PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001931 Py_ssize_t size,
1932 const char *errors,
1933 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001934{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001935 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001936 Py_ssize_t startinpos;
1937 Py_ssize_t endinpos;
1938 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001939 const char *e;
1940 PyUnicodeObject *unicode;
1941 Py_UNICODE *p;
1942 const char *errmsg = "";
1943 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00001944 Py_UNICODE *shiftOutStart;
1945 unsigned int base64bits = 0;
1946 unsigned long base64buffer = 0;
1947 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001948 PyObject *errorHandler = NULL;
1949 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001950
1951 unicode = _PyUnicode_New(size);
1952 if (!unicode)
1953 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001954 if (size == 0) {
1955 if (consumed)
1956 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001957 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001958 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001959
1960 p = unicode->str;
Antoine Pitrou244651a2009-05-04 18:56:13 +00001961 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001962 e = s + size;
1963
1964 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001965 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00001966 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00001967 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001968
Antoine Pitrou244651a2009-05-04 18:56:13 +00001969 if (inShift) { /* in a base-64 section */
1970 if (IS_BASE64(ch)) { /* consume a base-64 character */
1971 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
1972 base64bits += 6;
1973 s++;
1974 if (base64bits >= 16) {
1975 /* we have enough bits for a UTF-16 value */
1976 Py_UNICODE outCh = (Py_UNICODE)
1977 (base64buffer >> (base64bits-16));
1978 base64bits -= 16;
1979 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
1980 if (surrogate) {
1981 /* expecting a second surrogate */
1982 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
1983#ifdef Py_UNICODE_WIDE
1984 *p++ = (((surrogate & 0x3FF)<<10)
1985 | (outCh & 0x3FF)) + 0x10000;
1986#else
1987 *p++ = surrogate;
1988 *p++ = outCh;
1989#endif
1990 surrogate = 0;
1991 }
1992 else {
1993 surrogate = 0;
1994 errmsg = "second surrogate missing";
1995 goto utf7Error;
1996 }
1997 }
1998 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
1999 /* first surrogate */
2000 surrogate = outCh;
2001 }
2002 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2003 errmsg = "unexpected second surrogate";
2004 goto utf7Error;
2005 }
2006 else {
2007 *p++ = outCh;
2008 }
2009 }
2010 }
2011 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002012 inShift = 0;
2013 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002014 if (surrogate) {
2015 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00002016 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002017 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002018 if (base64bits > 0) { /* left-over bits */
2019 if (base64bits >= 6) {
2020 /* We've seen at least one base-64 character */
2021 errmsg = "partial character in shift sequence";
2022 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002023 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002024 else {
2025 /* Some bits remain; they should be zero */
2026 if (base64buffer != 0) {
2027 errmsg = "non-zero padding bits in shift sequence";
2028 goto utf7Error;
2029 }
2030 }
2031 }
2032 if (ch != '-') {
2033 /* '-' is absorbed; other terminating
2034 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002035 *p++ = ch;
2036 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002037 }
2038 }
2039 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002040 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002041 s++; /* consume '+' */
2042 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002043 s++;
2044 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00002045 }
2046 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002047 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002048 shiftOutStart = p;
2049 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002050 }
2051 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002052 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002053 *p++ = ch;
2054 s++;
2055 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002056 else {
2057 startinpos = s-starts;
2058 s++;
2059 errmsg = "unexpected special character";
2060 goto utf7Error;
2061 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002062 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002063utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002064 outpos = p-PyUnicode_AS_UNICODE(unicode);
2065 endinpos = s-starts;
2066 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00002067 errors, &errorHandler,
2068 "utf7", errmsg,
2069 &starts, &e, &startinpos, &endinpos, &exc, &s,
2070 &unicode, &outpos, &p))
2071 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002072 }
2073
Antoine Pitrou244651a2009-05-04 18:56:13 +00002074 /* end of string */
2075
2076 if (inShift && !consumed) { /* in shift sequence, no more to follow */
2077 /* if we're in an inconsistent state, that's an error */
2078 if (surrogate ||
2079 (base64bits >= 6) ||
2080 (base64bits > 0 && base64buffer != 0)) {
2081 outpos = p-PyUnicode_AS_UNICODE(unicode);
2082 endinpos = size;
2083 if (unicode_decode_call_errorhandler(
2084 errors, &errorHandler,
2085 "utf7", "unterminated shift sequence",
2086 &starts, &e, &startinpos, &endinpos, &exc, &s,
2087 &unicode, &outpos, &p))
2088 goto onError;
2089 if (s < e)
2090 goto restart;
2091 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002092 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002093
2094 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002095 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00002096 if (inShift) {
2097 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002098 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002099 }
2100 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002101 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002102 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002103 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002104
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002105 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002106 goto onError;
2107
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002108 Py_XDECREF(errorHandler);
2109 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002110 return (PyObject *)unicode;
2111
Benjamin Peterson29060642009-01-31 22:14:21 +00002112 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002113 Py_XDECREF(errorHandler);
2114 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002115 Py_DECREF(unicode);
2116 return NULL;
2117}
2118
2119
2120PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002121 Py_ssize_t size,
Antoine Pitrou244651a2009-05-04 18:56:13 +00002122 int base64SetO,
2123 int base64WhiteSpace,
Benjamin Peterson29060642009-01-31 22:14:21 +00002124 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002125{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002126 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002127 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002128 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002129 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002130 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002131 unsigned int base64bits = 0;
2132 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002133 char * out;
2134 char * start;
2135
2136 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002137 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002138
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002139 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002140 return PyErr_NoMemory();
2141
Antoine Pitrou244651a2009-05-04 18:56:13 +00002142 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002143 if (v == NULL)
2144 return NULL;
2145
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002146 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002147 for (;i < size; ++i) {
2148 Py_UNICODE ch = s[i];
2149
Antoine Pitrou244651a2009-05-04 18:56:13 +00002150 if (inShift) {
2151 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2152 /* shifting out */
2153 if (base64bits) { /* output remaining bits */
2154 *out++ = TO_BASE64(base64buffer << (6-base64bits));
2155 base64buffer = 0;
2156 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002157 }
2158 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002159 /* Characters not in the BASE64 set implicitly unshift the sequence
2160 so no '-' is required, except if the character is itself a '-' */
2161 if (IS_BASE64(ch) || ch == '-') {
2162 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002163 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002164 *out++ = (char) ch;
2165 }
2166 else {
2167 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00002168 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002169 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002170 else { /* not in a shift sequence */
2171 if (ch == '+') {
2172 *out++ = '+';
2173 *out++ = '-';
2174 }
2175 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2176 *out++ = (char) ch;
2177 }
2178 else {
2179 *out++ = '+';
2180 inShift = 1;
2181 goto encode_char;
2182 }
2183 }
2184 continue;
2185encode_char:
2186#ifdef Py_UNICODE_WIDE
2187 if (ch >= 0x10000) {
2188 /* code first surrogate */
2189 base64bits += 16;
2190 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
2191 while (base64bits >= 6) {
2192 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2193 base64bits -= 6;
2194 }
2195 /* prepare second surrogate */
2196 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
2197 }
2198#endif
2199 base64bits += 16;
2200 base64buffer = (base64buffer << 16) | ch;
2201 while (base64bits >= 6) {
2202 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2203 base64bits -= 6;
2204 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00002205 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002206 if (base64bits)
2207 *out++= TO_BASE64(base64buffer << (6-base64bits) );
2208 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002209 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002210 if (_PyBytes_Resize(&v, out - start) < 0)
2211 return NULL;
2212 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002213}
2214
Antoine Pitrou244651a2009-05-04 18:56:13 +00002215#undef IS_BASE64
2216#undef FROM_BASE64
2217#undef TO_BASE64
2218#undef DECODE_DIRECT
2219#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002220
Guido van Rossumd57fd912000-03-10 22:53:23 +00002221/* --- UTF-8 Codec -------------------------------------------------------- */
2222
/* Sequence length implied by each possible UTF-8 lead byte, indexed by
   the byte value.  A zero entry marks a byte that is not a legal
   prefix.  See RFC 2279 for details. */
static
char utf8_code_length[256] = {
    /* 0x00 - 0x7F: one-byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xBF: illegal as a prefix byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xDF: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xF7: four, 0xF8 - 0xFB: five, 0xFC - 0xFD: six,
       0xFE - 0xFF: illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
2244
Guido van Rossumd57fd912000-03-10 22:53:23 +00002245PyObject *PyUnicode_DecodeUTF8(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002246 Py_ssize_t size,
2247 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002248{
Walter Dörwald69652032004-09-07 20:24:22 +00002249 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
2250}
2251
Antoine Pitrouab868312009-01-10 15:40:25 +00002252/* Mask to check or force alignment of a pointer to C 'long' boundaries */
2253#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
2254
2255/* Mask to quickly check whether a C 'long' contains a
2256 non-ASCII, UTF8-encoded char. */
2257#if (SIZEOF_LONG == 8)
2258# define ASCII_CHAR_MASK 0x8080808080808080L
2259#elif (SIZEOF_LONG == 4)
2260# define ASCII_CHAR_MASK 0x80808080L
2261#else
2262# error C 'long' size should be either 4 or 8!
2263#endif
2264
Walter Dörwald69652032004-09-07 20:24:22 +00002265PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002266 Py_ssize_t size,
2267 const char *errors,
2268 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002269{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002270 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002271 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002272 Py_ssize_t startinpos;
2273 Py_ssize_t endinpos;
2274 Py_ssize_t outpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00002275 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002276 PyUnicodeObject *unicode;
2277 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002278 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002279 PyObject *errorHandler = NULL;
2280 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002281
2282 /* Note: size will always be longer than the resulting Unicode
2283 character count */
2284 unicode = _PyUnicode_New(size);
2285 if (!unicode)
2286 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00002287 if (size == 0) {
2288 if (consumed)
2289 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002290 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00002291 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002292
2293 /* Unpack UTF-8 encoded data */
2294 p = unicode->str;
2295 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00002296 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002297
2298 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002299 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002300
2301 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00002302 /* Fast path for runs of ASCII characters. Given that common UTF-8
2303 input will consist of an overwhelming majority of ASCII
2304 characters, we try to optimize for this case by checking
2305 as many characters as a C 'long' can contain.
2306 First, check if we can do an aligned read, as most CPUs have
2307 a penalty for unaligned reads.
2308 */
2309 if (!((size_t) s & LONG_PTR_MASK)) {
2310 /* Help register allocation */
2311 register const char *_s = s;
2312 register Py_UNICODE *_p = p;
2313 while (_s < aligned_end) {
2314 /* Read a whole long at a time (either 4 or 8 bytes),
2315 and do a fast unrolled copy if it only contains ASCII
2316 characters. */
2317 unsigned long data = *(unsigned long *) _s;
2318 if (data & ASCII_CHAR_MASK)
2319 break;
2320 _p[0] = (unsigned char) _s[0];
2321 _p[1] = (unsigned char) _s[1];
2322 _p[2] = (unsigned char) _s[2];
2323 _p[3] = (unsigned char) _s[3];
2324#if (SIZEOF_LONG == 8)
2325 _p[4] = (unsigned char) _s[4];
2326 _p[5] = (unsigned char) _s[5];
2327 _p[6] = (unsigned char) _s[6];
2328 _p[7] = (unsigned char) _s[7];
2329#endif
2330 _s += SIZEOF_LONG;
2331 _p += SIZEOF_LONG;
2332 }
2333 s = _s;
2334 p = _p;
2335 if (s == e)
2336 break;
2337 ch = (unsigned char)*s;
2338 }
2339 }
2340
2341 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002342 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002343 s++;
2344 continue;
2345 }
2346
2347 n = utf8_code_length[ch];
2348
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002349 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002350 if (consumed)
2351 break;
2352 else {
2353 errmsg = "unexpected end of data";
2354 startinpos = s-starts;
2355 endinpos = size;
2356 goto utf8Error;
2357 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002358 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002359
2360 switch (n) {
2361
2362 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002363 errmsg = "unexpected code byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002364 startinpos = s-starts;
2365 endinpos = startinpos+1;
2366 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002367
2368 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002369 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00002370 startinpos = s-starts;
2371 endinpos = startinpos+1;
2372 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002373
2374 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002375 if ((s[1] & 0xc0) != 0x80) {
2376 errmsg = "invalid data";
Benjamin Peterson29060642009-01-31 22:14:21 +00002377 startinpos = s-starts;
2378 endinpos = startinpos+2;
2379 goto utf8Error;
2380 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002381 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002382 if (ch < 0x80) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002383 startinpos = s-starts;
2384 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002385 errmsg = "illegal encoding";
Benjamin Peterson29060642009-01-31 22:14:21 +00002386 goto utf8Error;
2387 }
2388 else
2389 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002390 break;
2391
2392 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00002393 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002394 (s[2] & 0xc0) != 0x80) {
2395 errmsg = "invalid data";
Benjamin Peterson29060642009-01-31 22:14:21 +00002396 startinpos = s-starts;
2397 endinpos = startinpos+3;
2398 goto utf8Error;
2399 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002400 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002401 if (ch < 0x0800 || (ch >= 0xd800 && ch <= 0xDFFF)) {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002402 errmsg = "illegal encoding";
Benjamin Peterson29060642009-01-31 22:14:21 +00002403 startinpos = s-starts;
2404 endinpos = startinpos+3;
2405 goto utf8Error;
2406 }
2407 else
2408 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002409 break;
2410
2411 case 4:
2412 if ((s[1] & 0xc0) != 0x80 ||
2413 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002414 (s[3] & 0xc0) != 0x80) {
2415 errmsg = "invalid data";
Benjamin Peterson29060642009-01-31 22:14:21 +00002416 startinpos = s-starts;
2417 endinpos = startinpos+4;
2418 goto utf8Error;
2419 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002420 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Benjamin Peterson29060642009-01-31 22:14:21 +00002421 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002422 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002423 if ((ch < 0x10000) /* minimum value allowed for 4
Benjamin Peterson29060642009-01-31 22:14:21 +00002424 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002425 || (ch > 0x10ffff)) /* maximum value allowed for
Benjamin Peterson29060642009-01-31 22:14:21 +00002426 UTF-16 */
2427 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002428 errmsg = "illegal encoding";
Benjamin Peterson29060642009-01-31 22:14:21 +00002429 startinpos = s-starts;
2430 endinpos = startinpos+4;
2431 goto utf8Error;
2432 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00002433#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002434 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002435#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002436 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002437
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002438 /* translate from 10000..10FFFF to 0..FFFF */
2439 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002440
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002441 /* high surrogate = top 10 bits added to D800 */
2442 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002443
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002444 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002445 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002446#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002447 break;
2448
2449 default:
2450 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002451 errmsg = "unsupported Unicode code range";
Benjamin Peterson29060642009-01-31 22:14:21 +00002452 startinpos = s-starts;
2453 endinpos = startinpos+n;
2454 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002455 }
2456 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00002457 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002458
Benjamin Peterson29060642009-01-31 22:14:21 +00002459 utf8Error:
2460 outpos = p-PyUnicode_AS_UNICODE(unicode);
2461 if (unicode_decode_call_errorhandler(
2462 errors, &errorHandler,
2463 "utf8", errmsg,
2464 &starts, &e, &startinpos, &endinpos, &exc, &s,
2465 &unicode, &outpos, &p))
2466 goto onError;
2467 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002468 }
Walter Dörwald69652032004-09-07 20:24:22 +00002469 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002470 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002471
2472 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002473 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002474 goto onError;
2475
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002476 Py_XDECREF(errorHandler);
2477 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002478 return (PyObject *)unicode;
2479
Benjamin Peterson29060642009-01-31 22:14:21 +00002480 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002481 Py_XDECREF(errorHandler);
2482 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002483 Py_DECREF(unicode);
2484 return NULL;
2485}
2486
Antoine Pitrouab868312009-01-10 15:40:25 +00002487#undef ASCII_CHAR_MASK
2488
2489
Tim Peters602f7402002-04-27 18:03:26 +00002490/* Allocation strategy: if the string is short, convert into a stack buffer
2491 and allocate exactly as much space needed at the end. Else allocate the
2492 maximum possible needed (4 result bytes per Unicode character), and return
2493 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002494*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002495PyObject *
2496PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002497 Py_ssize_t size,
2498 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002499{
Tim Peters602f7402002-04-27 18:03:26 +00002500#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002501
Guido van Rossum98297ee2007-11-06 21:34:58 +00002502 Py_ssize_t i; /* index into s of next input byte */
2503 PyObject *result; /* result string object */
2504 char *p; /* next free byte in output buffer */
2505 Py_ssize_t nallocated; /* number of result bytes allocated */
2506 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002507 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002508 PyObject *errorHandler = NULL;
2509 PyObject *exc = NULL;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002510
Tim Peters602f7402002-04-27 18:03:26 +00002511 assert(s != NULL);
2512 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002513
Tim Peters602f7402002-04-27 18:03:26 +00002514 if (size <= MAX_SHORT_UNICHARS) {
2515 /* Write into the stack buffer; nallocated can't overflow.
2516 * At the end, we'll allocate exactly as much heap space as it
2517 * turns out we need.
2518 */
2519 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002520 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00002521 p = stackbuf;
2522 }
2523 else {
2524 /* Overallocate on the heap, and give the excess back at the end. */
2525 nallocated = size * 4;
2526 if (nallocated / 4 != size) /* overflow! */
2527 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00002528 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002529 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00002530 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00002531 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002532 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002533
Tim Peters602f7402002-04-27 18:03:26 +00002534 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002535 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002536
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002537 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002538 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002539 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002540
Guido van Rossumd57fd912000-03-10 22:53:23 +00002541 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002542 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002543 *p++ = (char)(0xc0 | (ch >> 6));
2544 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002545 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002546#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00002547 /* Special case: check for high and low surrogate */
2548 if (ch <= 0xDBFF && i != size && 0xDC00 <= s[i] && s[i] <= 0xDFFF) {
2549 Py_UCS4 ch2 = s[i];
2550 /* Combine the two surrogates to form a UCS4 value */
2551 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2552 i++;
2553
2554 /* Encode UCS4 Unicode ordinals */
2555 *p++ = (char)(0xf0 | (ch >> 18));
2556 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
Tim Peters602f7402002-04-27 18:03:26 +00002557 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2558 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002559 } else {
Victor Stinner445a6232010-04-22 20:01:57 +00002560#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00002561 Py_ssize_t newpos;
2562 PyObject *rep;
2563 Py_ssize_t repsize, k;
2564 rep = unicode_encode_call_errorhandler
2565 (errors, &errorHandler, "utf-8", "surrogates not allowed",
2566 s, size, &exc, i-1, i, &newpos);
2567 if (!rep)
2568 goto error;
2569
2570 if (PyBytes_Check(rep))
2571 repsize = PyBytes_GET_SIZE(rep);
2572 else
2573 repsize = PyUnicode_GET_SIZE(rep);
2574
2575 if (repsize > 4) {
2576 Py_ssize_t offset;
2577
2578 if (result == NULL)
2579 offset = p - stackbuf;
2580 else
2581 offset = p - PyBytes_AS_STRING(result);
2582
2583 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
2584 /* integer overflow */
2585 PyErr_NoMemory();
2586 goto error;
2587 }
2588 nallocated += repsize - 4;
2589 if (result != NULL) {
2590 if (_PyBytes_Resize(&result, nallocated) < 0)
2591 goto error;
2592 } else {
2593 result = PyBytes_FromStringAndSize(NULL, nallocated);
2594 if (result == NULL)
2595 goto error;
2596 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
2597 }
2598 p = PyBytes_AS_STRING(result) + offset;
2599 }
2600
2601 if (PyBytes_Check(rep)) {
2602 char *prep = PyBytes_AS_STRING(rep);
2603 for(k = repsize; k > 0; k--)
2604 *p++ = *prep++;
2605 } else /* rep is unicode */ {
2606 Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
2607 Py_UNICODE c;
2608
2609 for(k=0; k<repsize; k++) {
2610 c = prep[k];
2611 if (0x80 <= c) {
2612 raise_encode_exception(&exc, "utf-8", s, size,
2613 i-1, i, "surrogates not allowed");
2614 goto error;
2615 }
2616 *p++ = (char)prep[k];
2617 }
2618 }
2619 Py_DECREF(rep);
Victor Stinner445a6232010-04-22 20:01:57 +00002620#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00002621 }
Victor Stinner445a6232010-04-22 20:01:57 +00002622#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00002623 } else if (ch < 0x10000) {
2624 *p++ = (char)(0xe0 | (ch >> 12));
2625 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2626 *p++ = (char)(0x80 | (ch & 0x3f));
2627 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00002628 /* Encode UCS4 Unicode ordinals */
2629 *p++ = (char)(0xf0 | (ch >> 18));
2630 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2631 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2632 *p++ = (char)(0x80 | (ch & 0x3f));
2633 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002634 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002635
Guido van Rossum98297ee2007-11-06 21:34:58 +00002636 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00002637 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002638 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002639 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002640 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002641 }
2642 else {
Christian Heimesf3863112007-11-22 07:46:41 +00002643 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00002644 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002645 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002646 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002647 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002648 Py_XDECREF(errorHandler);
2649 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002650 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002651 error:
2652 Py_XDECREF(errorHandler);
2653 Py_XDECREF(exc);
2654 Py_XDECREF(result);
2655 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002656
Tim Peters602f7402002-04-27 18:03:26 +00002657#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002658}
2659
Guido van Rossumd57fd912000-03-10 22:53:23 +00002660PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2661{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002662 if (!PyUnicode_Check(unicode)) {
2663 PyErr_BadArgument();
2664 return NULL;
2665 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002666 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00002667 PyUnicode_GET_SIZE(unicode),
2668 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002669}
2670
Walter Dörwald41980ca2007-08-16 21:55:45 +00002671/* --- UTF-32 Codec ------------------------------------------------------- */
2672
2673PyObject *
2674PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002675 Py_ssize_t size,
2676 const char *errors,
2677 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002678{
2679 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2680}
2681
2682PyObject *
2683PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002684 Py_ssize_t size,
2685 const char *errors,
2686 int *byteorder,
2687 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002688{
2689 const char *starts = s;
2690 Py_ssize_t startinpos;
2691 Py_ssize_t endinpos;
2692 Py_ssize_t outpos;
2693 PyUnicodeObject *unicode;
2694 Py_UNICODE *p;
2695#ifndef Py_UNICODE_WIDE
2696 int i, pairs;
2697#else
2698 const int pairs = 0;
2699#endif
2700 const unsigned char *q, *e;
2701 int bo = 0; /* assume native ordering by default */
2702 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00002703 /* Offsets from q for retrieving bytes in the right order. */
2704#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2705 int iorder[] = {0, 1, 2, 3};
2706#else
2707 int iorder[] = {3, 2, 1, 0};
2708#endif
2709 PyObject *errorHandler = NULL;
2710 PyObject *exc = NULL;
Guido van Rossum8d991ed2007-08-17 15:41:00 +00002711 /* On narrow builds we split characters outside the BMP into two
2712 codepoints => count how much extra space we need. */
2713#ifndef Py_UNICODE_WIDE
2714 for (i = pairs = 0; i < size/4; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00002715 if (((Py_UCS4 *)s)[i] >= 0x10000)
2716 pairs++;
Guido van Rossum8d991ed2007-08-17 15:41:00 +00002717#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00002718
2719 /* This might be one to much, because of a BOM */
2720 unicode = _PyUnicode_New((size+3)/4+pairs);
2721 if (!unicode)
2722 return NULL;
2723 if (size == 0)
2724 return (PyObject *)unicode;
2725
2726 /* Unpack UTF-32 encoded data */
2727 p = unicode->str;
2728 q = (unsigned char *)s;
2729 e = q + size;
2730
2731 if (byteorder)
2732 bo = *byteorder;
2733
2734 /* Check for BOM marks (U+FEFF) in the input and adjust current
2735 byte order setting accordingly. In native mode, the leading BOM
2736 mark is skipped, in all other modes, it is copied to the output
2737 stream as-is (giving a ZWNBSP character). */
2738 if (bo == 0) {
2739 if (size >= 4) {
2740 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00002741 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00002742#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00002743 if (bom == 0x0000FEFF) {
2744 q += 4;
2745 bo = -1;
2746 }
2747 else if (bom == 0xFFFE0000) {
2748 q += 4;
2749 bo = 1;
2750 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002751#else
Benjamin Peterson29060642009-01-31 22:14:21 +00002752 if (bom == 0x0000FEFF) {
2753 q += 4;
2754 bo = 1;
2755 }
2756 else if (bom == 0xFFFE0000) {
2757 q += 4;
2758 bo = -1;
2759 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002760#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002761 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002762 }
2763
2764 if (bo == -1) {
2765 /* force LE */
2766 iorder[0] = 0;
2767 iorder[1] = 1;
2768 iorder[2] = 2;
2769 iorder[3] = 3;
2770 }
2771 else if (bo == 1) {
2772 /* force BE */
2773 iorder[0] = 3;
2774 iorder[1] = 2;
2775 iorder[2] = 1;
2776 iorder[3] = 0;
2777 }
2778
2779 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002780 Py_UCS4 ch;
2781 /* remaining bytes at the end? (size should be divisible by 4) */
2782 if (e-q<4) {
2783 if (consumed)
2784 break;
2785 errmsg = "truncated data";
2786 startinpos = ((const char *)q)-starts;
2787 endinpos = ((const char *)e)-starts;
2788 goto utf32Error;
2789 /* The remaining input chars are ignored if the callback
2790 chooses to skip the input */
2791 }
2792 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2793 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00002794
Benjamin Peterson29060642009-01-31 22:14:21 +00002795 if (ch >= 0x110000)
2796 {
2797 errmsg = "codepoint not in range(0x110000)";
2798 startinpos = ((const char *)q)-starts;
2799 endinpos = startinpos+4;
2800 goto utf32Error;
2801 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002802#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002803 if (ch >= 0x10000)
2804 {
2805 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2806 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2807 }
2808 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00002809#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002810 *p++ = ch;
2811 q += 4;
2812 continue;
2813 utf32Error:
2814 outpos = p-PyUnicode_AS_UNICODE(unicode);
2815 if (unicode_decode_call_errorhandler(
2816 errors, &errorHandler,
2817 "utf32", errmsg,
2818 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
2819 &unicode, &outpos, &p))
2820 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002821 }
2822
2823 if (byteorder)
2824 *byteorder = bo;
2825
2826 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002827 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002828
2829 /* Adjust length */
2830 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2831 goto onError;
2832
2833 Py_XDECREF(errorHandler);
2834 Py_XDECREF(exc);
2835 return (PyObject *)unicode;
2836
Benjamin Peterson29060642009-01-31 22:14:21 +00002837 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00002838 Py_DECREF(unicode);
2839 Py_XDECREF(errorHandler);
2840 Py_XDECREF(exc);
2841 return NULL;
2842}
2843
2844PyObject *
2845PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002846 Py_ssize_t size,
2847 const char *errors,
2848 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002849{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002850 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002851 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002852 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002853#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002854 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002855#else
2856 const int pairs = 0;
2857#endif
2858 /* Offsets from p for storing byte pairs in the right order. */
2859#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2860 int iorder[] = {0, 1, 2, 3};
2861#else
2862 int iorder[] = {3, 2, 1, 0};
2863#endif
2864
Benjamin Peterson29060642009-01-31 22:14:21 +00002865#define STORECHAR(CH) \
2866 do { \
2867 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2868 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2869 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2870 p[iorder[0]] = (CH) & 0xff; \
2871 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00002872 } while(0)
2873
2874 /* In narrow builds we can output surrogate pairs as one codepoint,
2875 so we need less space. */
2876#ifndef Py_UNICODE_WIDE
2877 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00002878 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2879 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2880 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002881#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002882 nsize = (size - pairs + (byteorder == 0));
2883 bytesize = nsize * 4;
2884 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00002885 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002886 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002887 if (v == NULL)
2888 return NULL;
2889
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002890 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002891 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002892 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002893 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00002894 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002895
2896 if (byteorder == -1) {
2897 /* force LE */
2898 iorder[0] = 0;
2899 iorder[1] = 1;
2900 iorder[2] = 2;
2901 iorder[3] = 3;
2902 }
2903 else if (byteorder == 1) {
2904 /* force BE */
2905 iorder[0] = 3;
2906 iorder[1] = 2;
2907 iorder[2] = 1;
2908 iorder[3] = 0;
2909 }
2910
2911 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002912 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002913#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002914 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2915 Py_UCS4 ch2 = *s;
2916 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2917 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2918 s++;
2919 size--;
2920 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002921 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002922#endif
2923 STORECHAR(ch);
2924 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002925
2926 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002927 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002928#undef STORECHAR
2929}
2930
2931PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2932{
2933 if (!PyUnicode_Check(unicode)) {
2934 PyErr_BadArgument();
2935 return NULL;
2936 }
2937 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00002938 PyUnicode_GET_SIZE(unicode),
2939 NULL,
2940 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002941}
2942
Guido van Rossumd57fd912000-03-10 22:53:23 +00002943/* --- UTF-16 Codec ------------------------------------------------------- */
2944
Tim Peters772747b2001-08-09 22:21:55 +00002945PyObject *
2946PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002947 Py_ssize_t size,
2948 const char *errors,
2949 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002950{
Walter Dörwald69652032004-09-07 20:24:22 +00002951 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2952}
2953
Antoine Pitrouab868312009-01-10 15:40:25 +00002954/* Two masks for fast checking of whether a C 'long' may contain
2955 UTF16-encoded surrogate characters. This is an efficient heuristic,
2956 assuming that non-surrogate characters with a code point >= 0x8000 are
2957 rare in most input.
2958 FAST_CHAR_MASK is used when the input is in native byte ordering,
2959 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00002960*/
Antoine Pitrouab868312009-01-10 15:40:25 +00002961#if (SIZEOF_LONG == 8)
2962# define FAST_CHAR_MASK 0x8000800080008000L
2963# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
2964#elif (SIZEOF_LONG == 4)
2965# define FAST_CHAR_MASK 0x80008000L
2966# define SWAPPED_FAST_CHAR_MASK 0x00800080L
2967#else
2968# error C 'long' size should be either 4 or 8!
2969#endif
2970
Walter Dörwald69652032004-09-07 20:24:22 +00002971PyObject *
2972PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002973 Py_ssize_t size,
2974 const char *errors,
2975 int *byteorder,
2976 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002977{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002978 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002979 Py_ssize_t startinpos;
2980 Py_ssize_t endinpos;
2981 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002982 PyUnicodeObject *unicode;
2983 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00002984 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00002985 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00002986 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002987 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002988 /* Offsets from q for retrieving byte pairs in the right order. */
2989#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2990 int ihi = 1, ilo = 0;
2991#else
2992 int ihi = 0, ilo = 1;
2993#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002994 PyObject *errorHandler = NULL;
2995 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002996
2997 /* Note: size will always be longer than the resulting Unicode
2998 character count */
2999 unicode = _PyUnicode_New(size);
3000 if (!unicode)
3001 return NULL;
3002 if (size == 0)
3003 return (PyObject *)unicode;
3004
3005 /* Unpack UTF-16 encoded data */
3006 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00003007 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00003008 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003009
3010 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00003011 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003012
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003013 /* Check for BOM marks (U+FEFF) in the input and adjust current
3014 byte order setting accordingly. In native mode, the leading BOM
3015 mark is skipped, in all other modes, it is copied to the output
3016 stream as-is (giving a ZWNBSP character). */
3017 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00003018 if (size >= 2) {
3019 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003020#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00003021 if (bom == 0xFEFF) {
3022 q += 2;
3023 bo = -1;
3024 }
3025 else if (bom == 0xFFFE) {
3026 q += 2;
3027 bo = 1;
3028 }
Tim Petersced69f82003-09-16 20:30:58 +00003029#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003030 if (bom == 0xFEFF) {
3031 q += 2;
3032 bo = 1;
3033 }
3034 else if (bom == 0xFFFE) {
3035 q += 2;
3036 bo = -1;
3037 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003038#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003039 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003040 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003041
Tim Peters772747b2001-08-09 22:21:55 +00003042 if (bo == -1) {
3043 /* force LE */
3044 ihi = 1;
3045 ilo = 0;
3046 }
3047 else if (bo == 1) {
3048 /* force BE */
3049 ihi = 0;
3050 ilo = 1;
3051 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003052#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3053 native_ordering = ilo < ihi;
3054#else
3055 native_ordering = ilo > ihi;
3056#endif
Tim Peters772747b2001-08-09 22:21:55 +00003057
Antoine Pitrouab868312009-01-10 15:40:25 +00003058 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00003059 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003060 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00003061 /* First check for possible aligned read of a C 'long'. Unaligned
3062 reads are more expensive, better to defer to another iteration. */
3063 if (!((size_t) q & LONG_PTR_MASK)) {
3064 /* Fast path for runs of non-surrogate chars. */
3065 register const unsigned char *_q = q;
3066 Py_UNICODE *_p = p;
3067 if (native_ordering) {
3068 /* Native ordering is simple: as long as the input cannot
3069 possibly contain a surrogate char, do an unrolled copy
3070 of several 16-bit code points to the target object.
3071 The non-surrogate check is done on several input bytes
3072 at a time (as many as a C 'long' can contain). */
3073 while (_q < aligned_end) {
3074 unsigned long data = * (unsigned long *) _q;
3075 if (data & FAST_CHAR_MASK)
3076 break;
3077 _p[0] = ((unsigned short *) _q)[0];
3078 _p[1] = ((unsigned short *) _q)[1];
3079#if (SIZEOF_LONG == 8)
3080 _p[2] = ((unsigned short *) _q)[2];
3081 _p[3] = ((unsigned short *) _q)[3];
3082#endif
3083 _q += SIZEOF_LONG;
3084 _p += SIZEOF_LONG / 2;
3085 }
3086 }
3087 else {
3088 /* Byteswapped ordering is similar, but we must decompose
3089 the copy bytewise, and take care of zero'ing out the
3090 upper bytes if the target object is in 32-bit units
3091 (that is, in UCS-4 builds). */
3092 while (_q < aligned_end) {
3093 unsigned long data = * (unsigned long *) _q;
3094 if (data & SWAPPED_FAST_CHAR_MASK)
3095 break;
3096 /* Zero upper bytes in UCS-4 builds */
3097#if (Py_UNICODE_SIZE > 2)
3098 _p[0] = 0;
3099 _p[1] = 0;
3100#if (SIZEOF_LONG == 8)
3101 _p[2] = 0;
3102 _p[3] = 0;
3103#endif
3104#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003105 /* Issue #4916; UCS-4 builds on big endian machines must
3106 fill the two last bytes of each 4-byte unit. */
3107#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
3108# define OFF 2
3109#else
3110# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00003111#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003112 ((unsigned char *) _p)[OFF + 1] = _q[0];
3113 ((unsigned char *) _p)[OFF + 0] = _q[1];
3114 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
3115 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
3116#if (SIZEOF_LONG == 8)
3117 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
3118 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
3119 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
3120 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
3121#endif
3122#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00003123 _q += SIZEOF_LONG;
3124 _p += SIZEOF_LONG / 2;
3125 }
3126 }
3127 p = _p;
3128 q = _q;
3129 if (q >= e)
3130 break;
3131 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003132 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003133
Benjamin Peterson14339b62009-01-31 16:36:08 +00003134 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00003135
3136 if (ch < 0xD800 || ch > 0xDFFF) {
3137 *p++ = ch;
3138 continue;
3139 }
3140
3141 /* UTF-16 code pair: */
3142 if (q > e) {
3143 errmsg = "unexpected end of data";
3144 startinpos = (((const char *)q) - 2) - starts;
3145 endinpos = ((const char *)e) + 1 - starts;
3146 goto utf16Error;
3147 }
3148 if (0xD800 <= ch && ch <= 0xDBFF) {
3149 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
3150 q += 2;
3151 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00003152#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003153 *p++ = ch;
3154 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003155#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003156 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003157#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003158 continue;
3159 }
3160 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003161 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00003162 startinpos = (((const char *)q)-4)-starts;
3163 endinpos = startinpos+2;
3164 goto utf16Error;
3165 }
3166
Benjamin Peterson14339b62009-01-31 16:36:08 +00003167 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003168 errmsg = "illegal encoding";
3169 startinpos = (((const char *)q)-2)-starts;
3170 endinpos = startinpos+2;
3171 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003172
Benjamin Peterson29060642009-01-31 22:14:21 +00003173 utf16Error:
3174 outpos = p - PyUnicode_AS_UNICODE(unicode);
3175 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00003176 errors,
3177 &errorHandler,
3178 "utf16", errmsg,
3179 &starts,
3180 (const char **)&e,
3181 &startinpos,
3182 &endinpos,
3183 &exc,
3184 (const char **)&q,
3185 &unicode,
3186 &outpos,
3187 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00003188 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003189 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003190 /* remaining byte at the end? (size should be even) */
3191 if (e == q) {
3192 if (!consumed) {
3193 errmsg = "truncated data";
3194 startinpos = ((const char *)q) - starts;
3195 endinpos = ((const char *)e) + 1 - starts;
3196 outpos = p - PyUnicode_AS_UNICODE(unicode);
3197 if (unicode_decode_call_errorhandler(
3198 errors,
3199 &errorHandler,
3200 "utf16", errmsg,
3201 &starts,
3202 (const char **)&e,
3203 &startinpos,
3204 &endinpos,
3205 &exc,
3206 (const char **)&q,
3207 &unicode,
3208 &outpos,
3209 &p))
3210 goto onError;
3211 /* The remaining input chars are ignored if the callback
3212 chooses to skip the input */
3213 }
3214 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003215
3216 if (byteorder)
3217 *byteorder = bo;
3218
Walter Dörwald69652032004-09-07 20:24:22 +00003219 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003220 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00003221
Guido van Rossumd57fd912000-03-10 22:53:23 +00003222 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003223 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003224 goto onError;
3225
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003226 Py_XDECREF(errorHandler);
3227 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003228 return (PyObject *)unicode;
3229
Benjamin Peterson29060642009-01-31 22:14:21 +00003230 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003231 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003232 Py_XDECREF(errorHandler);
3233 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003234 return NULL;
3235}
3236
Antoine Pitrouab868312009-01-10 15:40:25 +00003237#undef FAST_CHAR_MASK
3238#undef SWAPPED_FAST_CHAR_MASK
3239
Tim Peters772747b2001-08-09 22:21:55 +00003240PyObject *
3241PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003242 Py_ssize_t size,
3243 const char *errors,
3244 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003245{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003246 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00003247 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003248 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003249#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003250 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003251#else
3252 const int pairs = 0;
3253#endif
Tim Peters772747b2001-08-09 22:21:55 +00003254 /* Offsets from p for storing byte pairs in the right order. */
3255#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3256 int ihi = 1, ilo = 0;
3257#else
3258 int ihi = 0, ilo = 1;
3259#endif
3260
Benjamin Peterson29060642009-01-31 22:14:21 +00003261#define STORECHAR(CH) \
3262 do { \
3263 p[ihi] = ((CH) >> 8) & 0xff; \
3264 p[ilo] = (CH) & 0xff; \
3265 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00003266 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003267
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003268#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003269 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003270 if (s[i] >= 0x10000)
3271 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003272#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003273 /* 2 * (size + pairs + (byteorder == 0)) */
3274 if (size > PY_SSIZE_T_MAX ||
3275 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00003276 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003277 nsize = size + pairs + (byteorder == 0);
3278 bytesize = nsize * 2;
3279 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003280 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003281 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003282 if (v == NULL)
3283 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003284
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003285 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003286 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003287 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003288 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003289 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00003290
3291 if (byteorder == -1) {
3292 /* force LE */
3293 ihi = 1;
3294 ilo = 0;
3295 }
3296 else if (byteorder == 1) {
3297 /* force BE */
3298 ihi = 0;
3299 ilo = 1;
3300 }
3301
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003302 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003303 Py_UNICODE ch = *s++;
3304 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003305#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003306 if (ch >= 0x10000) {
3307 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
3308 ch = 0xD800 | ((ch-0x10000) >> 10);
3309 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003310#endif
Tim Peters772747b2001-08-09 22:21:55 +00003311 STORECHAR(ch);
3312 if (ch2)
3313 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003314 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003315
3316 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003317 return v;
Tim Peters772747b2001-08-09 22:21:55 +00003318#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00003319}
3320
3321PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
3322{
3323 if (!PyUnicode_Check(unicode)) {
3324 PyErr_BadArgument();
3325 return NULL;
3326 }
3327 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003328 PyUnicode_GET_SIZE(unicode),
3329 NULL,
3330 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003331}
3332
3333/* --- Unicode Escape Codec ----------------------------------------------- */
3334
Fredrik Lundh06d12682001-01-24 07:59:11 +00003335static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00003336
Guido van Rossumd57fd912000-03-10 22:53:23 +00003337PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003338 Py_ssize_t size,
3339 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003340{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003341 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003342 Py_ssize_t startinpos;
3343 Py_ssize_t endinpos;
3344 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003345 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003346 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003347 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003348 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003349 char* message;
3350 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003351 PyObject *errorHandler = NULL;
3352 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003353
Guido van Rossumd57fd912000-03-10 22:53:23 +00003354 /* Escaped strings will always be longer than the resulting
3355 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003356 length after conversion to the true value.
3357 (but if the error callback returns a long replacement string
3358 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003359 v = _PyUnicode_New(size);
3360 if (v == NULL)
3361 goto onError;
3362 if (size == 0)
3363 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003364
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003365 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003366 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003367
Guido van Rossumd57fd912000-03-10 22:53:23 +00003368 while (s < end) {
3369 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003370 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003371 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003372
3373 /* Non-escape characters are interpreted as Unicode ordinals */
3374 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003375 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003376 continue;
3377 }
3378
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003379 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003380 /* \ - Escapes */
3381 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003382 c = *s++;
3383 if (s > end)
3384 c = '\0'; /* Invalid after \ */
3385 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003386
Benjamin Peterson29060642009-01-31 22:14:21 +00003387 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003388 case '\n': break;
3389 case '\\': *p++ = '\\'; break;
3390 case '\'': *p++ = '\''; break;
3391 case '\"': *p++ = '\"'; break;
3392 case 'b': *p++ = '\b'; break;
3393 case 'f': *p++ = '\014'; break; /* FF */
3394 case 't': *p++ = '\t'; break;
3395 case 'n': *p++ = '\n'; break;
3396 case 'r': *p++ = '\r'; break;
3397 case 'v': *p++ = '\013'; break; /* VT */
3398 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
3399
Benjamin Peterson29060642009-01-31 22:14:21 +00003400 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003401 case '0': case '1': case '2': case '3':
3402 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003403 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003404 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003405 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003406 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003407 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00003408 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003409 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003410 break;
3411
Benjamin Peterson29060642009-01-31 22:14:21 +00003412 /* hex escapes */
3413 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003414 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003415 digits = 2;
3416 message = "truncated \\xXX escape";
3417 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003418
Benjamin Peterson29060642009-01-31 22:14:21 +00003419 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003420 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003421 digits = 4;
3422 message = "truncated \\uXXXX escape";
3423 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003424
Benjamin Peterson29060642009-01-31 22:14:21 +00003425 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00003426 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003427 digits = 8;
3428 message = "truncated \\UXXXXXXXX escape";
3429 hexescape:
3430 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003431 outpos = p-PyUnicode_AS_UNICODE(v);
3432 if (s+digits>end) {
3433 endinpos = size;
3434 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003435 errors, &errorHandler,
3436 "unicodeescape", "end of string in escape sequence",
3437 &starts, &end, &startinpos, &endinpos, &exc, &s,
3438 &v, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003439 goto onError;
3440 goto nextByte;
3441 }
3442 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003443 c = (unsigned char) s[i];
Guido van Rossumdaa251c2007-10-25 23:47:33 +00003444 if (!ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003445 endinpos = (s+i+1)-starts;
3446 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003447 errors, &errorHandler,
3448 "unicodeescape", message,
3449 &starts, &end, &startinpos, &endinpos, &exc, &s,
3450 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003451 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003452 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00003453 }
3454 chr = (chr<<4) & ~0xF;
3455 if (c >= '0' && c <= '9')
3456 chr += c - '0';
3457 else if (c >= 'a' && c <= 'f')
3458 chr += 10 + c - 'a';
3459 else
3460 chr += 10 + c - 'A';
3461 }
3462 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00003463 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003464 /* _decoding_error will have already written into the
3465 target buffer. */
3466 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003467 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00003468 /* when we get here, chr is a 32-bit unicode character */
3469 if (chr <= 0xffff)
3470 /* UCS-2 character */
3471 *p++ = (Py_UNICODE) chr;
3472 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003473 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00003474 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00003475#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003476 *p++ = chr;
3477#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00003478 chr -= 0x10000L;
3479 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00003480 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003481#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00003482 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003483 endinpos = s-starts;
3484 outpos = p-PyUnicode_AS_UNICODE(v);
3485 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003486 errors, &errorHandler,
3487 "unicodeescape", "illegal Unicode character",
3488 &starts, &end, &startinpos, &endinpos, &exc, &s,
3489 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003490 goto onError;
3491 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003492 break;
3493
Benjamin Peterson29060642009-01-31 22:14:21 +00003494 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00003495 case 'N':
3496 message = "malformed \\N character escape";
3497 if (ucnhash_CAPI == NULL) {
3498 /* load the unicode data module */
Benjamin Petersonb173f782009-05-05 22:31:58 +00003499 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00003500 if (ucnhash_CAPI == NULL)
3501 goto ucnhashError;
3502 }
3503 if (*s == '{') {
3504 const char *start = s+1;
3505 /* look for the closing brace */
3506 while (*s != '}' && s < end)
3507 s++;
3508 if (s > start && s < end && *s == '}') {
3509 /* found a name. look it up in the unicode database */
3510 message = "unknown Unicode character name";
3511 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00003512 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003513 goto store;
3514 }
3515 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003516 endinpos = s-starts;
3517 outpos = p-PyUnicode_AS_UNICODE(v);
3518 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003519 errors, &errorHandler,
3520 "unicodeescape", message,
3521 &starts, &end, &startinpos, &endinpos, &exc, &s,
3522 &v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003523 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003524 break;
3525
3526 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00003527 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003528 message = "\\ at end of string";
3529 s--;
3530 endinpos = s-starts;
3531 outpos = p-PyUnicode_AS_UNICODE(v);
3532 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003533 errors, &errorHandler,
3534 "unicodeescape", message,
3535 &starts, &end, &startinpos, &endinpos, &exc, &s,
3536 &v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00003537 goto onError;
3538 }
3539 else {
3540 *p++ = '\\';
3541 *p++ = (unsigned char)s[-1];
3542 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003543 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003544 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003545 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003546 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003547 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003548 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003549 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00003550 Py_XDECREF(errorHandler);
3551 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003552 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00003553
Benjamin Peterson29060642009-01-31 22:14:21 +00003554 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00003555 PyErr_SetString(
3556 PyExc_UnicodeError,
3557 "\\N escapes not supported (can't load unicodedata module)"
3558 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003559 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003560 Py_XDECREF(errorHandler);
3561 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00003562 return NULL;
3563
Benjamin Peterson29060642009-01-31 22:14:21 +00003564 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003565 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003566 Py_XDECREF(errorHandler);
3567 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003568 return NULL;
3569}
3570
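/* Return a Unicode-Escape string version of the Unicode object.

   If quotes is true, the string is enclosed in u"" or u'' quotes as
   appropriate.

   NOTE(review): this comment looks stale and misplaced.  It sits directly
   above findchar(), while the encoder it appears to describe,
   PyUnicode_EncodeUnicodeEscape() further below, takes no `quotes'
   argument and emits no quotes.  Confirm and move or reword.

*/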
3577
Thomas Wouters477c8d52006-05-27 19:21:47 +00003578Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003579 Py_ssize_t size,
3580 Py_UNICODE ch)
Thomas Wouters477c8d52006-05-27 19:21:47 +00003581{
3582 /* like wcschr, but doesn't stop at NULL characters */
3583
3584 while (size-- > 0) {
3585 if (*s == ch)
3586 return s;
3587 s++;
3588 }
3589
3590 return NULL;
3591}
Barry Warsaw51ac5802000-03-20 16:36:48 +00003592
Walter Dörwald79e913e2007-05-12 11:08:06 +00003593static const char *hexdigits = "0123456789abcdef";
3594
3595PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003596 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003597{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003598 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003599 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003600
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003601#ifdef Py_UNICODE_WIDE
3602 const Py_ssize_t expandsize = 10;
3603#else
3604 const Py_ssize_t expandsize = 6;
3605#endif
3606
Thomas Wouters89f507f2006-12-13 04:49:30 +00003607 /* XXX(nnorwitz): rather than over-allocating, it would be
3608 better to choose a different scheme. Perhaps scan the
3609 first N-chars of the string and allocate based on that size.
3610 */
3611 /* Initial allocation is based on the longest-possible unichr
3612 escape.
3613
3614 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3615 unichr, so in this case it's the longest unichr escape. In
3616 narrow (UTF-16) builds this is five chars per source unichr
3617 since there are two unichrs in the surrogate pair, so in narrow
3618 (UTF-16) builds it's not the longest unichr escape.
3619
3620 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3621 so in the narrow (UTF-16) build case it's the longest unichr
3622 escape.
3623 */
3624
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003625 if (size == 0)
3626 return PyBytes_FromStringAndSize(NULL, 0);
3627
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003628 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003629 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003630
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003631 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00003632 2
3633 + expandsize*size
3634 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003635 if (repr == NULL)
3636 return NULL;
3637
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003638 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003639
Guido van Rossumd57fd912000-03-10 22:53:23 +00003640 while (size-- > 0) {
3641 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003642
Walter Dörwald79e913e2007-05-12 11:08:06 +00003643 /* Escape backslashes */
3644 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003645 *p++ = '\\';
3646 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00003647 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003648 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003649
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00003650#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003651 /* Map 21-bit characters to '\U00xxxxxx' */
3652 else if (ch >= 0x10000) {
3653 *p++ = '\\';
3654 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003655 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
3656 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
3657 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
3658 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
3659 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
3660 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
3661 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
3662 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00003663 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003664 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003665#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003666 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3667 else if (ch >= 0xD800 && ch < 0xDC00) {
3668 Py_UNICODE ch2;
3669 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00003670
Benjamin Peterson29060642009-01-31 22:14:21 +00003671 ch2 = *s++;
3672 size--;
3673 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3674 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3675 *p++ = '\\';
3676 *p++ = 'U';
3677 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
3678 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
3679 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
3680 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
3681 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
3682 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
3683 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
3684 *p++ = hexdigits[ucs & 0x0000000F];
3685 continue;
3686 }
3687 /* Fall through: isolated surrogates are copied as-is */
3688 s--;
3689 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003690 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003691#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003692
Guido van Rossumd57fd912000-03-10 22:53:23 +00003693 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003694 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003695 *p++ = '\\';
3696 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003697 *p++ = hexdigits[(ch >> 12) & 0x000F];
3698 *p++ = hexdigits[(ch >> 8) & 0x000F];
3699 *p++ = hexdigits[(ch >> 4) & 0x000F];
3700 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003701 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003702
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003703 /* Map special whitespace to '\t', \n', '\r' */
3704 else if (ch == '\t') {
3705 *p++ = '\\';
3706 *p++ = 't';
3707 }
3708 else if (ch == '\n') {
3709 *p++ = '\\';
3710 *p++ = 'n';
3711 }
3712 else if (ch == '\r') {
3713 *p++ = '\\';
3714 *p++ = 'r';
3715 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003716
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003717 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003718 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003719 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003720 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003721 *p++ = hexdigits[(ch >> 4) & 0x000F];
3722 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003723 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003724
Guido van Rossumd57fd912000-03-10 22:53:23 +00003725 /* Copy everything else as-is */
3726 else
3727 *p++ = (char) ch;
3728 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003729
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003730 assert(p - PyBytes_AS_STRING(repr) > 0);
3731 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
3732 return NULL;
3733 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003734}
3735
Alexandre Vassalotti2056bed2008-12-27 19:46:35 +00003736PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003737{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003738 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003739 if (!PyUnicode_Check(unicode)) {
3740 PyErr_BadArgument();
3741 return NULL;
3742 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00003743 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3744 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003745 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003746}
3747
3748/* --- Raw Unicode Escape Codec ------------------------------------------- */
3749
3750PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003751 Py_ssize_t size,
3752 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003753{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003754 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003755 Py_ssize_t startinpos;
3756 Py_ssize_t endinpos;
3757 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003758 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003759 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003760 const char *end;
3761 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003762 PyObject *errorHandler = NULL;
3763 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003764
Guido van Rossumd57fd912000-03-10 22:53:23 +00003765 /* Escaped strings will always be longer than the resulting
3766 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003767 length after conversion to the true value. (But decoding error
3768 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003769 v = _PyUnicode_New(size);
3770 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003771 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003772 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003773 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003774 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003775 end = s + size;
3776 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003777 unsigned char c;
3778 Py_UCS4 x;
3779 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003780 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003781
Benjamin Peterson29060642009-01-31 22:14:21 +00003782 /* Non-escape characters are interpreted as Unicode ordinals */
3783 if (*s != '\\') {
3784 *p++ = (unsigned char)*s++;
3785 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003786 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003787 startinpos = s-starts;
3788
3789 /* \u-escapes are only interpreted iff the number of leading
3790 backslashes if odd */
3791 bs = s;
3792 for (;s < end;) {
3793 if (*s != '\\')
3794 break;
3795 *p++ = (unsigned char)*s++;
3796 }
3797 if (((s - bs) & 1) == 0 ||
3798 s >= end ||
3799 (*s != 'u' && *s != 'U')) {
3800 continue;
3801 }
3802 p--;
3803 count = *s=='u' ? 4 : 8;
3804 s++;
3805
3806 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3807 outpos = p-PyUnicode_AS_UNICODE(v);
3808 for (x = 0, i = 0; i < count; ++i, ++s) {
3809 c = (unsigned char)*s;
3810 if (!ISXDIGIT(c)) {
3811 endinpos = s-starts;
3812 if (unicode_decode_call_errorhandler(
3813 errors, &errorHandler,
3814 "rawunicodeescape", "truncated \\uXXXX",
3815 &starts, &end, &startinpos, &endinpos, &exc, &s,
3816 &v, &outpos, &p))
3817 goto onError;
3818 goto nextByte;
3819 }
3820 x = (x<<4) & ~0xF;
3821 if (c >= '0' && c <= '9')
3822 x += c - '0';
3823 else if (c >= 'a' && c <= 'f')
3824 x += 10 + c - 'a';
3825 else
3826 x += 10 + c - 'A';
3827 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00003828 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00003829 /* UCS-2 character */
3830 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003831 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003832 /* UCS-4 character. Either store directly, or as
3833 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00003834#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003835 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003836#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003837 x -= 0x10000L;
3838 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3839 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00003840#endif
3841 } else {
3842 endinpos = s-starts;
3843 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003844 if (unicode_decode_call_errorhandler(
3845 errors, &errorHandler,
3846 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00003847 &starts, &end, &startinpos, &endinpos, &exc, &s,
3848 &v, &outpos, &p))
3849 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003850 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003851 nextByte:
3852 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003853 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003854 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003855 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003856 Py_XDECREF(errorHandler);
3857 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003858 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003859
Benjamin Peterson29060642009-01-31 22:14:21 +00003860 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003861 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003862 Py_XDECREF(errorHandler);
3863 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003864 return NULL;
3865}
3866
3867PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003868 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003869{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003870 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003871 char *p;
3872 char *q;
3873
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003874#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003875 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003876#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003877 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003878#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00003879
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003880 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003881 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00003882
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003883 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003884 if (repr == NULL)
3885 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003886 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003887 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003888
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003889 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003890 while (size-- > 0) {
3891 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003892#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003893 /* Map 32-bit characters to '\Uxxxxxxxx' */
3894 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003895 *p++ = '\\';
3896 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003897 *p++ = hexdigits[(ch >> 28) & 0xf];
3898 *p++ = hexdigits[(ch >> 24) & 0xf];
3899 *p++ = hexdigits[(ch >> 20) & 0xf];
3900 *p++ = hexdigits[(ch >> 16) & 0xf];
3901 *p++ = hexdigits[(ch >> 12) & 0xf];
3902 *p++ = hexdigits[(ch >> 8) & 0xf];
3903 *p++ = hexdigits[(ch >> 4) & 0xf];
3904 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003905 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003906 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00003907#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003908 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3909 if (ch >= 0xD800 && ch < 0xDC00) {
3910 Py_UNICODE ch2;
3911 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003912
Benjamin Peterson29060642009-01-31 22:14:21 +00003913 ch2 = *s++;
3914 size--;
3915 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3916 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3917 *p++ = '\\';
3918 *p++ = 'U';
3919 *p++ = hexdigits[(ucs >> 28) & 0xf];
3920 *p++ = hexdigits[(ucs >> 24) & 0xf];
3921 *p++ = hexdigits[(ucs >> 20) & 0xf];
3922 *p++ = hexdigits[(ucs >> 16) & 0xf];
3923 *p++ = hexdigits[(ucs >> 12) & 0xf];
3924 *p++ = hexdigits[(ucs >> 8) & 0xf];
3925 *p++ = hexdigits[(ucs >> 4) & 0xf];
3926 *p++ = hexdigits[ucs & 0xf];
3927 continue;
3928 }
3929 /* Fall through: isolated surrogates are copied as-is */
3930 s--;
3931 size++;
3932 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003933#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003934 /* Map 16-bit characters to '\uxxxx' */
3935 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003936 *p++ = '\\';
3937 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003938 *p++ = hexdigits[(ch >> 12) & 0xf];
3939 *p++ = hexdigits[(ch >> 8) & 0xf];
3940 *p++ = hexdigits[(ch >> 4) & 0xf];
3941 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003942 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003943 /* Copy everything else as-is */
3944 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00003945 *p++ = (char) ch;
3946 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003947 size = p - q;
3948
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003949 assert(size > 0);
3950 if (_PyBytes_Resize(&repr, size) < 0)
3951 return NULL;
3952 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003953}
3954
3955PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3956{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003957 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003958 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00003959 PyErr_BadArgument();
3960 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003961 }
Walter Dörwald711005d2007-05-12 12:03:26 +00003962 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3963 PyUnicode_GET_SIZE(unicode));
3964
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003965 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003966}
3967
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003968/* --- Unicode Internal Codec ------------------------------------------- */
3969
3970PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003971 Py_ssize_t size,
3972 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003973{
3974 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003975 Py_ssize_t startinpos;
3976 Py_ssize_t endinpos;
3977 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003978 PyUnicodeObject *v;
3979 Py_UNICODE *p;
3980 const char *end;
3981 const char *reason;
3982 PyObject *errorHandler = NULL;
3983 PyObject *exc = NULL;
3984
Neal Norwitzd43069c2006-01-08 01:12:10 +00003985#ifdef Py_UNICODE_WIDE
3986 Py_UNICODE unimax = PyUnicode_GetMax();
3987#endif
3988
Thomas Wouters89f507f2006-12-13 04:49:30 +00003989 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003990 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3991 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003992 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003993 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003994 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003995 p = PyUnicode_AS_UNICODE(v);
3996 end = s + size;
3997
3998 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00003999 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004000 /* We have to sanity check the raw data, otherwise doom looms for
4001 some malformed UCS-4 data. */
4002 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00004003#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004004 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00004005#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004006 end-s < Py_UNICODE_SIZE
4007 )
Benjamin Peterson29060642009-01-31 22:14:21 +00004008 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004009 startinpos = s - starts;
4010 if (end-s < Py_UNICODE_SIZE) {
4011 endinpos = end-starts;
4012 reason = "truncated input";
4013 }
4014 else {
4015 endinpos = s - starts + Py_UNICODE_SIZE;
4016 reason = "illegal code point (> 0x10FFFF)";
4017 }
4018 outpos = p - PyUnicode_AS_UNICODE(v);
4019 if (unicode_decode_call_errorhandler(
4020 errors, &errorHandler,
4021 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00004022 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00004023 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004024 goto onError;
4025 }
4026 }
4027 else {
4028 p++;
4029 s += Py_UNICODE_SIZE;
4030 }
4031 }
4032
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004033 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004034 goto onError;
4035 Py_XDECREF(errorHandler);
4036 Py_XDECREF(exc);
4037 return (PyObject *)v;
4038
Benjamin Peterson29060642009-01-31 22:14:21 +00004039 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004040 Py_XDECREF(v);
4041 Py_XDECREF(errorHandler);
4042 Py_XDECREF(exc);
4043 return NULL;
4044}
4045
Guido van Rossumd57fd912000-03-10 22:53:23 +00004046/* --- Latin-1 Codec ------------------------------------------------------ */
4047
4048PyObject *PyUnicode_DecodeLatin1(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004049 Py_ssize_t size,
4050 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004051{
4052 PyUnicodeObject *v;
4053 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004054 const char *e, *unrolled_end;
Tim Petersced69f82003-09-16 20:30:58 +00004055
Guido van Rossumd57fd912000-03-10 22:53:23 +00004056 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004057 if (size == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004058 Py_UNICODE r = *(unsigned char*)s;
4059 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004060 }
4061
Guido van Rossumd57fd912000-03-10 22:53:23 +00004062 v = _PyUnicode_New(size);
4063 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004064 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004065 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004066 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004067 p = PyUnicode_AS_UNICODE(v);
Antoine Pitrouab868312009-01-10 15:40:25 +00004068 e = s + size;
4069 /* Unrolling the copy makes it much faster by reducing the looping
4070 overhead. This is similar to what many memcpy() implementations do. */
4071 unrolled_end = e - 4;
4072 while (s < unrolled_end) {
4073 p[0] = (unsigned char) s[0];
4074 p[1] = (unsigned char) s[1];
4075 p[2] = (unsigned char) s[2];
4076 p[3] = (unsigned char) s[3];
4077 s += 4;
4078 p += 4;
4079 }
4080 while (s < e)
4081 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004082 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004083
Benjamin Peterson29060642009-01-31 22:14:21 +00004084 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004085 Py_XDECREF(v);
4086 return NULL;
4087}
4088
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004089/* create or adjust a UnicodeEncodeError */
4090static void make_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004091 const char *encoding,
4092 const Py_UNICODE *unicode, Py_ssize_t size,
4093 Py_ssize_t startpos, Py_ssize_t endpos,
4094 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004095{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004096 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004097 *exceptionObject = PyUnicodeEncodeError_Create(
4098 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004099 }
4100 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004101 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
4102 goto onError;
4103 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
4104 goto onError;
4105 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
4106 goto onError;
4107 return;
4108 onError:
4109 Py_DECREF(*exceptionObject);
4110 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004111 }
4112}
4113
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004114/* raises a UnicodeEncodeError */
4115static void raise_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004116 const char *encoding,
4117 const Py_UNICODE *unicode, Py_ssize_t size,
4118 Py_ssize_t startpos, Py_ssize_t endpos,
4119 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004120{
4121 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004122 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004123 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004124 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004125}
4126
4127/* error handling callback helper:
4128 build arguments, call the callback and check the arguments,
4129 put the result into newpos and return the replacement string, which
4130 has to be freed by the caller */
4131static PyObject *unicode_encode_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00004132 PyObject **errorHandler,
4133 const char *encoding, const char *reason,
4134 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4135 Py_ssize_t startpos, Py_ssize_t endpos,
4136 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004137{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004138 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004139
4140 PyObject *restuple;
4141 PyObject *resunicode;
4142
4143 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004144 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004145 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004146 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004147 }
4148
4149 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004150 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004151 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004152 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004153
4154 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00004155 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004156 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004157 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004158 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004159 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004160 Py_DECREF(restuple);
4161 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004162 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004163 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00004164 &resunicode, newpos)) {
4165 Py_DECREF(restuple);
4166 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004167 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004168 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
4169 PyErr_SetString(PyExc_TypeError, &argparse[3]);
4170 Py_DECREF(restuple);
4171 return NULL;
4172 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004173 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004174 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004175 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004176 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4177 Py_DECREF(restuple);
4178 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004179 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004180 Py_INCREF(resunicode);
4181 Py_DECREF(restuple);
4182 return resunicode;
4183}
4184
4185static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004186 Py_ssize_t size,
4187 const char *errors,
4188 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004189{
4190 /* output object */
4191 PyObject *res;
4192 /* pointers to the beginning and end+1 of input */
4193 const Py_UNICODE *startp = p;
4194 const Py_UNICODE *endp = p + size;
4195 /* pointer to the beginning of the unencodable characters */
4196 /* const Py_UNICODE *badp = NULL; */
4197 /* pointer into the output */
4198 char *str;
4199 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004200 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004201 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
4202 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004203 PyObject *errorHandler = NULL;
4204 PyObject *exc = NULL;
4205 /* the following variable is used for caching string comparisons
4206 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4207 int known_errorHandler = -1;
4208
4209 /* allocate enough for a simple encoding without
4210 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004211 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00004212 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004213 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004214 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004215 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004216 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004217 ressize = size;
4218
4219 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004220 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004221
Benjamin Peterson29060642009-01-31 22:14:21 +00004222 /* can we encode this? */
4223 if (c<limit) {
4224 /* no overflow check, because we know that the space is enough */
4225 *str++ = (char)c;
4226 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004227 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004228 else {
4229 Py_ssize_t unicodepos = p-startp;
4230 Py_ssize_t requiredsize;
4231 PyObject *repunicode;
4232 Py_ssize_t repsize;
4233 Py_ssize_t newpos;
4234 Py_ssize_t respos;
4235 Py_UNICODE *uni2;
4236 /* startpos for collecting unencodable chars */
4237 const Py_UNICODE *collstart = p;
4238 const Py_UNICODE *collend = p;
4239 /* find all unecodable characters */
4240 while ((collend < endp) && ((*collend)>=limit))
4241 ++collend;
4242 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
4243 if (known_errorHandler==-1) {
4244 if ((errors==NULL) || (!strcmp(errors, "strict")))
4245 known_errorHandler = 1;
4246 else if (!strcmp(errors, "replace"))
4247 known_errorHandler = 2;
4248 else if (!strcmp(errors, "ignore"))
4249 known_errorHandler = 3;
4250 else if (!strcmp(errors, "xmlcharrefreplace"))
4251 known_errorHandler = 4;
4252 else
4253 known_errorHandler = 0;
4254 }
4255 switch (known_errorHandler) {
4256 case 1: /* strict */
4257 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
4258 goto onError;
4259 case 2: /* replace */
4260 while (collstart++<collend)
4261 *str++ = '?'; /* fall through */
4262 case 3: /* ignore */
4263 p = collend;
4264 break;
4265 case 4: /* xmlcharrefreplace */
4266 respos = str - PyBytes_AS_STRING(res);
4267 /* determine replacement size (temporarily (mis)uses p) */
4268 for (p = collstart, repsize = 0; p < collend; ++p) {
4269 if (*p<10)
4270 repsize += 2+1+1;
4271 else if (*p<100)
4272 repsize += 2+2+1;
4273 else if (*p<1000)
4274 repsize += 2+3+1;
4275 else if (*p<10000)
4276 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004277#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004278 else
4279 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004280#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004281 else if (*p<100000)
4282 repsize += 2+5+1;
4283 else if (*p<1000000)
4284 repsize += 2+6+1;
4285 else
4286 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004287#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004288 }
4289 requiredsize = respos+repsize+(endp-collend);
4290 if (requiredsize > ressize) {
4291 if (requiredsize<2*ressize)
4292 requiredsize = 2*ressize;
4293 if (_PyBytes_Resize(&res, requiredsize))
4294 goto onError;
4295 str = PyBytes_AS_STRING(res) + respos;
4296 ressize = requiredsize;
4297 }
4298 /* generate replacement (temporarily (mis)uses p) */
4299 for (p = collstart; p < collend; ++p) {
4300 str += sprintf(str, "&#%d;", (int)*p);
4301 }
4302 p = collend;
4303 break;
4304 default:
4305 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4306 encoding, reason, startp, size, &exc,
4307 collstart-startp, collend-startp, &newpos);
4308 if (repunicode == NULL)
4309 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004310 if (PyBytes_Check(repunicode)) {
4311 /* Directly copy bytes result to output. */
4312 repsize = PyBytes_Size(repunicode);
4313 if (repsize > 1) {
4314 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004315 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004316 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
4317 Py_DECREF(repunicode);
4318 goto onError;
4319 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004320 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004321 ressize += repsize-1;
4322 }
4323 memcpy(str, PyBytes_AsString(repunicode), repsize);
4324 str += repsize;
4325 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004326 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004327 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004328 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004329 /* need more space? (at least enough for what we
4330 have+the replacement+the rest of the string, so
4331 we won't have to check space for encodable characters) */
4332 respos = str - PyBytes_AS_STRING(res);
4333 repsize = PyUnicode_GET_SIZE(repunicode);
4334 requiredsize = respos+repsize+(endp-collend);
4335 if (requiredsize > ressize) {
4336 if (requiredsize<2*ressize)
4337 requiredsize = 2*ressize;
4338 if (_PyBytes_Resize(&res, requiredsize)) {
4339 Py_DECREF(repunicode);
4340 goto onError;
4341 }
4342 str = PyBytes_AS_STRING(res) + respos;
4343 ressize = requiredsize;
4344 }
4345 /* check if there is anything unencodable in the replacement
4346 and copy it to the output */
4347 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
4348 c = *uni2;
4349 if (c >= limit) {
4350 raise_encode_exception(&exc, encoding, startp, size,
4351 unicodepos, unicodepos+1, reason);
4352 Py_DECREF(repunicode);
4353 goto onError;
4354 }
4355 *str = (char)c;
4356 }
4357 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004358 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004359 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004360 }
4361 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004362 /* Resize if we allocated to much */
4363 size = str - PyBytes_AS_STRING(res);
4364 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00004365 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004366 if (_PyBytes_Resize(&res, size) < 0)
4367 goto onError;
4368 }
4369
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004370 Py_XDECREF(errorHandler);
4371 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004372 return res;
4373
4374 onError:
4375 Py_XDECREF(res);
4376 Py_XDECREF(errorHandler);
4377 Py_XDECREF(exc);
4378 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004379}
4380
Guido van Rossumd57fd912000-03-10 22:53:23 +00004381PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004382 Py_ssize_t size,
4383 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004384{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004385 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004386}
4387
4388PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
4389{
4390 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004391 PyErr_BadArgument();
4392 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004393 }
4394 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004395 PyUnicode_GET_SIZE(unicode),
4396 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004397}
4398
/* --- 7-bit ASCII Codec -------------------------------------------------- */
4400
Guido van Rossumd57fd912000-03-10 22:53:23 +00004401PyObject *PyUnicode_DecodeASCII(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004402 Py_ssize_t size,
4403 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004404{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004405 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004406 PyUnicodeObject *v;
4407 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004408 Py_ssize_t startinpos;
4409 Py_ssize_t endinpos;
4410 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004411 const char *e;
4412 PyObject *errorHandler = NULL;
4413 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004414
Guido van Rossumd57fd912000-03-10 22:53:23 +00004415 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004416 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004417 Py_UNICODE r = *(unsigned char*)s;
4418 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004419 }
Tim Petersced69f82003-09-16 20:30:58 +00004420
Guido van Rossumd57fd912000-03-10 22:53:23 +00004421 v = _PyUnicode_New(size);
4422 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004423 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004424 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004425 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004426 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004427 e = s + size;
4428 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004429 register unsigned char c = (unsigned char)*s;
4430 if (c < 128) {
4431 *p++ = c;
4432 ++s;
4433 }
4434 else {
4435 startinpos = s-starts;
4436 endinpos = startinpos + 1;
4437 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
4438 if (unicode_decode_call_errorhandler(
4439 errors, &errorHandler,
4440 "ascii", "ordinal not in range(128)",
4441 &starts, &e, &startinpos, &endinpos, &exc, &s,
4442 &v, &outpos, &p))
4443 goto onError;
4444 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004445 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00004446 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004447 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4448 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004449 Py_XDECREF(errorHandler);
4450 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004451 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004452
Benjamin Peterson29060642009-01-31 22:14:21 +00004453 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004454 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004455 Py_XDECREF(errorHandler);
4456 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004457 return NULL;
4458}
4459
Guido van Rossumd57fd912000-03-10 22:53:23 +00004460PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004461 Py_ssize_t size,
4462 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004463{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004464 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004465}
4466
4467PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
4468{
4469 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004470 PyErr_BadArgument();
4471 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004472 }
4473 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004474 PyUnicode_GET_SIZE(unicode),
4475 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004476}
4477
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004478#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004479
/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004481
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00004482#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004483#define NEED_RETRY
4484#endif
4485
/* XXX This code is limited to "true" double-byte encodings, as
   a) it assumes an incomplete character consists of a single byte, and
   b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
      encodings; see the IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004490
/* Return 1 if the byte at s[offset] should be treated as a DBCS lead byte,
   0 otherwise.

   The byte must be flagged by IsDBCSLeadByte().  It is then accepted when
   any of these holds for the character found by CharPrev(s, s + offset):
     - CharPrev() returned the same position (no previous character),
     - the previous character does not start with a lead byte, or
     - the previous character is exactly two bytes long. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *here = s + offset;
    const char *before;

    if (!IsDBCSLeadByte(*here))
        return 0;

    before = CharPrev(s, here);
    if (before == here)
        return 1;
    if (!IsDBCSLeadByte(*before))
        return 1;
    return (here - before == 2);
}
4501
4502/*
4503 * Decode MBCS string into unicode object. If 'final' is set, converts
4504 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
4505 */
4506static int decode_mbcs(PyUnicodeObject **v,
Benjamin Peterson29060642009-01-31 22:14:21 +00004507 const char *s, /* MBCS string */
4508 int size, /* sizeof MBCS string */
4509 int final)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004510{
4511 Py_UNICODE *p;
4512 Py_ssize_t n = 0;
4513 int usize = 0;
4514
4515 assert(size >= 0);
4516
4517 /* Skip trailing lead-byte unless 'final' is set */
4518 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00004519 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004520
4521 /* First get the size of the result */
4522 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004523 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
4524 if (usize == 0) {
4525 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4526 return -1;
4527 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004528 }
4529
4530 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004531 /* Create unicode object */
4532 *v = _PyUnicode_New(usize);
4533 if (*v == NULL)
4534 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004535 }
4536 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004537 /* Extend unicode object */
4538 n = PyUnicode_GET_SIZE(*v);
4539 if (_PyUnicode_Resize(v, n + usize) < 0)
4540 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004541 }
4542
4543 /* Do the conversion */
4544 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004545 p = PyUnicode_AS_UNICODE(*v) + n;
4546 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
4547 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4548 return -1;
4549 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004550 }
4551
4552 return size;
4553}
4554
4555PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004556 Py_ssize_t size,
4557 const char *errors,
4558 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004559{
4560 PyUnicodeObject *v = NULL;
4561 int done;
4562
4563 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004564 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004565
4566#ifdef NEED_RETRY
4567 retry:
4568 if (size > INT_MAX)
Benjamin Peterson29060642009-01-31 22:14:21 +00004569 done = decode_mbcs(&v, s, INT_MAX, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004570 else
4571#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004572 done = decode_mbcs(&v, s, (int)size, !consumed);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004573
4574 if (done < 0) {
4575 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00004576 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004577 }
4578
4579 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004580 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004581
4582#ifdef NEED_RETRY
4583 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004584 s += done;
4585 size -= done;
4586 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004587 }
4588#endif
4589
4590 return (PyObject *)v;
4591}
4592
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004593PyObject *PyUnicode_DecodeMBCS(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004594 Py_ssize_t size,
4595 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004596{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004597 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
4598}
4599
4600/*
4601 * Convert unicode into string object (MBCS).
4602 * Returns 0 if succeed, -1 otherwise.
4603 */
4604static int encode_mbcs(PyObject **repr,
Benjamin Peterson29060642009-01-31 22:14:21 +00004605 const Py_UNICODE *p, /* unicode */
4606 int size) /* size of unicode */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004607{
4608 int mbcssize = 0;
4609 Py_ssize_t n = 0;
4610
4611 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004612
4613 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004614 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004615 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
4616 if (mbcssize == 0) {
4617 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4618 return -1;
4619 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004620 }
4621
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004622 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004623 /* Create string object */
4624 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
4625 if (*repr == NULL)
4626 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004627 }
4628 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004629 /* Extend string object */
4630 n = PyBytes_Size(*repr);
4631 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
4632 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004633 }
4634
4635 /* Do the conversion */
4636 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004637 char *s = PyBytes_AS_STRING(*repr) + n;
4638 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
4639 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4640 return -1;
4641 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004642 }
4643
4644 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004645}
4646
4647PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004648 Py_ssize_t size,
4649 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004650{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004651 PyObject *repr = NULL;
4652 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00004653
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004654#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00004655 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004656 if (size > INT_MAX)
Benjamin Peterson29060642009-01-31 22:14:21 +00004657 ret = encode_mbcs(&repr, p, INT_MAX);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004658 else
4659#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004660 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004661
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004662 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004663 Py_XDECREF(repr);
4664 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004665 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004666
4667#ifdef NEED_RETRY
4668 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004669 p += INT_MAX;
4670 size -= INT_MAX;
4671 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004672 }
4673#endif
4674
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004675 return repr;
4676}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004677
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004678PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4679{
4680 if (!PyUnicode_Check(unicode)) {
4681 PyErr_BadArgument();
4682 return NULL;
4683 }
4684 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004685 PyUnicode_GET_SIZE(unicode),
4686 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004687}
4688
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004689#undef NEED_RETRY
4690
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004691#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004692
/* --- Character Mapping Codec -------------------------------------------- */
4694
Guido van Rossumd57fd912000-03-10 22:53:23 +00004695PyObject *PyUnicode_DecodeCharmap(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004696 Py_ssize_t size,
4697 PyObject *mapping,
4698 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004699{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004700 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004701 Py_ssize_t startinpos;
4702 Py_ssize_t endinpos;
4703 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004704 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004705 PyUnicodeObject *v;
4706 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004707 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004708 PyObject *errorHandler = NULL;
4709 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004710 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004711 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004712
Guido van Rossumd57fd912000-03-10 22:53:23 +00004713 /* Default to Latin-1 */
4714 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004715 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004716
4717 v = _PyUnicode_New(size);
4718 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004719 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004720 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004721 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004722 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004723 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004724 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004725 mapstring = PyUnicode_AS_UNICODE(mapping);
4726 maplen = PyUnicode_GET_SIZE(mapping);
4727 while (s < e) {
4728 unsigned char ch = *s;
4729 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004730
Benjamin Peterson29060642009-01-31 22:14:21 +00004731 if (ch < maplen)
4732 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004733
Benjamin Peterson29060642009-01-31 22:14:21 +00004734 if (x == 0xfffe) {
4735 /* undefined mapping */
4736 outpos = p-PyUnicode_AS_UNICODE(v);
4737 startinpos = s-starts;
4738 endinpos = startinpos+1;
4739 if (unicode_decode_call_errorhandler(
4740 errors, &errorHandler,
4741 "charmap", "character maps to <undefined>",
4742 &starts, &e, &startinpos, &endinpos, &exc, &s,
4743 &v, &outpos, &p)) {
4744 goto onError;
4745 }
4746 continue;
4747 }
4748 *p++ = x;
4749 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004750 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004751 }
4752 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004753 while (s < e) {
4754 unsigned char ch = *s;
4755 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004756
Benjamin Peterson29060642009-01-31 22:14:21 +00004757 /* Get mapping (char ordinal -> integer, Unicode char or None) */
4758 w = PyLong_FromLong((long)ch);
4759 if (w == NULL)
4760 goto onError;
4761 x = PyObject_GetItem(mapping, w);
4762 Py_DECREF(w);
4763 if (x == NULL) {
4764 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4765 /* No mapping found means: mapping is undefined. */
4766 PyErr_Clear();
4767 x = Py_None;
4768 Py_INCREF(x);
4769 } else
4770 goto onError;
4771 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004772
Benjamin Peterson29060642009-01-31 22:14:21 +00004773 /* Apply mapping */
4774 if (PyLong_Check(x)) {
4775 long value = PyLong_AS_LONG(x);
4776 if (value < 0 || value > 65535) {
4777 PyErr_SetString(PyExc_TypeError,
4778 "character mapping must be in range(65536)");
4779 Py_DECREF(x);
4780 goto onError;
4781 }
4782 *p++ = (Py_UNICODE)value;
4783 }
4784 else if (x == Py_None) {
4785 /* undefined mapping */
4786 outpos = p-PyUnicode_AS_UNICODE(v);
4787 startinpos = s-starts;
4788 endinpos = startinpos+1;
4789 if (unicode_decode_call_errorhandler(
4790 errors, &errorHandler,
4791 "charmap", "character maps to <undefined>",
4792 &starts, &e, &startinpos, &endinpos, &exc, &s,
4793 &v, &outpos, &p)) {
4794 Py_DECREF(x);
4795 goto onError;
4796 }
4797 Py_DECREF(x);
4798 continue;
4799 }
4800 else if (PyUnicode_Check(x)) {
4801 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004802
Benjamin Peterson29060642009-01-31 22:14:21 +00004803 if (targetsize == 1)
4804 /* 1-1 mapping */
4805 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004806
Benjamin Peterson29060642009-01-31 22:14:21 +00004807 else if (targetsize > 1) {
4808 /* 1-n mapping */
4809 if (targetsize > extrachars) {
4810 /* resize first */
4811 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4812 Py_ssize_t needed = (targetsize - extrachars) + \
4813 (targetsize << 2);
4814 extrachars += needed;
4815 /* XXX overflow detection missing */
4816 if (_PyUnicode_Resize(&v,
4817 PyUnicode_GET_SIZE(v) + needed) < 0) {
4818 Py_DECREF(x);
4819 goto onError;
4820 }
4821 p = PyUnicode_AS_UNICODE(v) + oldpos;
4822 }
4823 Py_UNICODE_COPY(p,
4824 PyUnicode_AS_UNICODE(x),
4825 targetsize);
4826 p += targetsize;
4827 extrachars -= targetsize;
4828 }
4829 /* 1-0 mapping: skip the character */
4830 }
4831 else {
4832 /* wrong return value */
4833 PyErr_SetString(PyExc_TypeError,
4834 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00004835 Py_DECREF(x);
4836 goto onError;
4837 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004838 Py_DECREF(x);
4839 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004840 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004841 }
4842 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004843 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4844 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004845 Py_XDECREF(errorHandler);
4846 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004847 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004848
Benjamin Peterson29060642009-01-31 22:14:21 +00004849 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004850 Py_XDECREF(errorHandler);
4851 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004852 Py_XDECREF(v);
4853 return NULL;
4854}
4855
/* Charmap encoding: the lookup table */
4857
4858struct encoding_map{
Benjamin Peterson29060642009-01-31 22:14:21 +00004859 PyObject_HEAD
4860 unsigned char level1[32];
4861 int count2, count3;
4862 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004863};
4864
4865static PyObject*
4866encoding_map_size(PyObject *obj, PyObject* args)
4867{
4868 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004869 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00004870 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004871}
4872
4873static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00004874 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00004875 PyDoc_STR("Return the size (in bytes) of this object") },
4876 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004877};
4878
4879static void
4880encoding_map_dealloc(PyObject* o)
4881{
Benjamin Peterson14339b62009-01-31 16:36:08 +00004882 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004883}
4884
4885static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00004886 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004887 "EncodingMap", /*tp_name*/
4888 sizeof(struct encoding_map), /*tp_basicsize*/
4889 0, /*tp_itemsize*/
4890 /* methods */
4891 encoding_map_dealloc, /*tp_dealloc*/
4892 0, /*tp_print*/
4893 0, /*tp_getattr*/
4894 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00004895 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00004896 0, /*tp_repr*/
4897 0, /*tp_as_number*/
4898 0, /*tp_as_sequence*/
4899 0, /*tp_as_mapping*/
4900 0, /*tp_hash*/
4901 0, /*tp_call*/
4902 0, /*tp_str*/
4903 0, /*tp_getattro*/
4904 0, /*tp_setattro*/
4905 0, /*tp_as_buffer*/
4906 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4907 0, /*tp_doc*/
4908 0, /*tp_traverse*/
4909 0, /*tp_clear*/
4910 0, /*tp_richcompare*/
4911 0, /*tp_weaklistoffset*/
4912 0, /*tp_iter*/
4913 0, /*tp_iternext*/
4914 encoding_map_methods, /*tp_methods*/
4915 0, /*tp_members*/
4916 0, /*tp_getset*/
4917 0, /*tp_base*/
4918 0, /*tp_dict*/
4919 0, /*tp_descr_get*/
4920 0, /*tp_descr_set*/
4921 0, /*tp_dictoffset*/
4922 0, /*tp_init*/
4923 0, /*tp_alloc*/
4924 0, /*tp_new*/
4925 0, /*tp_free*/
4926 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004927};
4928
4929PyObject*
4930PyUnicode_BuildEncodingMap(PyObject* string)
4931{
4932 Py_UNICODE *decode;
4933 PyObject *result;
4934 struct encoding_map *mresult;
4935 int i;
4936 int need_dict = 0;
4937 unsigned char level1[32];
4938 unsigned char level2[512];
4939 unsigned char *mlevel1, *mlevel2, *mlevel3;
4940 int count2 = 0, count3 = 0;
4941
4942 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4943 PyErr_BadArgument();
4944 return NULL;
4945 }
4946 decode = PyUnicode_AS_UNICODE(string);
4947 memset(level1, 0xFF, sizeof level1);
4948 memset(level2, 0xFF, sizeof level2);
4949
4950 /* If there isn't a one-to-one mapping of NULL to \0,
4951 or if there are non-BMP characters, we need to use
4952 a mapping dictionary. */
4953 if (decode[0] != 0)
4954 need_dict = 1;
4955 for (i = 1; i < 256; i++) {
4956 int l1, l2;
4957 if (decode[i] == 0
Benjamin Peterson29060642009-01-31 22:14:21 +00004958#ifdef Py_UNICODE_WIDE
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004959 || decode[i] > 0xFFFF
Benjamin Peterson29060642009-01-31 22:14:21 +00004960#endif
4961 ) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004962 need_dict = 1;
4963 break;
4964 }
4965 if (decode[i] == 0xFFFE)
4966 /* unmapped character */
4967 continue;
4968 l1 = decode[i] >> 11;
4969 l2 = decode[i] >> 7;
4970 if (level1[l1] == 0xFF)
4971 level1[l1] = count2++;
4972 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00004973 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004974 }
4975
4976 if (count2 >= 0xFF || count3 >= 0xFF)
4977 need_dict = 1;
4978
4979 if (need_dict) {
4980 PyObject *result = PyDict_New();
4981 PyObject *key, *value;
4982 if (!result)
4983 return NULL;
4984 for (i = 0; i < 256; i++) {
4985 key = value = NULL;
Christian Heimes217cfd12007-12-02 14:31:20 +00004986 key = PyLong_FromLong(decode[i]);
4987 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004988 if (!key || !value)
4989 goto failed1;
4990 if (PyDict_SetItem(result, key, value) == -1)
4991 goto failed1;
4992 Py_DECREF(key);
4993 Py_DECREF(value);
4994 }
4995 return result;
4996 failed1:
4997 Py_XDECREF(key);
4998 Py_XDECREF(value);
4999 Py_DECREF(result);
5000 return NULL;
5001 }
5002
5003 /* Create a three-level trie */
5004 result = PyObject_MALLOC(sizeof(struct encoding_map) +
5005 16*count2 + 128*count3 - 1);
5006 if (!result)
5007 return PyErr_NoMemory();
5008 PyObject_Init(result, &EncodingMapType);
5009 mresult = (struct encoding_map*)result;
5010 mresult->count2 = count2;
5011 mresult->count3 = count3;
5012 mlevel1 = mresult->level1;
5013 mlevel2 = mresult->level23;
5014 mlevel3 = mresult->level23 + 16*count2;
5015 memcpy(mlevel1, level1, 32);
5016 memset(mlevel2, 0xFF, 16*count2);
5017 memset(mlevel3, 0, 128*count3);
5018 count3 = 0;
5019 for (i = 1; i < 256; i++) {
5020 int o1, o2, o3, i2, i3;
5021 if (decode[i] == 0xFFFE)
5022 /* unmapped character */
5023 continue;
5024 o1 = decode[i]>>11;
5025 o2 = (decode[i]>>7) & 0xF;
5026 i2 = 16*mlevel1[o1] + o2;
5027 if (mlevel2[i2] == 0xFF)
5028 mlevel2[i2] = count3++;
5029 o3 = decode[i] & 0x7F;
5030 i3 = 128*mlevel2[i2] + o3;
5031 mlevel3[i3] = i;
5032 }
5033 return result;
5034}
5035
5036static int
5037encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
5038{
5039 struct encoding_map *map = (struct encoding_map*)mapping;
5040 int l1 = c>>11;
5041 int l2 = (c>>7) & 0xF;
5042 int l3 = c & 0x7F;
5043 int i;
5044
5045#ifdef Py_UNICODE_WIDE
5046 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005047 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005048 }
5049#endif
5050 if (c == 0)
5051 return 0;
5052 /* level 1*/
5053 i = map->level1[l1];
5054 if (i == 0xFF) {
5055 return -1;
5056 }
5057 /* level 2*/
5058 i = map->level23[16*i+l2];
5059 if (i == 0xFF) {
5060 return -1;
5061 }
5062 /* level 3 */
5063 i = map->level23[16*map->count2 + 128*i + l3];
5064 if (i == 0) {
5065 return -1;
5066 }
5067 return i;
5068}
5069
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005070/* Lookup the character ch in the mapping. If the character
5071 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00005072 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005073static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005074{
Christian Heimes217cfd12007-12-02 14:31:20 +00005075 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005076 PyObject *x;
5077
5078 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005079 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005080 x = PyObject_GetItem(mapping, w);
5081 Py_DECREF(w);
5082 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005083 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5084 /* No mapping found means: mapping is undefined. */
5085 PyErr_Clear();
5086 x = Py_None;
5087 Py_INCREF(x);
5088 return x;
5089 } else
5090 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005091 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00005092 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005093 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00005094 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005095 long value = PyLong_AS_LONG(x);
5096 if (value < 0 || value > 255) {
5097 PyErr_SetString(PyExc_TypeError,
5098 "character mapping must be in range(256)");
5099 Py_DECREF(x);
5100 return NULL;
5101 }
5102 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005103 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005104 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00005105 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005106 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005107 /* wrong return value */
5108 PyErr_Format(PyExc_TypeError,
5109 "character mapping must return integer, bytes or None, not %.400s",
5110 x->ob_type->tp_name);
5111 Py_DECREF(x);
5112 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005113 }
5114}
5115
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005116static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00005117charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005118{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005119 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
5120 /* exponentially overallocate to minimize reallocations */
5121 if (requiredsize < 2*outsize)
5122 requiredsize = 2*outsize;
5123 if (_PyBytes_Resize(outobj, requiredsize))
5124 return -1;
5125 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005126}
5127
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character's encoding was written to the output */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* a lookup or resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005131/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00005132 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005133 space is available. Return a new reference to the object that
5134 was put in the output buffer, or Py_None, if the mapping was undefined
5135 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00005136 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005137static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005138charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00005139 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005140{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005141 PyObject *rep;
5142 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00005143 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005144
Christian Heimes90aa7642007-12-19 02:45:37 +00005145 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005146 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00005147 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005148 if (res == -1)
5149 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00005150 if (outsize<requiredsize)
5151 if (charmapencode_resize(outobj, outpos, requiredsize))
5152 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00005153 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005154 outstart[(*outpos)++] = (char)res;
5155 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005156 }
5157
5158 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005159 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005160 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005161 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005162 Py_DECREF(rep);
5163 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005164 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005165 if (PyLong_Check(rep)) {
5166 Py_ssize_t requiredsize = *outpos+1;
5167 if (outsize<requiredsize)
5168 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5169 Py_DECREF(rep);
5170 return enc_EXCEPTION;
5171 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005172 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005173 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005174 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005175 else {
5176 const char *repchars = PyBytes_AS_STRING(rep);
5177 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
5178 Py_ssize_t requiredsize = *outpos+repsize;
5179 if (outsize<requiredsize)
5180 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5181 Py_DECREF(rep);
5182 return enc_EXCEPTION;
5183 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005184 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005185 memcpy(outstart + *outpos, repchars, repsize);
5186 *outpos += repsize;
5187 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005188 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005189 Py_DECREF(rep);
5190 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005191}
5192
5193/* handle an error in PyUnicode_EncodeCharmap
5194 Return 0 on success, -1 on error */
5195static
5196int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00005197 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005198 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00005199 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00005200 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005201{
5202 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005203 Py_ssize_t repsize;
5204 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005205 Py_UNICODE *uni2;
5206 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005207 Py_ssize_t collstartpos = *inpos;
5208 Py_ssize_t collendpos = *inpos+1;
5209 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005210 char *encoding = "charmap";
5211 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005212 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005213
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005214 /* find all unencodable characters */
5215 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005216 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00005217 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005218 int res = encoding_map_lookup(p[collendpos], mapping);
5219 if (res != -1)
5220 break;
5221 ++collendpos;
5222 continue;
5223 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005224
Benjamin Peterson29060642009-01-31 22:14:21 +00005225 rep = charmapencode_lookup(p[collendpos], mapping);
5226 if (rep==NULL)
5227 return -1;
5228 else if (rep!=Py_None) {
5229 Py_DECREF(rep);
5230 break;
5231 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005232 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00005233 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005234 }
5235 /* cache callback name lookup
5236 * (if not done yet, i.e. it's the first error) */
5237 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005238 if ((errors==NULL) || (!strcmp(errors, "strict")))
5239 *known_errorHandler = 1;
5240 else if (!strcmp(errors, "replace"))
5241 *known_errorHandler = 2;
5242 else if (!strcmp(errors, "ignore"))
5243 *known_errorHandler = 3;
5244 else if (!strcmp(errors, "xmlcharrefreplace"))
5245 *known_errorHandler = 4;
5246 else
5247 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005248 }
5249 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005250 case 1: /* strict */
5251 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5252 return -1;
5253 case 2: /* replace */
5254 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005255 x = charmapencode_output('?', mapping, res, respos);
5256 if (x==enc_EXCEPTION) {
5257 return -1;
5258 }
5259 else if (x==enc_FAILED) {
5260 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5261 return -1;
5262 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005263 }
5264 /* fall through */
5265 case 3: /* ignore */
5266 *inpos = collendpos;
5267 break;
5268 case 4: /* xmlcharrefreplace */
5269 /* generate replacement (temporarily (mis)uses p) */
5270 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005271 char buffer[2+29+1+1];
5272 char *cp;
5273 sprintf(buffer, "&#%d;", (int)p[collpos]);
5274 for (cp = buffer; *cp; ++cp) {
5275 x = charmapencode_output(*cp, mapping, res, respos);
5276 if (x==enc_EXCEPTION)
5277 return -1;
5278 else if (x==enc_FAILED) {
5279 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5280 return -1;
5281 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005282 }
5283 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005284 *inpos = collendpos;
5285 break;
5286 default:
5287 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00005288 encoding, reason, p, size, exceptionObject,
5289 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005290 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005291 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005292 if (PyBytes_Check(repunicode)) {
5293 /* Directly copy bytes result to output. */
5294 Py_ssize_t outsize = PyBytes_Size(*res);
5295 Py_ssize_t requiredsize;
5296 repsize = PyBytes_Size(repunicode);
5297 requiredsize = *respos + repsize;
5298 if (requiredsize > outsize)
5299 /* Make room for all additional bytes. */
5300 if (charmapencode_resize(res, respos, requiredsize)) {
5301 Py_DECREF(repunicode);
5302 return -1;
5303 }
5304 memcpy(PyBytes_AsString(*res) + *respos,
5305 PyBytes_AsString(repunicode), repsize);
5306 *respos += repsize;
5307 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005308 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005309 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005310 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005311 /* generate replacement */
5312 repsize = PyUnicode_GET_SIZE(repunicode);
5313 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005314 x = charmapencode_output(*uni2, mapping, res, respos);
5315 if (x==enc_EXCEPTION) {
5316 return -1;
5317 }
5318 else if (x==enc_FAILED) {
5319 Py_DECREF(repunicode);
5320 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5321 return -1;
5322 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005323 }
5324 *inpos = newpos;
5325 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005326 }
5327 return 0;
5328}
5329
Guido van Rossumd57fd912000-03-10 22:53:23 +00005330PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005331 Py_ssize_t size,
5332 PyObject *mapping,
5333 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005334{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005335 /* output object */
5336 PyObject *res = NULL;
5337 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005338 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005339 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005340 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005341 PyObject *errorHandler = NULL;
5342 PyObject *exc = NULL;
5343 /* the following variable is used for caching string comparisons
5344 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5345 * 3=ignore, 4=xmlcharrefreplace */
5346 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005347
5348 /* Default to Latin-1 */
5349 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005350 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005351
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005352 /* allocate enough for a simple encoding without
5353 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00005354 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005355 if (res == NULL)
5356 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005357 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005358 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005359
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005360 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005361 /* try to encode it */
5362 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
5363 if (x==enc_EXCEPTION) /* error */
5364 goto onError;
5365 if (x==enc_FAILED) { /* unencodable character */
5366 if (charmap_encoding_error(p, size, &inpos, mapping,
5367 &exc,
5368 &known_errorHandler, &errorHandler, errors,
5369 &res, &respos)) {
5370 goto onError;
5371 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005372 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005373 else
5374 /* done with this character => adjust input position */
5375 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005376 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005377
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005378 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00005379 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005380 if (_PyBytes_Resize(&res, respos) < 0)
5381 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005382
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005383 Py_XDECREF(exc);
5384 Py_XDECREF(errorHandler);
5385 return res;
5386
Benjamin Peterson29060642009-01-31 22:14:21 +00005387 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005388 Py_XDECREF(res);
5389 Py_XDECREF(exc);
5390 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005391 return NULL;
5392}
5393
5394PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00005395 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005396{
5397 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005398 PyErr_BadArgument();
5399 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005400 }
5401 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005402 PyUnicode_GET_SIZE(unicode),
5403 mapping,
5404 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005405}
5406
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005407/* create or adjust a UnicodeTranslateError */
5408static void make_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005409 const Py_UNICODE *unicode, Py_ssize_t size,
5410 Py_ssize_t startpos, Py_ssize_t endpos,
5411 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005412{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005413 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005414 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00005415 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005416 }
5417 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005418 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
5419 goto onError;
5420 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
5421 goto onError;
5422 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
5423 goto onError;
5424 return;
5425 onError:
5426 Py_DECREF(*exceptionObject);
5427 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005428 }
5429}
5430
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005431/* raises a UnicodeTranslateError */
5432static void raise_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005433 const Py_UNICODE *unicode, Py_ssize_t size,
5434 Py_ssize_t startpos, Py_ssize_t endpos,
5435 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005436{
5437 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005438 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005439 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005440 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005441}
5442
5443/* error handling callback helper:
5444 build arguments, call the callback and check the arguments,
5445 put the result into newpos and return the replacement string, which
5446 has to be freed by the caller */
5447static PyObject *unicode_translate_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00005448 PyObject **errorHandler,
5449 const char *reason,
5450 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5451 Py_ssize_t startpos, Py_ssize_t endpos,
5452 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005453{
Benjamin Peterson142957c2008-07-04 19:55:29 +00005454 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005455
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005456 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005457 PyObject *restuple;
5458 PyObject *resunicode;
5459
5460 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005461 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005462 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005463 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005464 }
5465
5466 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005467 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005468 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005469 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005470
5471 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00005472 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005473 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005474 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005475 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00005476 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00005477 Py_DECREF(restuple);
5478 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005479 }
5480 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00005481 &resunicode, &i_newpos)) {
5482 Py_DECREF(restuple);
5483 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005484 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00005485 if (i_newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005486 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005487 else
5488 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005489 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005490 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
5491 Py_DECREF(restuple);
5492 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005493 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005494 Py_INCREF(resunicode);
5495 Py_DECREF(restuple);
5496 return resunicode;
5497}
5498
5499/* Lookup the character ch in the mapping and put the result in result,
5500 which must be decrefed by the caller.
5501 Return 0 on success, -1 on error */
5502static
5503int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
5504{
Christian Heimes217cfd12007-12-02 14:31:20 +00005505 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005506 PyObject *x;
5507
5508 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005509 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005510 x = PyObject_GetItem(mapping, w);
5511 Py_DECREF(w);
5512 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005513 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5514 /* No mapping found means: use 1:1 mapping. */
5515 PyErr_Clear();
5516 *result = NULL;
5517 return 0;
5518 } else
5519 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005520 }
5521 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005522 *result = x;
5523 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005524 }
Christian Heimes217cfd12007-12-02 14:31:20 +00005525 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005526 long value = PyLong_AS_LONG(x);
5527 long max = PyUnicode_GetMax();
5528 if (value < 0 || value > max) {
5529 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00005530 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00005531 Py_DECREF(x);
5532 return -1;
5533 }
5534 *result = x;
5535 return 0;
5536 }
5537 else if (PyUnicode_Check(x)) {
5538 *result = x;
5539 return 0;
5540 }
5541 else {
5542 /* wrong return value */
5543 PyErr_SetString(PyExc_TypeError,
5544 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00005545 Py_DECREF(x);
5546 return -1;
5547 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005548}
5549/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00005550 if not reallocate and adjust various state variables.
5551 Return 0 on success, -1 on error */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005552static
Walter Dörwald4894c302003-10-24 14:25:28 +00005553int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Peterson29060642009-01-31 22:14:21 +00005554 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005555{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005556 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00005557 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005558 /* remember old output position */
5559 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
5560 /* exponentially overallocate to minimize reallocations */
5561 if (requiredsize < 2 * oldsize)
5562 requiredsize = 2 * oldsize;
5563 if (PyUnicode_Resize(outobj, requiredsize) < 0)
5564 return -1;
5565 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005566 }
5567 return 0;
5568}
5569/* lookup the character, put the result in the output string and adjust
5570 various state variables. Return a new reference to the object that
5571 was put in the output buffer in *result, or Py_None, if the mapping was
5572 undefined (in which case no character was written).
5573 The called must decref result.
5574 Return 0 on success, -1 on error. */
5575static
Walter Dörwald4894c302003-10-24 14:25:28 +00005576int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Benjamin Peterson29060642009-01-31 22:14:21 +00005577 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
5578 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005579{
Walter Dörwald4894c302003-10-24 14:25:28 +00005580 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00005581 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005582 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005583 /* not found => default to 1:1 mapping */
5584 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005585 }
5586 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005587 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00005588 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005589 /* no overflow check, because we know that the space is enough */
5590 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005591 }
5592 else if (PyUnicode_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005593 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
5594 if (repsize==1) {
5595 /* no overflow check, because we know that the space is enough */
5596 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
5597 }
5598 else if (repsize!=0) {
5599 /* more than one character */
5600 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
5601 (insize - (curinp-startinp)) +
5602 repsize - 1;
5603 if (charmaptranslate_makespace(outobj, outp, requiredsize))
5604 return -1;
5605 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
5606 *outp += repsize;
5607 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005608 }
5609 else
Benjamin Peterson29060642009-01-31 22:14:21 +00005610 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005611 return 0;
5612}
5613
5614PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005615 Py_ssize_t size,
5616 PyObject *mapping,
5617 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005618{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005619 /* output object */
5620 PyObject *res = NULL;
5621 /* pointers to the beginning and end+1 of input */
5622 const Py_UNICODE *startp = p;
5623 const Py_UNICODE *endp = p + size;
5624 /* pointer into the output */
5625 Py_UNICODE *str;
5626 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005627 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005628 char *reason = "character maps to <undefined>";
5629 PyObject *errorHandler = NULL;
5630 PyObject *exc = NULL;
5631 /* the following variable is used for caching string comparisons
5632 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5633 * 3=ignore, 4=xmlcharrefreplace */
5634 int known_errorHandler = -1;
5635
Guido van Rossumd57fd912000-03-10 22:53:23 +00005636 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005637 PyErr_BadArgument();
5638 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005639 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005640
5641 /* allocate enough for a simple 1:1 translation without
5642 replacements, if we need more, we'll resize */
5643 res = PyUnicode_FromUnicode(NULL, size);
5644 if (res == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005645 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005646 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005647 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005648 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005649
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005650 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005651 /* try to encode it */
5652 PyObject *x = NULL;
5653 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
5654 Py_XDECREF(x);
5655 goto onError;
5656 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005657 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00005658 if (x!=Py_None) /* it worked => adjust input pointer */
5659 ++p;
5660 else { /* untranslatable character */
5661 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
5662 Py_ssize_t repsize;
5663 Py_ssize_t newpos;
5664 Py_UNICODE *uni2;
5665 /* startpos for collecting untranslatable chars */
5666 const Py_UNICODE *collstart = p;
5667 const Py_UNICODE *collend = p+1;
5668 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005669
Benjamin Peterson29060642009-01-31 22:14:21 +00005670 /* find all untranslatable characters */
5671 while (collend < endp) {
5672 if (charmaptranslate_lookup(*collend, mapping, &x))
5673 goto onError;
5674 Py_XDECREF(x);
5675 if (x!=Py_None)
5676 break;
5677 ++collend;
5678 }
5679 /* cache callback name lookup
5680 * (if not done yet, i.e. it's the first error) */
5681 if (known_errorHandler==-1) {
5682 if ((errors==NULL) || (!strcmp(errors, "strict")))
5683 known_errorHandler = 1;
5684 else if (!strcmp(errors, "replace"))
5685 known_errorHandler = 2;
5686 else if (!strcmp(errors, "ignore"))
5687 known_errorHandler = 3;
5688 else if (!strcmp(errors, "xmlcharrefreplace"))
5689 known_errorHandler = 4;
5690 else
5691 known_errorHandler = 0;
5692 }
5693 switch (known_errorHandler) {
5694 case 1: /* strict */
5695 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005696 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005697 case 2: /* replace */
5698 /* No need to check for space, this is a 1:1 replacement */
5699 for (coll = collstart; coll<collend; ++coll)
5700 *str++ = '?';
5701 /* fall through */
5702 case 3: /* ignore */
5703 p = collend;
5704 break;
5705 case 4: /* xmlcharrefreplace */
5706 /* generate replacement (temporarily (mis)uses p) */
5707 for (p = collstart; p < collend; ++p) {
5708 char buffer[2+29+1+1];
5709 char *cp;
5710 sprintf(buffer, "&#%d;", (int)*p);
5711 if (charmaptranslate_makespace(&res, &str,
5712 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5713 goto onError;
5714 for (cp = buffer; *cp; ++cp)
5715 *str++ = *cp;
5716 }
5717 p = collend;
5718 break;
5719 default:
5720 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5721 reason, startp, size, &exc,
5722 collstart-startp, collend-startp, &newpos);
5723 if (repunicode == NULL)
5724 goto onError;
5725 /* generate replacement */
5726 repsize = PyUnicode_GET_SIZE(repunicode);
5727 if (charmaptranslate_makespace(&res, &str,
5728 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5729 Py_DECREF(repunicode);
5730 goto onError;
5731 }
5732 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5733 *str++ = *uni2;
5734 p = startp + newpos;
5735 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005736 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005737 }
5738 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005739 /* Resize if we allocated to much */
5740 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00005741 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005742 if (PyUnicode_Resize(&res, respos) < 0)
5743 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005744 }
5745 Py_XDECREF(exc);
5746 Py_XDECREF(errorHandler);
5747 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005748
Benjamin Peterson29060642009-01-31 22:14:21 +00005749 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005750 Py_XDECREF(res);
5751 Py_XDECREF(exc);
5752 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005753 return NULL;
5754}
5755
5756PyObject *PyUnicode_Translate(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00005757 PyObject *mapping,
5758 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005759{
5760 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005761
Guido van Rossumd57fd912000-03-10 22:53:23 +00005762 str = PyUnicode_FromObject(str);
5763 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005764 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005765 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Peterson29060642009-01-31 22:14:21 +00005766 PyUnicode_GET_SIZE(str),
5767 mapping,
5768 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005769 Py_DECREF(str);
5770 return result;
Tim Petersced69f82003-09-16 20:30:58 +00005771
Benjamin Peterson29060642009-01-31 22:14:21 +00005772 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005773 Py_XDECREF(str);
5774 return NULL;
5775}
Tim Petersced69f82003-09-16 20:30:58 +00005776
Guido van Rossum9e896b32000-04-05 20:11:21 +00005777/* --- Decimal Encoder ---------------------------------------------------- */
5778
5779int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005780 Py_ssize_t length,
5781 char *output,
5782 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00005783{
5784 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005785 PyObject *errorHandler = NULL;
5786 PyObject *exc = NULL;
5787 const char *encoding = "decimal";
5788 const char *reason = "invalid decimal Unicode string";
5789 /* the following variable is used for caching string comparisons
5790 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5791 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005792
5793 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005794 PyErr_BadArgument();
5795 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005796 }
5797
5798 p = s;
5799 end = s + length;
5800 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005801 register Py_UNICODE ch = *p;
5802 int decimal;
5803 PyObject *repunicode;
5804 Py_ssize_t repsize;
5805 Py_ssize_t newpos;
5806 Py_UNICODE *uni2;
5807 Py_UNICODE *collstart;
5808 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005809
Benjamin Peterson29060642009-01-31 22:14:21 +00005810 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005811 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00005812 ++p;
5813 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005814 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005815 decimal = Py_UNICODE_TODECIMAL(ch);
5816 if (decimal >= 0) {
5817 *output++ = '0' + decimal;
5818 ++p;
5819 continue;
5820 }
5821 if (0 < ch && ch < 256) {
5822 *output++ = (char)ch;
5823 ++p;
5824 continue;
5825 }
5826 /* All other characters are considered unencodable */
5827 collstart = p;
5828 collend = p+1;
5829 while (collend < end) {
5830 if ((0 < *collend && *collend < 256) ||
5831 !Py_UNICODE_ISSPACE(*collend) ||
5832 Py_UNICODE_TODECIMAL(*collend))
5833 break;
5834 }
5835 /* cache callback name lookup
5836 * (if not done yet, i.e. it's the first error) */
5837 if (known_errorHandler==-1) {
5838 if ((errors==NULL) || (!strcmp(errors, "strict")))
5839 known_errorHandler = 1;
5840 else if (!strcmp(errors, "replace"))
5841 known_errorHandler = 2;
5842 else if (!strcmp(errors, "ignore"))
5843 known_errorHandler = 3;
5844 else if (!strcmp(errors, "xmlcharrefreplace"))
5845 known_errorHandler = 4;
5846 else
5847 known_errorHandler = 0;
5848 }
5849 switch (known_errorHandler) {
5850 case 1: /* strict */
5851 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5852 goto onError;
5853 case 2: /* replace */
5854 for (p = collstart; p < collend; ++p)
5855 *output++ = '?';
5856 /* fall through */
5857 case 3: /* ignore */
5858 p = collend;
5859 break;
5860 case 4: /* xmlcharrefreplace */
5861 /* generate replacement (temporarily (mis)uses p) */
5862 for (p = collstart; p < collend; ++p)
5863 output += sprintf(output, "&#%d;", (int)*p);
5864 p = collend;
5865 break;
5866 default:
5867 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5868 encoding, reason, s, length, &exc,
5869 collstart-s, collend-s, &newpos);
5870 if (repunicode == NULL)
5871 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005872 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00005873 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005874 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
5875 Py_DECREF(repunicode);
5876 goto onError;
5877 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005878 /* generate replacement */
5879 repsize = PyUnicode_GET_SIZE(repunicode);
5880 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5881 Py_UNICODE ch = *uni2;
5882 if (Py_UNICODE_ISSPACE(ch))
5883 *output++ = ' ';
5884 else {
5885 decimal = Py_UNICODE_TODECIMAL(ch);
5886 if (decimal >= 0)
5887 *output++ = '0' + decimal;
5888 else if (0 < ch && ch < 256)
5889 *output++ = (char)ch;
5890 else {
5891 Py_DECREF(repunicode);
5892 raise_encode_exception(&exc, encoding,
5893 s, length, collstart-s, collend-s, reason);
5894 goto onError;
5895 }
5896 }
5897 }
5898 p = s + newpos;
5899 Py_DECREF(repunicode);
5900 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00005901 }
5902 /* 0-terminate the output string */
5903 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005904 Py_XDECREF(exc);
5905 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005906 return 0;
5907
Benjamin Peterson29060642009-01-31 22:14:21 +00005908 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005909 Py_XDECREF(exc);
5910 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005911 return -1;
5912}
5913
Guido van Rossumd57fd912000-03-10 22:53:23 +00005914/* --- Helpers ------------------------------------------------------------ */
5915
Eric Smith8c663262007-08-25 02:26:07 +00005916#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005917#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00005918
Thomas Wouters477c8d52006-05-27 19:21:47 +00005919#include "stringlib/count.h"
5920#include "stringlib/find.h"
5921#include "stringlib/partition.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00005922#include "stringlib/split.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005923
Eric Smith5807c412008-05-11 21:00:57 +00005924#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
Eric Smitha3b1ac82009-04-03 14:45:06 +00005925#define _Py_InsertThousandsGroupingLocale _PyUnicode_InsertThousandsGroupingLocale
Eric Smith5807c412008-05-11 21:00:57 +00005926#include "stringlib/localeutil.h"
5927
/* Helper macro to fix up Python-style start/end slice values in place.
 *
 * `start` and `end` must be modifiable lvalues; `len` is the length of the
 * sequence being sliced.  An `end` larger than `len` is clipped to `len`;
 * a negative `end` or `start` is taken relative to `len` and then clamped
 * at 0.  `start` is NOT clipped against `len` or `end`, so callers must
 * cope with start > end themselves.
 *
 * The body is wrapped in do { ... } while (0) so that the expansion is a
 * single statement and stays correct inside an unbraced if/else.  The
 * arguments are evaluated more than once: do not pass expressions with
 * side effects.
 */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00005942
Martin v. Löwis18e16552006-02-15 17:27:45 +00005943Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005944 PyObject *substr,
5945 Py_ssize_t start,
5946 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005947{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005948 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005949 PyUnicodeObject* str_obj;
5950 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005951
Thomas Wouters477c8d52006-05-27 19:21:47 +00005952 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5953 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00005954 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005955 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5956 if (!sub_obj) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005957 Py_DECREF(str_obj);
5958 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005959 }
Tim Petersced69f82003-09-16 20:30:58 +00005960
Antoine Pitrouf2c54842010-01-13 08:07:53 +00005961 ADJUST_INDICES(start, end, str_obj->length);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005962 result = stringlib_count(
Antoine Pitrouf2c54842010-01-13 08:07:53 +00005963 str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
5964 PY_SSIZE_T_MAX
Thomas Wouters477c8d52006-05-27 19:21:47 +00005965 );
5966
5967 Py_DECREF(sub_obj);
5968 Py_DECREF(str_obj);
5969
Guido van Rossumd57fd912000-03-10 22:53:23 +00005970 return result;
5971}
5972
Martin v. Löwis18e16552006-02-15 17:27:45 +00005973Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005974 PyObject *sub,
5975 Py_ssize_t start,
5976 Py_ssize_t end,
5977 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005978{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005979 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005980
Guido van Rossumd57fd912000-03-10 22:53:23 +00005981 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005982 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00005983 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005984 sub = PyUnicode_FromObject(sub);
5985 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005986 Py_DECREF(str);
5987 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005988 }
Tim Petersced69f82003-09-16 20:30:58 +00005989
Thomas Wouters477c8d52006-05-27 19:21:47 +00005990 if (direction > 0)
5991 result = stringlib_find_slice(
5992 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5993 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5994 start, end
5995 );
5996 else
5997 result = stringlib_rfind_slice(
5998 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5999 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6000 start, end
6001 );
6002
Guido van Rossumd57fd912000-03-10 22:53:23 +00006003 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006004 Py_DECREF(sub);
6005
Guido van Rossumd57fd912000-03-10 22:53:23 +00006006 return result;
6007}
6008
Tim Petersced69f82003-09-16 20:30:58 +00006009static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006010int tailmatch(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006011 PyUnicodeObject *substring,
6012 Py_ssize_t start,
6013 Py_ssize_t end,
6014 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006015{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006016 if (substring->length == 0)
6017 return 1;
6018
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006019 ADJUST_INDICES(start, end, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006020 end -= substring->length;
6021 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00006022 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006023
6024 if (direction > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006025 if (Py_UNICODE_MATCH(self, end, substring))
6026 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006027 } else {
6028 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00006029 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006030 }
6031
6032 return 0;
6033}
6034
Martin v. Löwis18e16552006-02-15 17:27:45 +00006035Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006036 PyObject *substr,
6037 Py_ssize_t start,
6038 Py_ssize_t end,
6039 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006040{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006041 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006042
Guido van Rossumd57fd912000-03-10 22:53:23 +00006043 str = PyUnicode_FromObject(str);
6044 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006045 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006046 substr = PyUnicode_FromObject(substr);
6047 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006048 Py_DECREF(str);
6049 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006050 }
Tim Petersced69f82003-09-16 20:30:58 +00006051
Guido van Rossumd57fd912000-03-10 22:53:23 +00006052 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006053 (PyUnicodeObject *)substr,
6054 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006055 Py_DECREF(str);
6056 Py_DECREF(substr);
6057 return result;
6058}
6059
Guido van Rossumd57fd912000-03-10 22:53:23 +00006060/* Apply fixfct filter to the Unicode object self and return a
6061 reference to the modified object */
6062
Tim Petersced69f82003-09-16 20:30:58 +00006063static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006064PyObject *fixup(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006065 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006066{
6067
6068 PyUnicodeObject *u;
6069
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006070 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006071 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006072 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006073
6074 Py_UNICODE_COPY(u->str, self->str, self->length);
6075
Tim Peters7a29bd52001-09-12 03:03:31 +00006076 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006077 /* fixfct should return TRUE if it modified the buffer. If
6078 FALSE, return a reference to the original buffer instead
6079 (to save space, not time) */
6080 Py_INCREF(self);
6081 Py_DECREF(u);
6082 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006083 }
6084 return (PyObject*) u;
6085}
6086
Tim Petersced69f82003-09-16 20:30:58 +00006087static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006088int fixupper(PyUnicodeObject *self)
6089{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006090 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006091 Py_UNICODE *s = self->str;
6092 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006093
Guido van Rossumd57fd912000-03-10 22:53:23 +00006094 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006095 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006096
Benjamin Peterson29060642009-01-31 22:14:21 +00006097 ch = Py_UNICODE_TOUPPER(*s);
6098 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006099 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006100 *s = ch;
6101 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006102 s++;
6103 }
6104
6105 return status;
6106}
6107
Tim Petersced69f82003-09-16 20:30:58 +00006108static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006109int fixlower(PyUnicodeObject *self)
6110{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006111 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006112 Py_UNICODE *s = self->str;
6113 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006114
Guido van Rossumd57fd912000-03-10 22:53:23 +00006115 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006116 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006117
Benjamin Peterson29060642009-01-31 22:14:21 +00006118 ch = Py_UNICODE_TOLOWER(*s);
6119 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006120 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006121 *s = ch;
6122 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006123 s++;
6124 }
6125
6126 return status;
6127}
6128
Tim Petersced69f82003-09-16 20:30:58 +00006129static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006130int fixswapcase(PyUnicodeObject *self)
6131{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006132 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006133 Py_UNICODE *s = self->str;
6134 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006135
Guido van Rossumd57fd912000-03-10 22:53:23 +00006136 while (len-- > 0) {
6137 if (Py_UNICODE_ISUPPER(*s)) {
6138 *s = Py_UNICODE_TOLOWER(*s);
6139 status = 1;
6140 } else if (Py_UNICODE_ISLOWER(*s)) {
6141 *s = Py_UNICODE_TOUPPER(*s);
6142 status = 1;
6143 }
6144 s++;
6145 }
6146
6147 return status;
6148}
6149
Tim Petersced69f82003-09-16 20:30:58 +00006150static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006151int fixcapitalize(PyUnicodeObject *self)
6152{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006153 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006154 Py_UNICODE *s = self->str;
6155 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006156
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006157 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006158 return 0;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006159 if (Py_UNICODE_ISLOWER(*s)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006160 *s = Py_UNICODE_TOUPPER(*s);
6161 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006162 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006163 s++;
6164 while (--len > 0) {
6165 if (Py_UNICODE_ISUPPER(*s)) {
6166 *s = Py_UNICODE_TOLOWER(*s);
6167 status = 1;
6168 }
6169 s++;
6170 }
6171 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006172}
6173
6174static
6175int fixtitle(PyUnicodeObject *self)
6176{
6177 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6178 register Py_UNICODE *e;
6179 int previous_is_cased;
6180
6181 /* Shortcut for single character strings */
6182 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006183 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
6184 if (*p != ch) {
6185 *p = ch;
6186 return 1;
6187 }
6188 else
6189 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006190 }
Tim Petersced69f82003-09-16 20:30:58 +00006191
Guido van Rossumd57fd912000-03-10 22:53:23 +00006192 e = p + PyUnicode_GET_SIZE(self);
6193 previous_is_cased = 0;
6194 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006195 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006196
Benjamin Peterson29060642009-01-31 22:14:21 +00006197 if (previous_is_cased)
6198 *p = Py_UNICODE_TOLOWER(ch);
6199 else
6200 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00006201
Benjamin Peterson29060642009-01-31 22:14:21 +00006202 if (Py_UNICODE_ISLOWER(ch) ||
6203 Py_UNICODE_ISUPPER(ch) ||
6204 Py_UNICODE_ISTITLE(ch))
6205 previous_is_cased = 1;
6206 else
6207 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006208 }
6209 return 1;
6210}
6211
Tim Peters8ce9f162004-08-27 01:49:32 +00006212PyObject *
6213PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006214{
Skip Montanaro6543b452004-09-16 03:28:13 +00006215 const Py_UNICODE blank = ' ';
6216 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006217 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006218 PyUnicodeObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00006219 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
6220 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006221 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
6222 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00006223 PyObject *item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006224 Py_ssize_t sz, i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006225
Tim Peters05eba1f2004-08-27 21:32:02 +00006226 fseq = PySequence_Fast(seq, "");
6227 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006228 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00006229 }
6230
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006231 /* NOTE: the following code can't call back into Python code,
6232 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00006233 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006234
Tim Peters05eba1f2004-08-27 21:32:02 +00006235 seqlen = PySequence_Fast_GET_SIZE(fseq);
6236 /* If empty sequence, return u"". */
6237 if (seqlen == 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006238 res = _PyUnicode_New(0); /* empty sequence; return u"" */
6239 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00006240 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006241 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00006242 /* If singleton sequence with an exact Unicode, return that. */
6243 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006244 item = items[0];
6245 if (PyUnicode_CheckExact(item)) {
6246 Py_INCREF(item);
6247 res = (PyUnicodeObject *)item;
6248 goto Done;
6249 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006250 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006251 else {
6252 /* Set up sep and seplen */
6253 if (separator == NULL) {
6254 sep = &blank;
6255 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006256 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006257 else {
6258 if (!PyUnicode_Check(separator)) {
6259 PyErr_Format(PyExc_TypeError,
6260 "separator: expected str instance,"
6261 " %.80s found",
6262 Py_TYPE(separator)->tp_name);
6263 goto onError;
6264 }
6265 sep = PyUnicode_AS_UNICODE(separator);
6266 seplen = PyUnicode_GET_SIZE(separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00006267 }
6268 }
6269
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006270 /* There are at least two things to join, or else we have a subclass
6271 * of str in the sequence.
6272 * Do a pre-pass to figure out the total amount of space we'll
6273 * need (sz), and see whether all argument are strings.
6274 */
6275 sz = 0;
6276 for (i = 0; i < seqlen; i++) {
6277 const Py_ssize_t old_sz = sz;
6278 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00006279 if (!PyUnicode_Check(item)) {
6280 PyErr_Format(PyExc_TypeError,
6281 "sequence item %zd: expected str instance,"
6282 " %.80s found",
6283 i, Py_TYPE(item)->tp_name);
6284 goto onError;
6285 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006286 sz += PyUnicode_GET_SIZE(item);
6287 if (i != 0)
6288 sz += seplen;
6289 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
6290 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006291 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006292 goto onError;
6293 }
6294 }
Tim Petersced69f82003-09-16 20:30:58 +00006295
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006296 res = _PyUnicode_New(sz);
6297 if (res == NULL)
6298 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00006299
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006300 /* Catenate everything. */
6301 res_p = PyUnicode_AS_UNICODE(res);
6302 for (i = 0; i < seqlen; ++i) {
6303 Py_ssize_t itemlen;
6304 item = items[i];
6305 itemlen = PyUnicode_GET_SIZE(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00006306 /* Copy item, and maybe the separator. */
6307 if (i) {
6308 Py_UNICODE_COPY(res_p, sep, seplen);
6309 res_p += seplen;
6310 }
6311 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
6312 res_p += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00006313 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006314
Benjamin Peterson29060642009-01-31 22:14:21 +00006315 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00006316 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006317 return (PyObject *)res;
6318
Benjamin Peterson29060642009-01-31 22:14:21 +00006319 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00006320 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00006321 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006322 return NULL;
6323}
6324
Tim Petersced69f82003-09-16 20:30:58 +00006325static
6326PyUnicodeObject *pad(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006327 Py_ssize_t left,
6328 Py_ssize_t right,
6329 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006330{
6331 PyUnicodeObject *u;
6332
6333 if (left < 0)
6334 left = 0;
6335 if (right < 0)
6336 right = 0;
6337
Tim Peters7a29bd52001-09-12 03:03:31 +00006338 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006339 Py_INCREF(self);
6340 return self;
6341 }
6342
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006343 if (left > PY_SSIZE_T_MAX - self->length ||
6344 right > PY_SSIZE_T_MAX - (left + self->length)) {
6345 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
6346 return NULL;
6347 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006348 u = _PyUnicode_New(left + self->length + right);
6349 if (u) {
6350 if (left)
6351 Py_UNICODE_FILL(u->str, fill, left);
6352 Py_UNICODE_COPY(u->str + left, self->str, self->length);
6353 if (right)
6354 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
6355 }
6356
6357 return u;
6358}
6359
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006360PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006361{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006362 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006363
6364 string = PyUnicode_FromObject(string);
6365 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006366 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006367
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006368 list = stringlib_splitlines(
6369 (PyObject*) string, PyUnicode_AS_UNICODE(string),
6370 PyUnicode_GET_SIZE(string), keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006371
6372 Py_DECREF(string);
6373 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006374}
6375
Tim Petersced69f82003-09-16 20:30:58 +00006376static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006377PyObject *split(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006378 PyUnicodeObject *substring,
6379 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006380{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006381 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006382 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006383
Guido van Rossumd57fd912000-03-10 22:53:23 +00006384 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006385 return stringlib_split_whitespace(
6386 (PyObject*) self, self->str, self->length, maxcount
6387 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006388
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006389 return stringlib_split(
6390 (PyObject*) self, self->str, self->length,
6391 substring->str, substring->length,
6392 maxcount
6393 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006394}
6395
Tim Petersced69f82003-09-16 20:30:58 +00006396static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006397PyObject *rsplit(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006398 PyUnicodeObject *substring,
6399 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006400{
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006401 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006402 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006403
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006404 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006405 return stringlib_rsplit_whitespace(
6406 (PyObject*) self, self->str, self->length, maxcount
6407 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006408
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006409 return stringlib_rsplit(
6410 (PyObject*) self, self->str, self->length,
6411 substring->str, substring->length,
6412 maxcount
6413 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006414}
6415
6416static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006417PyObject *replace(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006418 PyUnicodeObject *str1,
6419 PyUnicodeObject *str2,
6420 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006421{
6422 PyUnicodeObject *u;
6423
6424 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006425 maxcount = PY_SSIZE_T_MAX;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006426 else if (maxcount == 0 || self->length == 0)
6427 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006428
Thomas Wouters477c8d52006-05-27 19:21:47 +00006429 if (str1->length == str2->length) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00006430 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006431 /* same length */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006432 if (str1->length == 0)
6433 goto nothing;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006434 if (str1->length == 1) {
6435 /* replace characters */
6436 Py_UNICODE u1, u2;
6437 if (!findchar(self->str, self->length, str1->str[0]))
6438 goto nothing;
6439 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6440 if (!u)
6441 return NULL;
6442 Py_UNICODE_COPY(u->str, self->str, self->length);
6443 u1 = str1->str[0];
6444 u2 = str2->str[0];
6445 for (i = 0; i < u->length; i++)
6446 if (u->str[i] == u1) {
6447 if (--maxcount < 0)
6448 break;
6449 u->str[i] = u2;
6450 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006451 } else {
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006452 i = stringlib_find(
6453 self->str, self->length, str1->str, str1->length, 0
Guido van Rossumd57fd912000-03-10 22:53:23 +00006454 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00006455 if (i < 0)
6456 goto nothing;
6457 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6458 if (!u)
6459 return NULL;
6460 Py_UNICODE_COPY(u->str, self->str, self->length);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006461
6462 /* change everything in-place, starting with this one */
6463 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6464 i += str1->length;
6465
6466 while ( --maxcount > 0) {
6467 i = stringlib_find(self->str+i, self->length-i,
6468 str1->str, str1->length,
6469 i);
6470 if (i == -1)
6471 break;
6472 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6473 i += str1->length;
6474 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006475 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006476 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006477
6478 Py_ssize_t n, i, j, e;
6479 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006480 Py_UNICODE *p;
6481
6482 /* replace strings */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006483 n = stringlib_count(self->str, self->length, str1->str, str1->length,
6484 maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006485 if (n == 0)
6486 goto nothing;
6487 /* new_size = self->length + n * (str2->length - str1->length)); */
6488 delta = (str2->length - str1->length);
6489 if (delta == 0) {
6490 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006491 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006492 product = n * (str2->length - str1->length);
6493 if ((product / (str2->length - str1->length)) != n) {
6494 PyErr_SetString(PyExc_OverflowError,
6495 "replace string is too long");
6496 return NULL;
6497 }
6498 new_size = self->length + product;
6499 if (new_size < 0) {
6500 PyErr_SetString(PyExc_OverflowError,
6501 "replace string is too long");
6502 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006503 }
6504 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006505 u = _PyUnicode_New(new_size);
6506 if (!u)
6507 return NULL;
6508 i = 0;
6509 p = u->str;
6510 e = self->length - str1->length;
6511 if (str1->length > 0) {
6512 while (n-- > 0) {
6513 /* look for next match */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006514 j = stringlib_find(self->str+i, self->length-i,
6515 str1->str, str1->length,
6516 i);
6517 if (j == -1)
6518 break;
6519 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006520 /* copy unchanged part [i:j] */
6521 Py_UNICODE_COPY(p, self->str+i, j-i);
6522 p += j - i;
6523 }
6524 /* copy substitution string */
6525 if (str2->length > 0) {
6526 Py_UNICODE_COPY(p, str2->str, str2->length);
6527 p += str2->length;
6528 }
6529 i = j + str1->length;
6530 }
6531 if (i < self->length)
6532 /* copy tail [i:] */
6533 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6534 } else {
6535 /* interleave */
6536 while (n > 0) {
6537 Py_UNICODE_COPY(p, str2->str, str2->length);
6538 p += str2->length;
6539 if (--n <= 0)
6540 break;
6541 *p++ = self->str[i++];
6542 }
6543 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6544 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006545 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006546 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006547
Benjamin Peterson29060642009-01-31 22:14:21 +00006548 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00006549 /* nothing to replace; return original string (when possible) */
6550 if (PyUnicode_CheckExact(self)) {
6551 Py_INCREF(self);
6552 return (PyObject *) self;
6553 }
6554 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006555}
6556
6557/* --- Unicode Object Methods --------------------------------------------- */
6558
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006559PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006560 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006561\n\
6562Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006563characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006564
6565static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006566unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006567{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006568 return fixup(self, fixtitle);
6569}
6570
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006571PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006572 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006573\n\
6574Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006575have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006576
6577static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006578unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006579{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006580 return fixup(self, fixcapitalize);
6581}
6582
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled: split self on whitespace, capitalize each piece through
   fixup()/fixcapitalize, and join the pieces with the default
   separator of PyUnicode_Join().  Returns NULL on failure. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list entries one by one */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *capped = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                                 fixcapitalize);
        if (capped == NULL)
            goto done;
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capped);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

  done:
    Py_DECREF(words);
    return result;
}
#endif
6620
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006621/* Argument converter. Coerces to a single unicode character */
6622
6623static int
6624convert_uc(PyObject *obj, void *addr)
6625{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006626 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6627 PyObject *uniobj;
6628 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006629
Benjamin Peterson14339b62009-01-31 16:36:08 +00006630 uniobj = PyUnicode_FromObject(obj);
6631 if (uniobj == NULL) {
6632 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006633 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006634 return 0;
6635 }
6636 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6637 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006638 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006639 Py_DECREF(uniobj);
6640 return 0;
6641 }
6642 unistr = PyUnicode_AS_UNICODE(uniobj);
6643 *fillcharloc = unistr[0];
6644 Py_DECREF(uniobj);
6645 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006646}
6647
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006648PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006649 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006650\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00006651Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006652done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006653
6654static PyObject *
6655unicode_center(PyUnicodeObject *self, PyObject *args)
6656{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006657 Py_ssize_t marg, left;
6658 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006659 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006660
Thomas Woutersde017742006-02-16 19:34:37 +00006661 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006662 return NULL;
6663
Tim Peters7a29bd52001-09-12 03:03:31 +00006664 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006665 Py_INCREF(self);
6666 return (PyObject*) self;
6667 }
6668
6669 marg = width - self->length;
6670 left = marg / 2 + (marg & width & 1);
6671
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006672 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006673}
6674
Marc-André Lemburge5034372000-08-08 08:04:29 +00006675#if 0
6676
6677/* This code should go into some future Unicode collation support
6678 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00006679 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006680
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006681/* speedy UTF-16 code point order comparison */
6682/* gleaned from: */
6683/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6684
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006685static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006686{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006687 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006688 0, 0, 0, 0, 0, 0, 0, 0,
6689 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006690 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006691};
6692
Guido van Rossumd57fd912000-03-10 22:53:23 +00006693static int
6694unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6695{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006696 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006697
Guido van Rossumd57fd912000-03-10 22:53:23 +00006698 Py_UNICODE *s1 = str1->str;
6699 Py_UNICODE *s2 = str2->str;
6700
6701 len1 = str1->length;
6702 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006703
Guido van Rossumd57fd912000-03-10 22:53:23 +00006704 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006705 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006706
6707 c1 = *s1++;
6708 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006709
Benjamin Peterson29060642009-01-31 22:14:21 +00006710 if (c1 > (1<<11) * 26)
6711 c1 += utf16Fixup[c1>>11];
6712 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006713 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006714 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006715
6716 if (c1 != c2)
6717 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006718
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006719 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006720 }
6721
6722 return (len1 < len2) ? -1 : (len1 != len2);
6723}
6724
Marc-André Lemburge5034372000-08-08 08:04:29 +00006725#else
6726
6727static int
6728unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6729{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006730 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006731
6732 Py_UNICODE *s1 = str1->str;
6733 Py_UNICODE *s2 = str2->str;
6734
6735 len1 = str1->length;
6736 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006737
Marc-André Lemburge5034372000-08-08 08:04:29 +00006738 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006739 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006740
Fredrik Lundh45714e92001-06-26 16:39:36 +00006741 c1 = *s1++;
6742 c2 = *s2++;
6743
6744 if (c1 != c2)
6745 return (c1 < c2) ? -1 : 1;
6746
Marc-André Lemburge5034372000-08-08 08:04:29 +00006747 len1--; len2--;
6748 }
6749
6750 return (len1 < len2) ? -1 : (len1 != len2);
6751}
6752
6753#endif
6754
Guido van Rossumd57fd912000-03-10 22:53:23 +00006755int PyUnicode_Compare(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00006756 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006757{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006758 if (PyUnicode_Check(left) && PyUnicode_Check(right))
6759 return unicode_compare((PyUnicodeObject *)left,
6760 (PyUnicodeObject *)right);
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006761 PyErr_Format(PyExc_TypeError,
6762 "Can't compare %.100s and %.100s",
6763 left->ob_type->tp_name,
6764 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006765 return -1;
6766}
6767
Martin v. Löwis5b222132007-06-10 09:51:05 +00006768int
6769PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
6770{
6771 int i;
6772 Py_UNICODE *id;
6773 assert(PyUnicode_Check(uni));
6774 id = PyUnicode_AS_UNICODE(uni);
6775 /* Compare Unicode string and source character set string */
6776 for (i = 0; id[i] && str[i]; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00006777 if (id[i] != str[i])
6778 return ((int)id[i] < (int)str[i]) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00006779 /* This check keeps Python strings that end in '\0' from comparing equal
6780 to C strings identical up to that point. */
Benjamin Petersona23831f2010-04-25 21:54:00 +00006781 if (PyUnicode_GET_SIZE(uni) != i || id[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00006782 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00006783 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00006784 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00006785 return 0;
6786}
6787
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006788
Benjamin Peterson29060642009-01-31 22:14:21 +00006789#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00006790 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006791
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006792PyObject *PyUnicode_RichCompare(PyObject *left,
6793 PyObject *right,
6794 int op)
6795{
6796 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006797
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006798 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
6799 PyObject *v;
6800 if (((PyUnicodeObject *) left)->length !=
6801 ((PyUnicodeObject *) right)->length) {
6802 if (op == Py_EQ) {
6803 Py_INCREF(Py_False);
6804 return Py_False;
6805 }
6806 if (op == Py_NE) {
6807 Py_INCREF(Py_True);
6808 return Py_True;
6809 }
6810 }
6811 if (left == right)
6812 result = 0;
6813 else
6814 result = unicode_compare((PyUnicodeObject *)left,
6815 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006816
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006817 /* Convert the return value to a Boolean */
6818 switch (op) {
6819 case Py_EQ:
6820 v = TEST_COND(result == 0);
6821 break;
6822 case Py_NE:
6823 v = TEST_COND(result != 0);
6824 break;
6825 case Py_LE:
6826 v = TEST_COND(result <= 0);
6827 break;
6828 case Py_GE:
6829 v = TEST_COND(result >= 0);
6830 break;
6831 case Py_LT:
6832 v = TEST_COND(result == -1);
6833 break;
6834 case Py_GT:
6835 v = TEST_COND(result == 1);
6836 break;
6837 default:
6838 PyErr_BadArgument();
6839 return NULL;
6840 }
6841 Py_INCREF(v);
6842 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006843 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006844
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006845 Py_INCREF(Py_NotImplemented);
6846 return Py_NotImplemented;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006847}
6848
Guido van Rossum403d68b2000-03-13 15:55:09 +00006849int PyUnicode_Contains(PyObject *container,
Benjamin Peterson29060642009-01-31 22:14:21 +00006850 PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00006851{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006852 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006853 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006854
6855 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006856 sub = PyUnicode_FromObject(element);
6857 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006858 PyErr_Format(PyExc_TypeError,
6859 "'in <string>' requires string as left operand, not %s",
6860 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006861 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006862 }
6863
Thomas Wouters477c8d52006-05-27 19:21:47 +00006864 str = PyUnicode_FromObject(container);
6865 if (!str) {
6866 Py_DECREF(sub);
6867 return -1;
6868 }
6869
6870 result = stringlib_contains_obj(str, sub);
6871
6872 Py_DECREF(str);
6873 Py_DECREF(sub);
6874
Guido van Rossum403d68b2000-03-13 15:55:09 +00006875 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006876}
6877
Guido van Rossumd57fd912000-03-10 22:53:23 +00006878/* Concat to string or Unicode object giving a new Unicode object. */
6879
6880PyObject *PyUnicode_Concat(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00006881 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006882{
6883 PyUnicodeObject *u = NULL, *v = NULL, *w;
6884
6885 /* Coerce the two arguments */
6886 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6887 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006888 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006889 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6890 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006891 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006892
6893 /* Shortcuts */
6894 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006895 Py_DECREF(v);
6896 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006897 }
6898 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006899 Py_DECREF(u);
6900 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006901 }
6902
6903 /* Concat the two Unicode strings */
6904 w = _PyUnicode_New(u->length + v->length);
6905 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006906 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006907 Py_UNICODE_COPY(w->str, u->str, u->length);
6908 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6909
6910 Py_DECREF(u);
6911 Py_DECREF(v);
6912 return (PyObject *)w;
6913
Benjamin Peterson29060642009-01-31 22:14:21 +00006914 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006915 Py_XDECREF(u);
6916 Py_XDECREF(v);
6917 return NULL;
6918}
6919
Walter Dörwald1ab83302007-05-18 17:15:44 +00006920void
6921PyUnicode_Append(PyObject **pleft, PyObject *right)
6922{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006923 PyObject *new;
6924 if (*pleft == NULL)
6925 return;
6926 if (right == NULL || !PyUnicode_Check(*pleft)) {
6927 Py_DECREF(*pleft);
6928 *pleft = NULL;
6929 return;
6930 }
6931 new = PyUnicode_Concat(*pleft, right);
6932 Py_DECREF(*pleft);
6933 *pleft = new;
Walter Dörwald1ab83302007-05-18 17:15:44 +00006934}
6935
6936void
6937PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
6938{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006939 PyUnicode_Append(pleft, right);
6940 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00006941}
6942
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006943PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006944 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006945\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00006946Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00006947string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006948interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006949
6950static PyObject *
6951unicode_count(PyUnicodeObject *self, PyObject *args)
6952{
6953 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006954 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006955 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006956 PyObject *result;
6957
Guido van Rossumb8872e62000-05-09 14:14:27 +00006958 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
Benjamin Peterson29060642009-01-31 22:14:21 +00006959 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006960 return NULL;
6961
6962 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006963 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006964 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006965 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006966
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006967 ADJUST_INDICES(start, end, self->length);
Christian Heimes217cfd12007-12-02 14:31:20 +00006968 result = PyLong_FromSsize_t(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006969 stringlib_count(self->str + start, end - start,
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006970 substring->str, substring->length,
6971 PY_SSIZE_T_MAX)
Thomas Wouters477c8d52006-05-27 19:21:47 +00006972 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006973
6974 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006975
Guido van Rossumd57fd912000-03-10 22:53:23 +00006976 return result;
6977}
6978
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006979PyDoc_STRVAR(encode__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006980 "S.encode([encoding[, errors]]) -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006981\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00006982Encode S using the codec registered for encoding. encoding defaults\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006983to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006984handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006985a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6986'xmlcharrefreplace' as well as any other name registered with\n\
6987codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006988
6989static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +00006990unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006991{
Benjamin Peterson308d6372009-09-18 21:42:35 +00006992 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00006993 char *encoding = NULL;
6994 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006995 PyObject *v;
Guido van Rossum35d94282007-08-27 18:20:11 +00006996
Benjamin Peterson308d6372009-09-18 21:42:35 +00006997 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
6998 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006999 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00007000 v = PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007001 if (v == NULL)
7002 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00007003 if (!PyBytes_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007004 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00007005 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007006 "(type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00007007 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007008 Py_DECREF(v);
7009 return NULL;
7010 }
7011 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007012
Benjamin Peterson29060642009-01-31 22:14:21 +00007013 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007014 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007015}
7016
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007017PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007018 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007019\n\
7020Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007021If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007022
7023static PyObject*
7024unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
7025{
7026 Py_UNICODE *e;
7027 Py_UNICODE *p;
7028 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007029 Py_UNICODE *qe;
7030 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007031 PyUnicodeObject *u;
7032 int tabsize = 8;
7033
7034 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007035 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007036
Thomas Wouters7e474022000-07-16 12:04:32 +00007037 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007038 i = 0; /* chars up to and including most recent \n or \r */
7039 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
7040 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007041 for (p = self->str; p < e; p++)
7042 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007043 if (tabsize > 0) {
7044 incr = tabsize - (j % tabsize); /* cannot overflow */
7045 if (j > PY_SSIZE_T_MAX - incr)
7046 goto overflow1;
7047 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007048 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007049 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007050 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007051 if (j > PY_SSIZE_T_MAX - 1)
7052 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007053 j++;
7054 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007055 if (i > PY_SSIZE_T_MAX - j)
7056 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007057 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007058 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007059 }
7060 }
7061
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007062 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +00007063 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00007064
Guido van Rossumd57fd912000-03-10 22:53:23 +00007065 /* Second pass: create output string and fill it */
7066 u = _PyUnicode_New(i + j);
7067 if (!u)
7068 return NULL;
7069
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007070 j = 0; /* same as in first pass */
7071 q = u->str; /* next output char */
7072 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007073
7074 for (p = self->str; p < e; p++)
7075 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007076 if (tabsize > 0) {
7077 i = tabsize - (j % tabsize);
7078 j += i;
7079 while (i--) {
7080 if (q >= qe)
7081 goto overflow2;
7082 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007083 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007084 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007085 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007086 else {
7087 if (q >= qe)
7088 goto overflow2;
7089 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007090 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007091 if (*p == '\n' || *p == '\r')
7092 j = 0;
7093 }
7094
7095 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007096
7097 overflow2:
7098 Py_DECREF(u);
7099 overflow1:
7100 PyErr_SetString(PyExc_OverflowError, "new string is too long");
7101 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007102}
7103
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007104PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007105 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007106\n\
7107Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007108such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007109arguments start and end are interpreted as in slice notation.\n\
7110\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007111Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007112
7113static PyObject *
7114unicode_find(PyUnicodeObject *self, PyObject *args)
7115{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007116 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007117 Py_ssize_t start;
7118 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007119 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007120
Christian Heimes9cd17752007-11-18 19:35:23 +00007121 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007122 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007123
Thomas Wouters477c8d52006-05-27 19:21:47 +00007124 result = stringlib_find_slice(
7125 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7126 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7127 start, end
7128 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007129
7130 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007131
Christian Heimes217cfd12007-12-02 14:31:20 +00007132 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007133}
7134
7135static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007136unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007137{
7138 if (index < 0 || index >= self->length) {
7139 PyErr_SetString(PyExc_IndexError, "string index out of range");
7140 return NULL;
7141 }
7142
7143 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
7144}
7145
Guido van Rossumc2504932007-09-18 19:42:40 +00007146/* Believe it or not, this produces the same value for ASCII strings
7147 as string_hash(). */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007148static long
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00007149unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007150{
Guido van Rossumc2504932007-09-18 19:42:40 +00007151 Py_ssize_t len;
7152 Py_UNICODE *p;
7153 long x;
7154
7155 if (self->hash != -1)
7156 return self->hash;
Christian Heimes90aa7642007-12-19 02:45:37 +00007157 len = Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007158 p = self->str;
7159 x = *p << 7;
7160 while (--len >= 0)
7161 x = (1000003*x) ^ *p++;
Christian Heimes90aa7642007-12-19 02:45:37 +00007162 x ^= Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007163 if (x == -1)
7164 x = -2;
7165 self->hash = x;
7166 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007167}
7168
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007169PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007170 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007171\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007172Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007173
7174static PyObject *
7175unicode_index(PyUnicodeObject *self, PyObject *args)
7176{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007177 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007178 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007179 Py_ssize_t start;
7180 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007181
Christian Heimes9cd17752007-11-18 19:35:23 +00007182 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007183 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007184
Thomas Wouters477c8d52006-05-27 19:21:47 +00007185 result = stringlib_find_slice(
7186 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7187 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7188 start, end
7189 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007190
7191 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007192
Guido van Rossumd57fd912000-03-10 22:53:23 +00007193 if (result < 0) {
7194 PyErr_SetString(PyExc_ValueError, "substring not found");
7195 return NULL;
7196 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007197
Christian Heimes217cfd12007-12-02 14:31:20 +00007198 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007199}
7200
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007201PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007202 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007203\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007204Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007205at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007206
7207static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007208unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007209{
7210 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7211 register const Py_UNICODE *e;
7212 int cased;
7213
Guido van Rossumd57fd912000-03-10 22:53:23 +00007214 /* Shortcut for single character strings */
7215 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007216 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007217
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007218 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007219 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007220 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007221
Guido van Rossumd57fd912000-03-10 22:53:23 +00007222 e = p + PyUnicode_GET_SIZE(self);
7223 cased = 0;
7224 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007225 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007226
Benjamin Peterson29060642009-01-31 22:14:21 +00007227 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
7228 return PyBool_FromLong(0);
7229 else if (!cased && Py_UNICODE_ISLOWER(ch))
7230 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007231 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007232 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007233}
7234
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007235PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007236 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007237\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007238Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007239at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007240
7241static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007242unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007243{
7244 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7245 register const Py_UNICODE *e;
7246 int cased;
7247
Guido van Rossumd57fd912000-03-10 22:53:23 +00007248 /* Shortcut for single character strings */
7249 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007250 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007251
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007252 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007253 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007254 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007255
Guido van Rossumd57fd912000-03-10 22:53:23 +00007256 e = p + PyUnicode_GET_SIZE(self);
7257 cased = 0;
7258 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007259 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007260
Benjamin Peterson29060642009-01-31 22:14:21 +00007261 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
7262 return PyBool_FromLong(0);
7263 else if (!cased && Py_UNICODE_ISUPPER(ch))
7264 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007265 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007266 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007267}
7268
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007269PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007270 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007271\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007272Return True if S is a titlecased string and there is at least one\n\
7273character in S, i.e. upper- and titlecase characters may only\n\
7274follow uncased characters and lowercase characters only cased ones.\n\
7275Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007276
7277static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007278unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007279{
7280 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7281 register const Py_UNICODE *e;
7282 int cased, previous_is_cased;
7283
Guido van Rossumd57fd912000-03-10 22:53:23 +00007284 /* Shortcut for single character strings */
7285 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007286 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
7287 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007288
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007289 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007290 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007291 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007292
Guido van Rossumd57fd912000-03-10 22:53:23 +00007293 e = p + PyUnicode_GET_SIZE(self);
7294 cased = 0;
7295 previous_is_cased = 0;
7296 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007297 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007298
Benjamin Peterson29060642009-01-31 22:14:21 +00007299 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
7300 if (previous_is_cased)
7301 return PyBool_FromLong(0);
7302 previous_is_cased = 1;
7303 cased = 1;
7304 }
7305 else if (Py_UNICODE_ISLOWER(ch)) {
7306 if (!previous_is_cased)
7307 return PyBool_FromLong(0);
7308 previous_is_cased = 1;
7309 cased = 1;
7310 }
7311 else
7312 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007313 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007314 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007315}
7316
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007317PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007318 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007319\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007320Return True if all characters in S are whitespace\n\
7321and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007322
7323static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007324unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007325{
7326 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7327 register const Py_UNICODE *e;
7328
Guido van Rossumd57fd912000-03-10 22:53:23 +00007329 /* Shortcut for single character strings */
7330 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007331 Py_UNICODE_ISSPACE(*p))
7332 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007333
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007334 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007335 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007336 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007337
Guido van Rossumd57fd912000-03-10 22:53:23 +00007338 e = p + PyUnicode_GET_SIZE(self);
7339 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007340 if (!Py_UNICODE_ISSPACE(*p))
7341 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007342 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007343 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007344}
7345
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007346PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007347 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007348\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007349Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007350and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007351
7352static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007353unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007354{
7355 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7356 register const Py_UNICODE *e;
7357
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007358 /* Shortcut for single character strings */
7359 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007360 Py_UNICODE_ISALPHA(*p))
7361 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007362
7363 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007364 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007365 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007366
7367 e = p + PyUnicode_GET_SIZE(self);
7368 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007369 if (!Py_UNICODE_ISALPHA(*p))
7370 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007371 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007372 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007373}
7374
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007375PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007376 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007377\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007378Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007379and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007380
7381static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007382unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007383{
7384 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7385 register const Py_UNICODE *e;
7386
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007387 /* Shortcut for single character strings */
7388 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007389 Py_UNICODE_ISALNUM(*p))
7390 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007391
7392 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007393 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007394 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007395
7396 e = p + PyUnicode_GET_SIZE(self);
7397 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007398 if (!Py_UNICODE_ISALNUM(*p))
7399 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007400 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007401 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007402}
7403
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007404PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007405 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007406\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007407Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007408False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007409
7410static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007411unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007412{
7413 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7414 register const Py_UNICODE *e;
7415
Guido van Rossumd57fd912000-03-10 22:53:23 +00007416 /* Shortcut for single character strings */
7417 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007418 Py_UNICODE_ISDECIMAL(*p))
7419 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007420
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007421 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007422 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007423 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007424
Guido van Rossumd57fd912000-03-10 22:53:23 +00007425 e = p + PyUnicode_GET_SIZE(self);
7426 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007427 if (!Py_UNICODE_ISDECIMAL(*p))
7428 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007429 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007430 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007431}
7432
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007433PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007434 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007435\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007436Return True if all characters in S are digits\n\
7437and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007438
7439static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007440unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007441{
7442 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7443 register const Py_UNICODE *e;
7444
Guido van Rossumd57fd912000-03-10 22:53:23 +00007445 /* Shortcut for single character strings */
7446 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007447 Py_UNICODE_ISDIGIT(*p))
7448 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007449
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007450 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007451 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007452 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007453
Guido van Rossumd57fd912000-03-10 22:53:23 +00007454 e = p + PyUnicode_GET_SIZE(self);
7455 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007456 if (!Py_UNICODE_ISDIGIT(*p))
7457 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007458 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007459 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007460}
7461
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007462PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007463 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007464\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007465Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007466False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007467
7468static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007469unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007470{
7471 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7472 register const Py_UNICODE *e;
7473
Guido van Rossumd57fd912000-03-10 22:53:23 +00007474 /* Shortcut for single character strings */
7475 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007476 Py_UNICODE_ISNUMERIC(*p))
7477 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007478
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007479 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007480 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007481 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007482
Guido van Rossumd57fd912000-03-10 22:53:23 +00007483 e = p + PyUnicode_GET_SIZE(self);
7484 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007485 if (!Py_UNICODE_ISNUMERIC(*p))
7486 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007487 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007488 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007489}
7490
Martin v. Löwis47383402007-08-15 07:32:56 +00007491int
7492PyUnicode_IsIdentifier(PyObject *self)
7493{
7494 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
7495 register const Py_UNICODE *e;
7496
7497 /* Special case for empty strings */
7498 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007499 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007500
7501 /* PEP 3131 says that the first character must be in
7502 XID_Start and subsequent characters in XID_Continue,
7503 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +00007504 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +00007505 letters, digits, underscore). However, given the current
7506 definition of XID_Start and XID_Continue, it is sufficient
7507 to check just for these, except that _ must be allowed
7508 as starting an identifier. */
7509 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
7510 return 0;
7511
7512 e = p + PyUnicode_GET_SIZE(self);
7513 for (p++; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007514 if (!_PyUnicode_IsXidContinue(*p))
7515 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007516 }
7517 return 1;
7518}
7519
7520PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007521 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +00007522\n\
7523Return True if S is a valid identifier according\n\
7524to the language definition.");
7525
7526static PyObject*
7527unicode_isidentifier(PyObject *self)
7528{
7529 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
7530}
7531
Georg Brandl559e5d72008-06-11 18:37:52 +00007532PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007533 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +00007534\n\
7535Return True if all characters in S are considered\n\
7536printable in repr() or S is empty, False otherwise.");
7537
7538static PyObject*
7539unicode_isprintable(PyObject *self)
7540{
7541 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7542 register const Py_UNICODE *e;
7543
7544 /* Shortcut for single character strings */
7545 if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) {
7546 Py_RETURN_TRUE;
7547 }
7548
7549 e = p + PyUnicode_GET_SIZE(self);
7550 for (; p < e; p++) {
7551 if (!Py_UNICODE_ISPRINTABLE(*p)) {
7552 Py_RETURN_FALSE;
7553 }
7554 }
7555 Py_RETURN_TRUE;
7556}
7557
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007558PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +00007559 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007560\n\
7561Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +00007562iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007563
7564static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007565unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007566{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007567 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007568}
7569
Martin v. Löwis18e16552006-02-15 17:27:45 +00007570static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007571unicode_length(PyUnicodeObject *self)
7572{
7573 return self->length;
7574}
7575
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007576PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007577 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007578\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00007579Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007580done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007581
7582static PyObject *
7583unicode_ljust(PyUnicodeObject *self, PyObject *args)
7584{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007585 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007586 Py_UNICODE fillchar = ' ';
7587
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007588 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007589 return NULL;
7590
Tim Peters7a29bd52001-09-12 03:03:31 +00007591 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007592 Py_INCREF(self);
7593 return (PyObject*) self;
7594 }
7595
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007596 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007597}
7598
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007599PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007600 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007601\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007602Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007603
7604static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007605unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007606{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007607 return fixup(self, fixlower);
7608}
7609
/* Strip modes; they double as indexes into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format strings, indexed by strip mode.  Both the
   strings and the table itself are read-only. */
static const char *const stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for strip mode i: the format string with its leading
   "|O:" (3 characters) skipped. */
#define STRIPNAME(i) (stripformat[(i)]+3)
7618
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007619/* externally visible for str.strip(unicode) */
7620PyObject *
7621_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7622{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007623 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7624 Py_ssize_t len = PyUnicode_GET_SIZE(self);
7625 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
7626 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7627 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007628
Benjamin Peterson29060642009-01-31 22:14:21 +00007629 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007630
Benjamin Peterson14339b62009-01-31 16:36:08 +00007631 i = 0;
7632 if (striptype != RIGHTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007633 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7634 i++;
7635 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007636 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007637
Benjamin Peterson14339b62009-01-31 16:36:08 +00007638 j = len;
7639 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007640 do {
7641 j--;
7642 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7643 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007644 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007645
Benjamin Peterson14339b62009-01-31 16:36:08 +00007646 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007647 Py_INCREF(self);
7648 return (PyObject*)self;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007649 }
7650 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007651 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007652}
7653
Guido van Rossumd57fd912000-03-10 22:53:23 +00007654
7655static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007656do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007657{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007658 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7659 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007660
Benjamin Peterson14339b62009-01-31 16:36:08 +00007661 i = 0;
7662 if (striptype != RIGHTSTRIP) {
7663 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7664 i++;
7665 }
7666 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007667
Benjamin Peterson14339b62009-01-31 16:36:08 +00007668 j = len;
7669 if (striptype != LEFTSTRIP) {
7670 do {
7671 j--;
7672 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7673 j++;
7674 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007675
Benjamin Peterson14339b62009-01-31 16:36:08 +00007676 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7677 Py_INCREF(self);
7678 return (PyObject*)self;
7679 }
7680 else
7681 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007682}
7683
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007684
7685static PyObject *
7686do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7687{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007688 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007689
Benjamin Peterson14339b62009-01-31 16:36:08 +00007690 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7691 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007692
Benjamin Peterson14339b62009-01-31 16:36:08 +00007693 if (sep != NULL && sep != Py_None) {
7694 if (PyUnicode_Check(sep))
7695 return _PyUnicode_XStrip(self, striptype, sep);
7696 else {
7697 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00007698 "%s arg must be None or str",
7699 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +00007700 return NULL;
7701 }
7702 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007703
Benjamin Peterson14339b62009-01-31 16:36:08 +00007704 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007705}
7706
7707
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007708PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007709 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007710\n\
7711Return a copy of the string S with leading and trailing\n\
7712whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007713If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007714
7715static PyObject *
7716unicode_strip(PyUnicodeObject *self, PyObject *args)
7717{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007718 if (PyTuple_GET_SIZE(args) == 0)
7719 return do_strip(self, BOTHSTRIP); /* Common case */
7720 else
7721 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007722}
7723
7724
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007725PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007726 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007727\n\
7728Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007729If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007730
7731static PyObject *
7732unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7733{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007734 if (PyTuple_GET_SIZE(args) == 0)
7735 return do_strip(self, LEFTSTRIP); /* Common case */
7736 else
7737 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007738}
7739
7740
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007741PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007742 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007743\n\
7744Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007745If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007746
7747static PyObject *
7748unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7749{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007750 if (PyTuple_GET_SIZE(args) == 0)
7751 return do_strip(self, RIGHTSTRIP); /* Common case */
7752 else
7753 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007754}
7755
7756
Guido van Rossumd57fd912000-03-10 22:53:23 +00007757static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007758unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007759{
7760 PyUnicodeObject *u;
7761 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007762 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007763 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007764
Georg Brandl222de0f2009-04-12 12:01:50 +00007765 if (len < 1) {
7766 Py_INCREF(unicode_empty);
7767 return (PyObject *)unicode_empty;
7768 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007769
Tim Peters7a29bd52001-09-12 03:03:31 +00007770 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007771 /* no repeat, return original string */
7772 Py_INCREF(str);
7773 return (PyObject*) str;
7774 }
Tim Peters8f422462000-09-09 06:13:41 +00007775
7776 /* ensure # of chars needed doesn't overflow int and # of bytes
7777 * needed doesn't overflow size_t
7778 */
7779 nchars = len * str->length;
Georg Brandl222de0f2009-04-12 12:01:50 +00007780 if (nchars / len != str->length) {
Tim Peters8f422462000-09-09 06:13:41 +00007781 PyErr_SetString(PyExc_OverflowError,
7782 "repeated string is too long");
7783 return NULL;
7784 }
7785 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7786 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7787 PyErr_SetString(PyExc_OverflowError,
7788 "repeated string is too long");
7789 return NULL;
7790 }
7791 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007792 if (!u)
7793 return NULL;
7794
7795 p = u->str;
7796
Georg Brandl222de0f2009-04-12 12:01:50 +00007797 if (str->length == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007798 Py_UNICODE_FILL(p, str->str[0], len);
7799 } else {
Georg Brandl222de0f2009-04-12 12:01:50 +00007800 Py_ssize_t done = str->length; /* number of characters copied this far */
7801 Py_UNICODE_COPY(p, str->str, str->length);
Benjamin Peterson29060642009-01-31 22:14:21 +00007802 while (done < nchars) {
Christian Heimescc47b052008-03-25 14:56:36 +00007803 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007804 Py_UNICODE_COPY(p+done, p, n);
7805 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00007806 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007807 }
7808
7809 return (PyObject*) u;
7810}
7811
7812PyObject *PyUnicode_Replace(PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007813 PyObject *subobj,
7814 PyObject *replobj,
7815 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007816{
7817 PyObject *self;
7818 PyObject *str1;
7819 PyObject *str2;
7820 PyObject *result;
7821
7822 self = PyUnicode_FromObject(obj);
7823 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007824 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007825 str1 = PyUnicode_FromObject(subobj);
7826 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007827 Py_DECREF(self);
7828 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007829 }
7830 str2 = PyUnicode_FromObject(replobj);
7831 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007832 Py_DECREF(self);
7833 Py_DECREF(str1);
7834 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007835 }
Tim Petersced69f82003-09-16 20:30:58 +00007836 result = replace((PyUnicodeObject *)self,
Benjamin Peterson29060642009-01-31 22:14:21 +00007837 (PyUnicodeObject *)str1,
7838 (PyUnicodeObject *)str2,
7839 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007840 Py_DECREF(self);
7841 Py_DECREF(str1);
7842 Py_DECREF(str2);
7843 return result;
7844}
7845
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007846PyDoc_STRVAR(replace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007847 "S.replace (old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007848\n\
7849Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +00007850old replaced by new. If the optional argument count is\n\
7851given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007852
7853static PyObject*
7854unicode_replace(PyUnicodeObject *self, PyObject *args)
7855{
7856 PyUnicodeObject *str1;
7857 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007858 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007859 PyObject *result;
7860
Martin v. Löwis18e16552006-02-15 17:27:45 +00007861 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007862 return NULL;
7863 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7864 if (str1 == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007865 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007866 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007867 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007868 Py_DECREF(str1);
7869 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007870 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007871
7872 result = replace(self, str1, str2, maxcount);
7873
7874 Py_DECREF(str1);
7875 Py_DECREF(str2);
7876 return result;
7877}
7878
7879static
7880PyObject *unicode_repr(PyObject *unicode)
7881{
Walter Dörwald79e913e2007-05-12 11:08:06 +00007882 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007883 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007884 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
7885 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
7886
7887 /* XXX(nnorwitz): rather than over-allocating, it would be
7888 better to choose a different scheme. Perhaps scan the
7889 first N-chars of the string and allocate based on that size.
7890 */
7891 /* Initial allocation is based on the longest-possible unichr
7892 escape.
7893
7894 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
7895 unichr, so in this case it's the longest unichr escape. In
7896 narrow (UTF-16) builds this is five chars per source unichr
7897 since there are two unichrs in the surrogate pair, so in narrow
7898 (UTF-16) builds it's not the longest unichr escape.
7899
7900 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
7901 so in the narrow (UTF-16) build case it's the longest unichr
7902 escape.
7903 */
7904
Walter Dörwald1ab83302007-05-18 17:15:44 +00007905 repr = PyUnicode_FromUnicode(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00007906 2 /* quotes */
Walter Dörwald79e913e2007-05-12 11:08:06 +00007907#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00007908 + 10*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00007909#else
Benjamin Peterson29060642009-01-31 22:14:21 +00007910 + 6*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00007911#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00007912 + 1);
Walter Dörwald79e913e2007-05-12 11:08:06 +00007913 if (repr == NULL)
7914 return NULL;
7915
Walter Dörwald1ab83302007-05-18 17:15:44 +00007916 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00007917
7918 /* Add quote */
7919 *p++ = (findchar(s, size, '\'') &&
7920 !findchar(s, size, '"')) ? '"' : '\'';
7921 while (size-- > 0) {
7922 Py_UNICODE ch = *s++;
7923
7924 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007925 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007926 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007927 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007928 continue;
7929 }
7930
Benjamin Peterson29060642009-01-31 22:14:21 +00007931 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +00007932 if (ch == '\t') {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007933 *p++ = '\\';
7934 *p++ = 't';
7935 }
7936 else if (ch == '\n') {
7937 *p++ = '\\';
7938 *p++ = 'n';
7939 }
7940 else if (ch == '\r') {
7941 *p++ = '\\';
7942 *p++ = 'r';
7943 }
7944
7945 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +00007946 else if (ch < ' ' || ch == 0x7F) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007947 *p++ = '\\';
7948 *p++ = 'x';
7949 *p++ = hexdigits[(ch >> 4) & 0x000F];
7950 *p++ = hexdigits[ch & 0x000F];
7951 }
7952
Georg Brandl559e5d72008-06-11 18:37:52 +00007953 /* Copy ASCII characters as-is */
7954 else if (ch < 0x7F) {
7955 *p++ = ch;
7956 }
7957
Benjamin Peterson29060642009-01-31 22:14:21 +00007958 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +00007959 else {
7960 Py_UCS4 ucs = ch;
7961
7962#ifndef Py_UNICODE_WIDE
7963 Py_UNICODE ch2 = 0;
7964 /* Get code point from surrogate pair */
7965 if (size > 0) {
7966 ch2 = *s;
7967 if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
Benjamin Peterson29060642009-01-31 22:14:21 +00007968 && ch2 <= 0xDFFF) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007969 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
Benjamin Peterson29060642009-01-31 22:14:21 +00007970 + 0x00010000;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007971 s++;
Georg Brandl559e5d72008-06-11 18:37:52 +00007972 size--;
7973 }
7974 }
7975#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00007976 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +00007977 (categories Z* and C* except ASCII space)
7978 */
7979 if (!Py_UNICODE_ISPRINTABLE(ucs)) {
7980 /* Map 8-bit characters to '\xhh' */
7981 if (ucs <= 0xff) {
7982 *p++ = '\\';
7983 *p++ = 'x';
7984 *p++ = hexdigits[(ch >> 4) & 0x000F];
7985 *p++ = hexdigits[ch & 0x000F];
7986 }
7987 /* Map 21-bit characters to '\U00xxxxxx' */
7988 else if (ucs >= 0x10000) {
7989 *p++ = '\\';
7990 *p++ = 'U';
7991 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
7992 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
7993 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
7994 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
7995 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
7996 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
7997 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
7998 *p++ = hexdigits[ucs & 0x0000000F];
7999 }
8000 /* Map 16-bit characters to '\uxxxx' */
8001 else {
8002 *p++ = '\\';
8003 *p++ = 'u';
8004 *p++ = hexdigits[(ucs >> 12) & 0x000F];
8005 *p++ = hexdigits[(ucs >> 8) & 0x000F];
8006 *p++ = hexdigits[(ucs >> 4) & 0x000F];
8007 *p++ = hexdigits[ucs & 0x000F];
8008 }
8009 }
8010 /* Copy characters as-is */
8011 else {
8012 *p++ = ch;
8013#ifndef Py_UNICODE_WIDE
8014 if (ucs >= 0x10000)
8015 *p++ = ch2;
8016#endif
8017 }
8018 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00008019 }
8020 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008021 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00008022
8023 *p = '\0';
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00008024 PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00008025 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008026}
8027
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008028PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008029 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008030\n\
8031Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00008032such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008033arguments start and end are interpreted as in slice notation.\n\
8034\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008035Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008036
8037static PyObject *
8038unicode_rfind(PyUnicodeObject *self, PyObject *args)
8039{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008040 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008041 Py_ssize_t start;
8042 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008043 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008044
Christian Heimes9cd17752007-11-18 19:35:23 +00008045 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008046 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008047
Thomas Wouters477c8d52006-05-27 19:21:47 +00008048 result = stringlib_rfind_slice(
8049 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8050 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8051 start, end
8052 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008053
8054 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008055
Christian Heimes217cfd12007-12-02 14:31:20 +00008056 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008057}
8058
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008059PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008060 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008061\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008062Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008063
8064static PyObject *
8065unicode_rindex(PyUnicodeObject *self, PyObject *args)
8066{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008067 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008068 Py_ssize_t start;
8069 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008070 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008071
Christian Heimes9cd17752007-11-18 19:35:23 +00008072 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008073 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008074
Thomas Wouters477c8d52006-05-27 19:21:47 +00008075 result = stringlib_rfind_slice(
8076 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8077 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8078 start, end
8079 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008080
8081 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008082
Guido van Rossumd57fd912000-03-10 22:53:23 +00008083 if (result < 0) {
8084 PyErr_SetString(PyExc_ValueError, "substring not found");
8085 return NULL;
8086 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008087 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008088}
8089
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008090PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008091 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008092\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008093Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008094done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008095
8096static PyObject *
8097unicode_rjust(PyUnicodeObject *self, PyObject *args)
8098{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008099 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008100 Py_UNICODE fillchar = ' ';
8101
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008102 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008103 return NULL;
8104
Tim Peters7a29bd52001-09-12 03:03:31 +00008105 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008106 Py_INCREF(self);
8107 return (PyObject*) self;
8108 }
8109
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008110 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008111}
8112
Guido van Rossumd57fd912000-03-10 22:53:23 +00008113PyObject *PyUnicode_Split(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008114 PyObject *sep,
8115 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008116{
8117 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008118
Guido van Rossumd57fd912000-03-10 22:53:23 +00008119 s = PyUnicode_FromObject(s);
8120 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008121 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008122 if (sep != NULL) {
8123 sep = PyUnicode_FromObject(sep);
8124 if (sep == NULL) {
8125 Py_DECREF(s);
8126 return NULL;
8127 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008128 }
8129
8130 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8131
8132 Py_DECREF(s);
8133 Py_XDECREF(sep);
8134 return result;
8135}
8136
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008137PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008138 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008139\n\
8140Return a list of the words in S, using sep as the\n\
8141delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +00008142splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +00008143whitespace string is a separator and empty strings are\n\
8144removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008145
8146static PyObject*
8147unicode_split(PyUnicodeObject *self, PyObject *args)
8148{
8149 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008150 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008151
Martin v. Löwis18e16552006-02-15 17:27:45 +00008152 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008153 return NULL;
8154
8155 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008156 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008157 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008158 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008159 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008160 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008161}
8162
Thomas Wouters477c8d52006-05-27 19:21:47 +00008163PyObject *
8164PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
8165{
8166 PyObject* str_obj;
8167 PyObject* sep_obj;
8168 PyObject* out;
8169
8170 str_obj = PyUnicode_FromObject(str_in);
8171 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008172 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008173 sep_obj = PyUnicode_FromObject(sep_in);
8174 if (!sep_obj) {
8175 Py_DECREF(str_obj);
8176 return NULL;
8177 }
8178
8179 out = stringlib_partition(
8180 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8181 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8182 );
8183
8184 Py_DECREF(sep_obj);
8185 Py_DECREF(str_obj);
8186
8187 return out;
8188}
8189
8190
8191PyObject *
8192PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
8193{
8194 PyObject* str_obj;
8195 PyObject* sep_obj;
8196 PyObject* out;
8197
8198 str_obj = PyUnicode_FromObject(str_in);
8199 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008200 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008201 sep_obj = PyUnicode_FromObject(sep_in);
8202 if (!sep_obj) {
8203 Py_DECREF(str_obj);
8204 return NULL;
8205 }
8206
8207 out = stringlib_rpartition(
8208 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8209 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8210 );
8211
8212 Py_DECREF(sep_obj);
8213 Py_DECREF(str_obj);
8214
8215 return out;
8216}
8217
8218PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008219 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008220\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008221Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008222the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008223found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008224
8225static PyObject*
8226unicode_partition(PyUnicodeObject *self, PyObject *separator)
8227{
8228 return PyUnicode_Partition((PyObject *)self, separator);
8229}
8230
8231PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +00008232 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008233\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008234Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008235the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008236separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008237
8238static PyObject*
8239unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
8240{
8241 return PyUnicode_RPartition((PyObject *)self, separator);
8242}
8243
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008244PyObject *PyUnicode_RSplit(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008245 PyObject *sep,
8246 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008247{
8248 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008249
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008250 s = PyUnicode_FromObject(s);
8251 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008252 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008253 if (sep != NULL) {
8254 sep = PyUnicode_FromObject(sep);
8255 if (sep == NULL) {
8256 Py_DECREF(s);
8257 return NULL;
8258 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008259 }
8260
8261 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8262
8263 Py_DECREF(s);
8264 Py_XDECREF(sep);
8265 return result;
8266}
8267
8268PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008269 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008270\n\
8271Return a list of the words in S, using sep as the\n\
8272delimiter string, starting at the end of the string and\n\
8273working to the front. If maxsplit is given, at most maxsplit\n\
8274splits are done. If sep is not specified, any whitespace string\n\
8275is a separator.");
8276
8277static PyObject*
8278unicode_rsplit(PyUnicodeObject *self, PyObject *args)
8279{
8280 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008281 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008282
Martin v. Löwis18e16552006-02-15 17:27:45 +00008283 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008284 return NULL;
8285
8286 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008287 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008288 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008289 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008290 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008291 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008292}
8293
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008294PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008295 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008296\n\
8297Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00008298Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008299is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008300
8301static PyObject*
8302unicode_splitlines(PyUnicodeObject *self, PyObject *args)
8303{
Guido van Rossum86662912000-04-11 15:38:46 +00008304 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008305
Guido van Rossum86662912000-04-11 15:38:46 +00008306 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008307 return NULL;
8308
Guido van Rossum86662912000-04-11 15:38:46 +00008309 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008310}
8311
8312static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00008313PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008314{
Walter Dörwald346737f2007-05-31 10:44:43 +00008315 if (PyUnicode_CheckExact(self)) {
8316 Py_INCREF(self);
8317 return self;
8318 } else
8319 /* Subtype -- return genuine unicode string with the same value. */
8320 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
8321 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008322}
8323
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008324PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008325 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008326\n\
8327Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008328and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008329
8330static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008331unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008332{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008333 return fixup(self, fixswapcase);
8334}
8335
Georg Brandlceee0772007-11-27 23:48:05 +00008336PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008337 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008338\n\
8339Return a translation table usable for str.translate().\n\
8340If there is only one argument, it must be a dictionary mapping Unicode\n\
8341ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008342Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008343If there are two arguments, they must be strings of equal length, and\n\
8344in the resulting dictionary, each character in x will be mapped to the\n\
8345character at the same position in y. If there is a third argument, it\n\
8346must be a string, whose characters will be mapped to None in the result.");
8347
8348static PyObject*
8349unicode_maketrans(PyUnicodeObject *null, PyObject *args)
8350{
8351 PyObject *x, *y = NULL, *z = NULL;
8352 PyObject *new = NULL, *key, *value;
8353 Py_ssize_t i = 0;
8354 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008355
Georg Brandlceee0772007-11-27 23:48:05 +00008356 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
8357 return NULL;
8358 new = PyDict_New();
8359 if (!new)
8360 return NULL;
8361 if (y != NULL) {
8362 /* x must be a string too, of equal length */
8363 Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
8364 if (!PyUnicode_Check(x)) {
8365 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
8366 "be a string if there is a second argument");
8367 goto err;
8368 }
8369 if (PyUnicode_GET_SIZE(x) != ylen) {
8370 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
8371 "arguments must have equal length");
8372 goto err;
8373 }
8374 /* create entries for translating chars in x to those in y */
8375 for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008376 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
8377 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008378 if (!key || !value)
8379 goto err;
8380 res = PyDict_SetItem(new, key, value);
8381 Py_DECREF(key);
8382 Py_DECREF(value);
8383 if (res < 0)
8384 goto err;
8385 }
8386 /* create entries for deleting chars in z */
8387 if (z != NULL) {
8388 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008389 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008390 if (!key)
8391 goto err;
8392 res = PyDict_SetItem(new, key, Py_None);
8393 Py_DECREF(key);
8394 if (res < 0)
8395 goto err;
8396 }
8397 }
8398 } else {
8399 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +00008400 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008401 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
8402 "to maketrans it must be a dict");
8403 goto err;
8404 }
8405 /* copy entries into the new dict, converting string keys to int keys */
8406 while (PyDict_Next(x, &i, &key, &value)) {
8407 if (PyUnicode_Check(key)) {
8408 /* convert string keys to integer keys */
8409 PyObject *newkey;
8410 if (PyUnicode_GET_SIZE(key) != 1) {
8411 PyErr_SetString(PyExc_ValueError, "string keys in translate "
8412 "table must be of length 1");
8413 goto err;
8414 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008415 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
Georg Brandlceee0772007-11-27 23:48:05 +00008416 if (!newkey)
8417 goto err;
8418 res = PyDict_SetItem(new, newkey, value);
8419 Py_DECREF(newkey);
8420 if (res < 0)
8421 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +00008422 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008423 /* just keep integer keys */
8424 if (PyDict_SetItem(new, key, value) < 0)
8425 goto err;
8426 } else {
8427 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8428 "be strings or integers");
8429 goto err;
8430 }
8431 }
8432 }
8433 return new;
8434 err:
8435 Py_DECREF(new);
8436 return NULL;
8437}
8438
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008439PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008440 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008441\n\
8442Return a copy of the string S, where all characters have been mapped\n\
8443through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008444Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00008445Unmapped characters are left untouched. Characters mapped to None\n\
8446are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008447
8448static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008449unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008450{
Georg Brandlceee0772007-11-27 23:48:05 +00008451 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008452}
8453
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008454PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008455 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008456\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008457Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008458
8459static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008460unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008461{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008462 return fixup(self, fixupper);
8463}
8464
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008465PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008466 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008467\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +00008468Pad a numeric string S with zeros on the left, to fill a field\n\
8469of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008470
8471static PyObject *
8472unicode_zfill(PyUnicodeObject *self, PyObject *args)
8473{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008474 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008475 PyUnicodeObject *u;
8476
Martin v. Löwis18e16552006-02-15 17:27:45 +00008477 Py_ssize_t width;
8478 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008479 return NULL;
8480
8481 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00008482 if (PyUnicode_CheckExact(self)) {
8483 Py_INCREF(self);
8484 return (PyObject*) self;
8485 }
8486 else
8487 return PyUnicode_FromUnicode(
8488 PyUnicode_AS_UNICODE(self),
8489 PyUnicode_GET_SIZE(self)
Benjamin Peterson29060642009-01-31 22:14:21 +00008490 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008491 }
8492
8493 fill = width - self->length;
8494
8495 u = pad(self, fill, 0, '0');
8496
Walter Dörwald068325e2002-04-15 13:36:47 +00008497 if (u == NULL)
8498 return NULL;
8499
Guido van Rossumd57fd912000-03-10 22:53:23 +00008500 if (u->str[fill] == '+' || u->str[fill] == '-') {
8501 /* move sign to beginning of string */
8502 u->str[0] = u->str[fill];
8503 u->str[fill] = '0';
8504 }
8505
8506 return (PyObject*) u;
8507}
Guido van Rossumd57fd912000-03-10 22:53:23 +00008508
#if 0
/* Disabled helper (compiled out by the surrounding #if 0): returns the
 * value of `numfree` as a Python int.
 */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    PyObject *count = PyLong_FromLong(numfree);

    return count;
}
#endif
8516
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008517PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008518 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008519\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008520Return True if S starts with the specified prefix, False otherwise.\n\
8521With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008522With optional end, stop comparing S at that position.\n\
8523prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008524
8525static PyObject *
8526unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008527 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008528{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008529 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008530 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008531 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008532 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008533 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008534
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008535 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008536 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8537 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008538 if (PyTuple_Check(subobj)) {
8539 Py_ssize_t i;
8540 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8541 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00008542 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008543 if (substring == NULL)
8544 return NULL;
8545 result = tailmatch(self, substring, start, end, -1);
8546 Py_DECREF(substring);
8547 if (result) {
8548 Py_RETURN_TRUE;
8549 }
8550 }
8551 /* nothing matched */
8552 Py_RETURN_FALSE;
8553 }
8554 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008555 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008556 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008557 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008558 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008559 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008560}
8561
8562
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008563PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008564 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008565\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008566Return True if S ends with the specified suffix, False otherwise.\n\
8567With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008568With optional end, stop comparing S at that position.\n\
8569suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008570
8571static PyObject *
8572unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008573 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008574{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008575 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008576 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008577 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008578 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008579 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008580
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008581 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008582 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8583 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008584 if (PyTuple_Check(subobj)) {
8585 Py_ssize_t i;
8586 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8587 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00008588 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008589 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008590 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008591 result = tailmatch(self, substring, start, end, +1);
8592 Py_DECREF(substring);
8593 if (result) {
8594 Py_RETURN_TRUE;
8595 }
8596 }
8597 Py_RETURN_FALSE;
8598 }
8599 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008600 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008601 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008602
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008603 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008604 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008605 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008606}
8607
Eric Smith8c663262007-08-25 02:26:07 +00008608#include "stringlib/string_format.h"
8609
8610PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008611 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008612\n\
8613");
8614
Eric Smith4a7d76d2008-05-30 18:10:19 +00008615static PyObject *
8616unicode__format__(PyObject* self, PyObject* args)
8617{
8618 PyObject *format_spec;
8619
8620 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
8621 return NULL;
8622
8623 return _PyUnicode_FormatAdvanced(self,
8624 PyUnicode_AS_UNICODE(format_spec),
8625 PyUnicode_GET_SIZE(format_spec));
8626}
8627
Eric Smith8c663262007-08-25 02:26:07 +00008628PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008629 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008630\n\
8631");
8632
8633static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008634unicode__sizeof__(PyUnicodeObject *v)
8635{
Robert Schuppeniesfbe94c52008-07-14 10:13:31 +00008636 return PyLong_FromSsize_t(sizeof(PyUnicodeObject) +
8637 sizeof(Py_UNICODE) * (v->length + 1));
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008638}
8639
8640PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008641 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008642
8643static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008644unicode_getnewargs(PyUnicodeObject *v)
8645{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008646 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008647}
8648
8649
Guido van Rossumd57fd912000-03-10 22:53:23 +00008650static PyMethodDef unicode_methods[] = {
8651
8652 /* Order is according to common usage: often used methods should
8653 appear first, since lookup is done sequentially. */
8654
Benjamin Peterson308d6372009-09-18 21:42:35 +00008655 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008656 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
8657 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008658 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008659 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
8660 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
8661 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
8662 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
8663 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
8664 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
8665 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008666 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008667 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
8668 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
8669 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008670 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008671 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
8672 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
8673 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008674 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008675 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008676 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008677 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008678 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
8679 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
8680 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
8681 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
8682 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
8683 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
8684 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
8685 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
8686 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
8687 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
8688 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
8689 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
8690 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
8691 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00008692 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +00008693 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008694 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +00008695 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +00008696 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Eric Smithf6db4092007-08-27 23:52:26 +00008697 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
8698 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Georg Brandlceee0772007-11-27 23:48:05 +00008699 {"maketrans", (PyCFunction) unicode_maketrans,
8700 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008701 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00008702#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008703 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008704#endif
8705
8706#if 0
8707 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008708 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008709#endif
8710
Benjamin Peterson14339b62009-01-31 16:36:08 +00008711 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008712 {NULL, NULL}
8713};
8714
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008715static PyObject *
8716unicode_mod(PyObject *v, PyObject *w)
8717{
Benjamin Peterson29060642009-01-31 22:14:21 +00008718 if (!PyUnicode_Check(v)) {
8719 Py_INCREF(Py_NotImplemented);
8720 return Py_NotImplemented;
8721 }
8722 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008723}
8724
8725static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008726 0, /*nb_add*/
8727 0, /*nb_subtract*/
8728 0, /*nb_multiply*/
8729 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008730};
8731
Guido van Rossumd57fd912000-03-10 22:53:23 +00008732static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008733 (lenfunc) unicode_length, /* sq_length */
8734 PyUnicode_Concat, /* sq_concat */
8735 (ssizeargfunc) unicode_repeat, /* sq_repeat */
8736 (ssizeargfunc) unicode_getitem, /* sq_item */
8737 0, /* sq_slice */
8738 0, /* sq_ass_item */
8739 0, /* sq_ass_slice */
8740 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008741};
8742
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008743static PyObject*
8744unicode_subscript(PyUnicodeObject* self, PyObject* item)
8745{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008746 if (PyIndex_Check(item)) {
8747 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008748 if (i == -1 && PyErr_Occurred())
8749 return NULL;
8750 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008751 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008752 return unicode_getitem(self, i);
8753 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008754 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008755 Py_UNICODE* source_buf;
8756 Py_UNICODE* result_buf;
8757 PyObject* result;
8758
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008759 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Benjamin Peterson29060642009-01-31 22:14:21 +00008760 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008761 return NULL;
8762 }
8763
8764 if (slicelength <= 0) {
8765 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00008766 } else if (start == 0 && step == 1 && slicelength == self->length &&
8767 PyUnicode_CheckExact(self)) {
8768 Py_INCREF(self);
8769 return (PyObject *)self;
8770 } else if (step == 1) {
8771 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008772 } else {
8773 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +00008774 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8775 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +00008776
Benjamin Peterson29060642009-01-31 22:14:21 +00008777 if (result_buf == NULL)
8778 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008779
8780 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8781 result_buf[i] = source_buf[cur];
8782 }
Tim Petersced69f82003-09-16 20:30:58 +00008783
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008784 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +00008785 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008786 return result;
8787 }
8788 } else {
8789 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8790 return NULL;
8791 }
8792}
8793
8794static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008795 (lenfunc)unicode_length, /* mp_length */
8796 (binaryfunc)unicode_subscript, /* mp_subscript */
8797 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008798};
8799
Guido van Rossumd57fd912000-03-10 22:53:23 +00008800
Guido van Rossumd57fd912000-03-10 22:53:23 +00008801/* Helpers for PyUnicode_Format() */
8802
8803static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008804getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008805{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008806 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008807 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008808 (*p_argidx)++;
8809 if (arglen < 0)
8810 return args;
8811 else
8812 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008813 }
8814 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008815 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008816 return NULL;
8817}
8818
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008819/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008820
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008821static PyObject *
8822formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008823{
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008824 char *p;
8825 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008826 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008827
Guido van Rossumd57fd912000-03-10 22:53:23 +00008828 x = PyFloat_AsDouble(v);
8829 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008830 return NULL;
8831
Guido van Rossumd57fd912000-03-10 22:53:23 +00008832 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008833 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +00008834
Eric Smith0923d1d2009-04-16 20:16:10 +00008835 p = PyOS_double_to_string(x, type, prec,
8836 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008837 if (p == NULL)
8838 return NULL;
8839 result = PyUnicode_FromStringAndSize(p, strlen(p));
Eric Smith0923d1d2009-04-16 20:16:10 +00008840 PyMem_Free(p);
8841 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008842}
8843
Tim Peters38fd5b62000-09-21 05:43:11 +00008844static PyObject*
8845formatlong(PyObject *val, int flags, int prec, int type)
8846{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008847 char *buf;
8848 int len;
8849 PyObject *str; /* temporary string object. */
8850 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008851
Benjamin Peterson14339b62009-01-31 16:36:08 +00008852 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
8853 if (!str)
8854 return NULL;
8855 result = PyUnicode_FromStringAndSize(buf, len);
8856 Py_DECREF(str);
8857 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008858}
8859
Guido van Rossumd57fd912000-03-10 22:53:23 +00008860static int
8861formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008862 size_t buflen,
8863 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008864{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008865 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008866 if (PyUnicode_Check(v)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008867 if (PyUnicode_GET_SIZE(v) == 1) {
8868 buf[0] = PyUnicode_AS_UNICODE(v)[0];
8869 buf[1] = '\0';
8870 return 1;
8871 }
8872#ifndef Py_UNICODE_WIDE
8873 if (PyUnicode_GET_SIZE(v) == 2) {
8874 /* Decode a valid surrogate pair */
8875 int c0 = PyUnicode_AS_UNICODE(v)[0];
8876 int c1 = PyUnicode_AS_UNICODE(v)[1];
8877 if (0xD800 <= c0 && c0 <= 0xDBFF &&
8878 0xDC00 <= c1 && c1 <= 0xDFFF) {
8879 buf[0] = c0;
8880 buf[1] = c1;
8881 buf[2] = '\0';
8882 return 2;
8883 }
8884 }
8885#endif
8886 goto onError;
8887 }
8888 else {
8889 /* Integer input truncated to a character */
8890 long x;
8891 x = PyLong_AsLong(v);
8892 if (x == -1 && PyErr_Occurred())
8893 goto onError;
8894
8895 if (x < 0 || x > 0x10ffff) {
8896 PyErr_SetString(PyExc_OverflowError,
8897 "%c arg not in range(0x110000)");
8898 return -1;
8899 }
8900
8901#ifndef Py_UNICODE_WIDE
8902 if (x > 0xffff) {
8903 x -= 0x10000;
8904 buf[0] = (Py_UNICODE)(0xD800 | (x >> 10));
8905 buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF));
8906 return 2;
8907 }
8908#endif
8909 buf[0] = (Py_UNICODE) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008910 buf[1] = '\0';
8911 return 1;
8912 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008913
Benjamin Peterson29060642009-01-31 22:14:21 +00008914 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008915 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008916 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008917 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008918}
8919
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008920/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008921 FORMATBUFLEN is the length of the buffer in which chars are formatted.
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008922*/
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008923#define FORMATBUFLEN (size_t)10
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008924
Guido van Rossumd57fd912000-03-10 22:53:23 +00008925PyObject *PyUnicode_Format(PyObject *format,
Benjamin Peterson29060642009-01-31 22:14:21 +00008926 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008927{
8928 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008929 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008930 int args_owned = 0;
8931 PyUnicodeObject *result = NULL;
8932 PyObject *dict = NULL;
8933 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008934
Guido van Rossumd57fd912000-03-10 22:53:23 +00008935 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008936 PyErr_BadInternalCall();
8937 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008938 }
8939 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008940 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008941 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008942 fmt = PyUnicode_AS_UNICODE(uformat);
8943 fmtcnt = PyUnicode_GET_SIZE(uformat);
8944
8945 reslen = rescnt = fmtcnt + 100;
8946 result = _PyUnicode_New(reslen);
8947 if (result == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008948 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008949 res = PyUnicode_AS_UNICODE(result);
8950
8951 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008952 arglen = PyTuple_Size(args);
8953 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008954 }
8955 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008956 arglen = -1;
8957 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008958 }
Christian Heimes90aa7642007-12-19 02:45:37 +00008959 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +00008960 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +00008961 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008962
8963 while (--fmtcnt >= 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008964 if (*fmt != '%') {
8965 if (--rescnt < 0) {
8966 rescnt = fmtcnt + 100;
8967 reslen += rescnt;
8968 if (_PyUnicode_Resize(&result, reslen) < 0)
8969 goto onError;
8970 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8971 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008972 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008973 *res++ = *fmt++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008974 }
8975 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008976 /* Got a format specifier */
8977 int flags = 0;
8978 Py_ssize_t width = -1;
8979 int prec = -1;
8980 Py_UNICODE c = '\0';
8981 Py_UNICODE fill;
8982 int isnumok;
8983 PyObject *v = NULL;
8984 PyObject *temp = NULL;
8985 Py_UNICODE *pbuf;
8986 Py_UNICODE sign;
8987 Py_ssize_t len;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008988 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008989
Benjamin Peterson29060642009-01-31 22:14:21 +00008990 fmt++;
8991 if (*fmt == '(') {
8992 Py_UNICODE *keystart;
8993 Py_ssize_t keylen;
8994 PyObject *key;
8995 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +00008996
Benjamin Peterson29060642009-01-31 22:14:21 +00008997 if (dict == NULL) {
8998 PyErr_SetString(PyExc_TypeError,
8999 "format requires a mapping");
9000 goto onError;
9001 }
9002 ++fmt;
9003 --fmtcnt;
9004 keystart = fmt;
9005 /* Skip over balanced parentheses */
9006 while (pcount > 0 && --fmtcnt >= 0) {
9007 if (*fmt == ')')
9008 --pcount;
9009 else if (*fmt == '(')
9010 ++pcount;
9011 fmt++;
9012 }
9013 keylen = fmt - keystart - 1;
9014 if (fmtcnt < 0 || pcount > 0) {
9015 PyErr_SetString(PyExc_ValueError,
9016 "incomplete format key");
9017 goto onError;
9018 }
9019#if 0
9020 /* keys are converted to strings using UTF-8 and
9021 then looked up since Python uses strings to hold
9022 variables names etc. in its namespaces and we
9023 wouldn't want to break common idioms. */
9024 key = PyUnicode_EncodeUTF8(keystart,
9025 keylen,
9026 NULL);
9027#else
9028 key = PyUnicode_FromUnicode(keystart, keylen);
9029#endif
9030 if (key == NULL)
9031 goto onError;
9032 if (args_owned) {
9033 Py_DECREF(args);
9034 args_owned = 0;
9035 }
9036 args = PyObject_GetItem(dict, key);
9037 Py_DECREF(key);
9038 if (args == NULL) {
9039 goto onError;
9040 }
9041 args_owned = 1;
9042 arglen = -1;
9043 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009044 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009045 while (--fmtcnt >= 0) {
9046 switch (c = *fmt++) {
9047 case '-': flags |= F_LJUST; continue;
9048 case '+': flags |= F_SIGN; continue;
9049 case ' ': flags |= F_BLANK; continue;
9050 case '#': flags |= F_ALT; continue;
9051 case '0': flags |= F_ZERO; continue;
9052 }
9053 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009054 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009055 if (c == '*') {
9056 v = getnextarg(args, arglen, &argidx);
9057 if (v == NULL)
9058 goto onError;
9059 if (!PyLong_Check(v)) {
9060 PyErr_SetString(PyExc_TypeError,
9061 "* wants int");
9062 goto onError;
9063 }
9064 width = PyLong_AsLong(v);
9065 if (width == -1 && PyErr_Occurred())
9066 goto onError;
9067 if (width < 0) {
9068 flags |= F_LJUST;
9069 width = -width;
9070 }
9071 if (--fmtcnt >= 0)
9072 c = *fmt++;
9073 }
9074 else if (c >= '0' && c <= '9') {
9075 width = c - '0';
9076 while (--fmtcnt >= 0) {
9077 c = *fmt++;
9078 if (c < '0' || c > '9')
9079 break;
9080 if ((width*10) / 10 != width) {
9081 PyErr_SetString(PyExc_ValueError,
9082 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009083 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00009084 }
9085 width = width*10 + (c - '0');
9086 }
9087 }
9088 if (c == '.') {
9089 prec = 0;
9090 if (--fmtcnt >= 0)
9091 c = *fmt++;
9092 if (c == '*') {
9093 v = getnextarg(args, arglen, &argidx);
9094 if (v == NULL)
9095 goto onError;
9096 if (!PyLong_Check(v)) {
9097 PyErr_SetString(PyExc_TypeError,
9098 "* wants int");
9099 goto onError;
9100 }
9101 prec = PyLong_AsLong(v);
9102 if (prec == -1 && PyErr_Occurred())
9103 goto onError;
9104 if (prec < 0)
9105 prec = 0;
9106 if (--fmtcnt >= 0)
9107 c = *fmt++;
9108 }
9109 else if (c >= '0' && c <= '9') {
9110 prec = c - '0';
9111 while (--fmtcnt >= 0) {
9112 c = Py_CHARMASK(*fmt++);
9113 if (c < '0' || c > '9')
9114 break;
9115 if ((prec*10) / 10 != prec) {
9116 PyErr_SetString(PyExc_ValueError,
9117 "prec too big");
9118 goto onError;
9119 }
9120 prec = prec*10 + (c - '0');
9121 }
9122 }
9123 } /* prec */
9124 if (fmtcnt >= 0) {
9125 if (c == 'h' || c == 'l' || c == 'L') {
9126 if (--fmtcnt >= 0)
9127 c = *fmt++;
9128 }
9129 }
9130 if (fmtcnt < 0) {
9131 PyErr_SetString(PyExc_ValueError,
9132 "incomplete format");
9133 goto onError;
9134 }
9135 if (c != '%') {
9136 v = getnextarg(args, arglen, &argidx);
9137 if (v == NULL)
9138 goto onError;
9139 }
9140 sign = 0;
9141 fill = ' ';
9142 switch (c) {
9143
9144 case '%':
9145 pbuf = formatbuf;
9146 /* presume that buffer length is at least 1 */
9147 pbuf[0] = '%';
9148 len = 1;
9149 break;
9150
9151 case 's':
9152 case 'r':
9153 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +00009154 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009155 temp = v;
9156 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009157 }
9158 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009159 if (c == 's')
9160 temp = PyObject_Str(v);
9161 else if (c == 'r')
9162 temp = PyObject_Repr(v);
9163 else
9164 temp = PyObject_ASCII(v);
9165 if (temp == NULL)
9166 goto onError;
9167 if (PyUnicode_Check(temp))
9168 /* nothing to do */;
9169 else {
9170 Py_DECREF(temp);
9171 PyErr_SetString(PyExc_TypeError,
9172 "%s argument has non-string str()");
9173 goto onError;
9174 }
9175 }
9176 pbuf = PyUnicode_AS_UNICODE(temp);
9177 len = PyUnicode_GET_SIZE(temp);
9178 if (prec >= 0 && len > prec)
9179 len = prec;
9180 break;
9181
9182 case 'i':
9183 case 'd':
9184 case 'u':
9185 case 'o':
9186 case 'x':
9187 case 'X':
9188 if (c == 'i')
9189 c = 'd';
9190 isnumok = 0;
9191 if (PyNumber_Check(v)) {
9192 PyObject *iobj=NULL;
9193
9194 if (PyLong_Check(v)) {
9195 iobj = v;
9196 Py_INCREF(iobj);
9197 }
9198 else {
9199 iobj = PyNumber_Long(v);
9200 }
9201 if (iobj!=NULL) {
9202 if (PyLong_Check(iobj)) {
9203 isnumok = 1;
9204 temp = formatlong(iobj, flags, prec, c);
9205 Py_DECREF(iobj);
9206 if (!temp)
9207 goto onError;
9208 pbuf = PyUnicode_AS_UNICODE(temp);
9209 len = PyUnicode_GET_SIZE(temp);
9210 sign = 1;
9211 }
9212 else {
9213 Py_DECREF(iobj);
9214 }
9215 }
9216 }
9217 if (!isnumok) {
9218 PyErr_Format(PyExc_TypeError,
9219 "%%%c format: a number is required, "
9220 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
9221 goto onError;
9222 }
9223 if (flags & F_ZERO)
9224 fill = '0';
9225 break;
9226
9227 case 'e':
9228 case 'E':
9229 case 'f':
9230 case 'F':
9231 case 'g':
9232 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009233 temp = formatfloat(v, flags, prec, c);
9234 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +00009235 goto onError;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009236 pbuf = PyUnicode_AS_UNICODE(temp);
9237 len = PyUnicode_GET_SIZE(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009238 sign = 1;
9239 if (flags & F_ZERO)
9240 fill = '0';
9241 break;
9242
9243 case 'c':
9244 pbuf = formatbuf;
9245 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
9246 if (len < 0)
9247 goto onError;
9248 break;
9249
9250 default:
9251 PyErr_Format(PyExc_ValueError,
9252 "unsupported format character '%c' (0x%x) "
9253 "at index %zd",
9254 (31<=c && c<=126) ? (char)c : '?',
9255 (int)c,
9256 (Py_ssize_t)(fmt - 1 -
9257 PyUnicode_AS_UNICODE(uformat)));
9258 goto onError;
9259 }
9260 if (sign) {
9261 if (*pbuf == '-' || *pbuf == '+') {
9262 sign = *pbuf++;
9263 len--;
9264 }
9265 else if (flags & F_SIGN)
9266 sign = '+';
9267 else if (flags & F_BLANK)
9268 sign = ' ';
9269 else
9270 sign = 0;
9271 }
9272 if (width < len)
9273 width = len;
9274 if (rescnt - (sign != 0) < width) {
9275 reslen -= rescnt;
9276 rescnt = width + fmtcnt + 100;
9277 reslen += rescnt;
9278 if (reslen < 0) {
9279 Py_XDECREF(temp);
9280 PyErr_NoMemory();
9281 goto onError;
9282 }
9283 if (_PyUnicode_Resize(&result, reslen) < 0) {
9284 Py_XDECREF(temp);
9285 goto onError;
9286 }
9287 res = PyUnicode_AS_UNICODE(result)
9288 + reslen - rescnt;
9289 }
9290 if (sign) {
9291 if (fill != ' ')
9292 *res++ = sign;
9293 rescnt--;
9294 if (width > len)
9295 width--;
9296 }
9297 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9298 assert(pbuf[0] == '0');
9299 assert(pbuf[1] == c);
9300 if (fill != ' ') {
9301 *res++ = *pbuf++;
9302 *res++ = *pbuf++;
9303 }
9304 rescnt -= 2;
9305 width -= 2;
9306 if (width < 0)
9307 width = 0;
9308 len -= 2;
9309 }
9310 if (width > len && !(flags & F_LJUST)) {
9311 do {
9312 --rescnt;
9313 *res++ = fill;
9314 } while (--width > len);
9315 }
9316 if (fill == ' ') {
9317 if (sign)
9318 *res++ = sign;
9319 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9320 assert(pbuf[0] == '0');
9321 assert(pbuf[1] == c);
9322 *res++ = *pbuf++;
9323 *res++ = *pbuf++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009324 }
9325 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009326 Py_UNICODE_COPY(res, pbuf, len);
9327 res += len;
9328 rescnt -= len;
9329 while (--width >= len) {
9330 --rescnt;
9331 *res++ = ' ';
9332 }
9333 if (dict && (argidx < arglen) && c != '%') {
9334 PyErr_SetString(PyExc_TypeError,
9335 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00009336 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009337 goto onError;
9338 }
9339 Py_XDECREF(temp);
9340 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009341 } /* until end */
9342 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009343 PyErr_SetString(PyExc_TypeError,
9344 "not all arguments converted during string formatting");
9345 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009346 }
9347
Thomas Woutersa96affe2006-03-12 00:29:36 +00009348 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009349 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009350 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009351 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009352 }
9353 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009354 return (PyObject *)result;
9355
Benjamin Peterson29060642009-01-31 22:14:21 +00009356 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009357 Py_XDECREF(result);
9358 Py_DECREF(uformat);
9359 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009360 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009361 }
9362 return NULL;
9363}
9364
Jeremy Hylton938ace62002-07-17 16:30:39 +00009365static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00009366unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9367
Tim Peters6d6c1a32001-08-02 04:15:00 +00009368static PyObject *
9369unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9370{
Benjamin Peterson29060642009-01-31 22:14:21 +00009371 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009372 static char *kwlist[] = {"object", "encoding", "errors", 0};
9373 char *encoding = NULL;
9374 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00009375
Benjamin Peterson14339b62009-01-31 16:36:08 +00009376 if (type != &PyUnicode_Type)
9377 return unicode_subtype_new(type, args, kwds);
9378 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +00009379 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009380 return NULL;
9381 if (x == NULL)
9382 return (PyObject *)_PyUnicode_New(0);
9383 if (encoding == NULL && errors == NULL)
9384 return PyObject_Str(x);
9385 else
Benjamin Peterson29060642009-01-31 22:14:21 +00009386 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00009387}
9388
Guido van Rossume023fe02001-08-30 03:12:59 +00009389static PyObject *
9390unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9391{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009392 PyUnicodeObject *tmp, *pnew;
9393 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00009394
Benjamin Peterson14339b62009-01-31 16:36:08 +00009395 assert(PyType_IsSubtype(type, &PyUnicode_Type));
9396 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9397 if (tmp == NULL)
9398 return NULL;
9399 assert(PyUnicode_Check(tmp));
9400 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
9401 if (pnew == NULL) {
9402 Py_DECREF(tmp);
9403 return NULL;
9404 }
9405 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
9406 if (pnew->str == NULL) {
9407 _Py_ForgetReference((PyObject *)pnew);
9408 PyObject_Del(pnew);
9409 Py_DECREF(tmp);
9410 return PyErr_NoMemory();
9411 }
9412 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9413 pnew->length = n;
9414 pnew->hash = tmp->hash;
9415 Py_DECREF(tmp);
9416 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00009417}
9418
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009419PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +00009420 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00009421\n\
Collin Winterd474ce82007-08-07 19:42:11 +00009422Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00009423encoding defaults to the current default string encoding.\n\
9424errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00009425
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009426static PyObject *unicode_iter(PyObject *seq);
9427
Guido van Rossumd57fd912000-03-10 22:53:23 +00009428PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00009429 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +00009430 "str", /* tp_name */
9431 sizeof(PyUnicodeObject), /* tp_size */
9432 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009433 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009434 (destructor)unicode_dealloc, /* tp_dealloc */
9435 0, /* tp_print */
9436 0, /* tp_getattr */
9437 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +00009438 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009439 unicode_repr, /* tp_repr */
9440 &unicode_as_number, /* tp_as_number */
9441 &unicode_as_sequence, /* tp_as_sequence */
9442 &unicode_as_mapping, /* tp_as_mapping */
9443 (hashfunc) unicode_hash, /* tp_hash*/
9444 0, /* tp_call*/
9445 (reprfunc) unicode_str, /* tp_str */
9446 PyObject_GenericGetAttr, /* tp_getattro */
9447 0, /* tp_setattro */
9448 0, /* tp_as_buffer */
9449 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +00009450 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009451 unicode_doc, /* tp_doc */
9452 0, /* tp_traverse */
9453 0, /* tp_clear */
9454 PyUnicode_RichCompare, /* tp_richcompare */
9455 0, /* tp_weaklistoffset */
9456 unicode_iter, /* tp_iter */
9457 0, /* tp_iternext */
9458 unicode_methods, /* tp_methods */
9459 0, /* tp_members */
9460 0, /* tp_getset */
9461 &PyBaseObject_Type, /* tp_base */
9462 0, /* tp_dict */
9463 0, /* tp_descr_get */
9464 0, /* tp_descr_set */
9465 0, /* tp_dictoffset */
9466 0, /* tp_init */
9467 0, /* tp_alloc */
9468 unicode_new, /* tp_new */
9469 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009470};
9471
9472/* Initialize the Unicode implementation */
9473
Thomas Wouters78890102000-07-22 19:25:51 +00009474void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009475{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009476 int i;
9477
Thomas Wouters477c8d52006-05-27 19:21:47 +00009478 /* XXX - move this array to unicodectype.c ? */
9479 Py_UNICODE linebreak[] = {
9480 0x000A, /* LINE FEED */
9481 0x000D, /* CARRIAGE RETURN */
9482 0x001C, /* FILE SEPARATOR */
9483 0x001D, /* GROUP SEPARATOR */
9484 0x001E, /* RECORD SEPARATOR */
9485 0x0085, /* NEXT LINE */
9486 0x2028, /* LINE SEPARATOR */
9487 0x2029, /* PARAGRAPH SEPARATOR */
9488 };
9489
Fred Drakee4315f52000-05-09 19:53:39 +00009490 /* Init the implementation */
Christian Heimes2202f872008-02-06 14:31:34 +00009491 free_list = NULL;
9492 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009493 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009494 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00009495 return;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009496
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009497 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00009498 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00009499 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009500 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00009501
9502 /* initialize the linebreak bloom filter */
9503 bloom_linebreak = make_bloom_mask(
9504 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9505 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009506
9507 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009508}
9509
9510/* Finalize the Unicode implementation */
9511
Christian Heimesa156e092008-02-16 07:38:31 +00009512int
9513PyUnicode_ClearFreeList(void)
9514{
9515 int freelist_size = numfree;
9516 PyUnicodeObject *u;
9517
9518 for (u = free_list; u != NULL;) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009519 PyUnicodeObject *v = u;
9520 u = *(PyUnicodeObject **)u;
9521 if (v->str)
9522 PyObject_DEL(v->str);
9523 Py_XDECREF(v->defenc);
9524 PyObject_Del(v);
9525 numfree--;
Christian Heimesa156e092008-02-16 07:38:31 +00009526 }
9527 free_list = NULL;
9528 assert(numfree == 0);
9529 return freelist_size;
9530}
9531
Guido van Rossumd57fd912000-03-10 22:53:23 +00009532void
Thomas Wouters78890102000-07-22 19:25:51 +00009533_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009534{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009535 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009536
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00009537 Py_XDECREF(unicode_empty);
9538 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009539
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009540 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009541 if (unicode_latin1[i]) {
9542 Py_DECREF(unicode_latin1[i]);
9543 unicode_latin1[i] = NULL;
9544 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009545 }
Christian Heimesa156e092008-02-16 07:38:31 +00009546 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00009547}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009548
Walter Dörwald16807132007-05-25 13:52:07 +00009549void
9550PyUnicode_InternInPlace(PyObject **p)
9551{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009552 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
9553 PyObject *t;
9554 if (s == NULL || !PyUnicode_Check(s))
9555 Py_FatalError(
9556 "PyUnicode_InternInPlace: unicode strings only please!");
9557 /* If it's a subclass, we don't really know what putting
9558 it in the interned dict might do. */
9559 if (!PyUnicode_CheckExact(s))
9560 return;
9561 if (PyUnicode_CHECK_INTERNED(s))
9562 return;
9563 if (interned == NULL) {
9564 interned = PyDict_New();
9565 if (interned == NULL) {
9566 PyErr_Clear(); /* Don't leave an exception */
9567 return;
9568 }
9569 }
9570 /* It might be that the GetItem call fails even
9571 though the key is present in the dictionary,
9572 namely when this happens during a stack overflow. */
9573 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +00009574 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009575 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +00009576
Benjamin Peterson29060642009-01-31 22:14:21 +00009577 if (t) {
9578 Py_INCREF(t);
9579 Py_DECREF(*p);
9580 *p = t;
9581 return;
9582 }
Walter Dörwald16807132007-05-25 13:52:07 +00009583
Benjamin Peterson14339b62009-01-31 16:36:08 +00009584 PyThreadState_GET()->recursion_critical = 1;
9585 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
9586 PyErr_Clear();
9587 PyThreadState_GET()->recursion_critical = 0;
9588 return;
9589 }
9590 PyThreadState_GET()->recursion_critical = 0;
9591 /* The two references in interned are not counted by refcnt.
9592 The deallocator will take care of this */
9593 Py_REFCNT(s) -= 2;
9594 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +00009595}
9596
9597void
9598PyUnicode_InternImmortal(PyObject **p)
9599{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009600 PyUnicode_InternInPlace(p);
9601 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
9602 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
9603 Py_INCREF(*p);
9604 }
Walter Dörwald16807132007-05-25 13:52:07 +00009605}
9606
9607PyObject *
9608PyUnicode_InternFromString(const char *cp)
9609{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009610 PyObject *s = PyUnicode_FromString(cp);
9611 if (s == NULL)
9612 return NULL;
9613 PyUnicode_InternInPlace(&s);
9614 return s;
Walter Dörwald16807132007-05-25 13:52:07 +00009615}
9616
9617void _Py_ReleaseInternedUnicodeStrings(void)
9618{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009619 PyObject *keys;
9620 PyUnicodeObject *s;
9621 Py_ssize_t i, n;
9622 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009623
Benjamin Peterson14339b62009-01-31 16:36:08 +00009624 if (interned == NULL || !PyDict_Check(interned))
9625 return;
9626 keys = PyDict_Keys(interned);
9627 if (keys == NULL || !PyList_Check(keys)) {
9628 PyErr_Clear();
9629 return;
9630 }
Walter Dörwald16807132007-05-25 13:52:07 +00009631
Benjamin Peterson14339b62009-01-31 16:36:08 +00009632 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
9633 detector, interned unicode strings are not forcibly deallocated;
9634 rather, we give them their stolen references back, and then clear
9635 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +00009636
Benjamin Peterson14339b62009-01-31 16:36:08 +00009637 n = PyList_GET_SIZE(keys);
9638 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +00009639 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009640 for (i = 0; i < n; i++) {
9641 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
9642 switch (s->state) {
9643 case SSTATE_NOT_INTERNED:
9644 /* XXX Shouldn't happen */
9645 break;
9646 case SSTATE_INTERNED_IMMORTAL:
9647 Py_REFCNT(s) += 1;
9648 immortal_size += s->length;
9649 break;
9650 case SSTATE_INTERNED_MORTAL:
9651 Py_REFCNT(s) += 2;
9652 mortal_size += s->length;
9653 break;
9654 default:
9655 Py_FatalError("Inconsistent interned string state.");
9656 }
9657 s->state = SSTATE_NOT_INTERNED;
9658 }
9659 fprintf(stderr, "total size of all interned strings: "
9660 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
9661 "mortal/immortal\n", mortal_size, immortal_size);
9662 Py_DECREF(keys);
9663 PyDict_Clear(interned);
9664 Py_DECREF(interned);
9665 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +00009666}
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009667
9668
9669/********************* Unicode Iterator **************************/
9670
9671typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009672 PyObject_HEAD
9673 Py_ssize_t it_index;
9674 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009675} unicodeiterobject;
9676
9677static void
9678unicodeiter_dealloc(unicodeiterobject *it)
9679{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009680 _PyObject_GC_UNTRACK(it);
9681 Py_XDECREF(it->it_seq);
9682 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009683}
9684
9685static int
9686unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
9687{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009688 Py_VISIT(it->it_seq);
9689 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009690}
9691
9692static PyObject *
9693unicodeiter_next(unicodeiterobject *it)
9694{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009695 PyUnicodeObject *seq;
9696 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009697
Benjamin Peterson14339b62009-01-31 16:36:08 +00009698 assert(it != NULL);
9699 seq = it->it_seq;
9700 if (seq == NULL)
9701 return NULL;
9702 assert(PyUnicode_Check(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009703
Benjamin Peterson14339b62009-01-31 16:36:08 +00009704 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
9705 item = PyUnicode_FromUnicode(
Benjamin Peterson29060642009-01-31 22:14:21 +00009706 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009707 if (item != NULL)
9708 ++it->it_index;
9709 return item;
9710 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009711
Benjamin Peterson14339b62009-01-31 16:36:08 +00009712 Py_DECREF(seq);
9713 it->it_seq = NULL;
9714 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009715}
9716
9717static PyObject *
9718unicodeiter_len(unicodeiterobject *it)
9719{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009720 Py_ssize_t len = 0;
9721 if (it->it_seq)
9722 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
9723 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009724}
9725
9726PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
9727
9728static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009729 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00009730 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +00009731 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009732};
9733
9734PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009735 PyVarObject_HEAD_INIT(&PyType_Type, 0)
9736 "str_iterator", /* tp_name */
9737 sizeof(unicodeiterobject), /* tp_basicsize */
9738 0, /* tp_itemsize */
9739 /* methods */
9740 (destructor)unicodeiter_dealloc, /* tp_dealloc */
9741 0, /* tp_print */
9742 0, /* tp_getattr */
9743 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +00009744 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009745 0, /* tp_repr */
9746 0, /* tp_as_number */
9747 0, /* tp_as_sequence */
9748 0, /* tp_as_mapping */
9749 0, /* tp_hash */
9750 0, /* tp_call */
9751 0, /* tp_str */
9752 PyObject_GenericGetAttr, /* tp_getattro */
9753 0, /* tp_setattro */
9754 0, /* tp_as_buffer */
9755 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
9756 0, /* tp_doc */
9757 (traverseproc)unicodeiter_traverse, /* tp_traverse */
9758 0, /* tp_clear */
9759 0, /* tp_richcompare */
9760 0, /* tp_weaklistoffset */
9761 PyObject_SelfIter, /* tp_iter */
9762 (iternextfunc)unicodeiter_next, /* tp_iternext */
9763 unicodeiter_methods, /* tp_methods */
9764 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009765};
9766
9767static PyObject *
9768unicode_iter(PyObject *seq)
9769{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009770 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009771
Benjamin Peterson14339b62009-01-31 16:36:08 +00009772 if (!PyUnicode_Check(seq)) {
9773 PyErr_BadInternalCall();
9774 return NULL;
9775 }
9776 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
9777 if (it == NULL)
9778 return NULL;
9779 it->it_index = 0;
9780 Py_INCREF(seq);
9781 it->it_seq = (PyUnicodeObject *)seq;
9782 _PyObject_GC_TRACK(it);
9783 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009784}
9785
/* Return the number of Py_UNICODE elements in u before the terminating
   zero element.  u must point to a zero-terminated array.

   The count is kept in a size_t, matching the return type, so it cannot
   overflow or be truncated the way an int counter could on very long
   strings. */
size_t
Py_UNICODE_strlen(const Py_UNICODE *u)
{
    size_t res = 0;
    while (*u++)
        res++;
    return res;
}
9794
/* Copy the zero-terminated array s2, terminator included, into s1 and
   return s1.  No bounds checking is done on the destination. */
Py_UNICODE*
Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    Py_UNICODE *dst = s1;

    for (;;) {
        Py_UNICODE ch = *s2++;
        *dst++ = ch;
        if (ch == 0)
            break;
    }
    return s1;
}
9802
/* Copy at most n elements of the zero-terminated array s2 into s1 and
   return s1.

   Copying stops right after a zero element has been copied.  If s2 has n
   or more elements before its terminator, exactly n elements are copied
   and s1 is NOT zero-terminated.  Unlike C's strncpy(), the remainder of
   s1 is not padded with zeros.

   The limit is tested before each store, so no more than n elements of s1
   are ever written (the earlier loop stored first and tested afterwards,
   writing one element past the limit). */
Py_UNICODE*
Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    size_t i;

    for (i = 0; i < n; i++) {
        Py_UNICODE ch = s2[i];
        s1[i] = ch;
        if (ch == 0)
            break;
    }
    return s1;
}
9812
/* Compare two zero-terminated Py_UNICODE arrays element by element.

   Returns 0 if they are equal, -1 if s1 sorts before s2 and 1 if it sorts
   after.  A string that is a proper prefix of the other sorts first. */
int
Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
{
    /* Skip the common prefix; *s1 == *s2 with *s1 non-zero implies *s2 is
       non-zero as well. */
    while (*s1 != 0 && *s1 == *s2) {
        s1++;
        s2++;
    }

    if (*s1 == 0)
        return (*s2 == 0) ? 0 : -1;     /* equal, or s1 is a prefix of s2 */
    if (*s2 == 0)
        return 1;                       /* s2 is a prefix of s1 */
    return (*s1 < *s2) ? -1 : +1;       /* first differing element decides */
}
9826
/* Return a pointer to the first element of the zero-terminated array s
   that equals c, or NULL if there is none.

   The terminating zero is never matched, so searching for c == 0 returns
   NULL (this differs from C's strchr()). */
Py_UNICODE*
Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
{
    while (*s) {
        if (*s == c)
            return (Py_UNICODE*)s;
        s++;
    }
    return NULL;
}
9836
9837
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009838#ifdef __cplusplus
9839}
9840#endif