blob: 307027a8119b4f85f6de62d9705d147fe414acd1 [file] [log] [blame]
/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Unicode Integration Proposal (see file Misc/unicode.txt).

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Guido van Rossumdaa251c2007-10-25 23:47:33 +000044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Guido van Rossumd57fd912000-03-10 22:53:23 +000046#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000047#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000048
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000049#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000050#include <windows.h>
51#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000052
Guido van Rossumd57fd912000-03-10 22:53:23 +000053/* Limit for the Unicode object free list */
54
Christian Heimes2202f872008-02-06 14:31:34 +000055#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000056
57/* Limit for the Unicode object free list stay alive optimization.
58
59 The implementation will keep allocated Unicode memory intact for
60 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000061 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000062
Christian Heimes2202f872008-02-06 14:31:34 +000063 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000064 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000065 malloc()-overhead) bytes of unused garbage.
66
67 Setting the limit to 0 effectively turns the feature off.
68
Guido van Rossumfd4b9572000-04-10 13:51:10 +000069 Note: This is an experimental feature ! If you get core dumps when
70 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000071
72*/
73
Guido van Rossumfd4b9572000-04-10 13:51:10 +000074#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000075
76/* Endianness switches; defaults to little endian */
77
78#ifdef WORDS_BIGENDIAN
79# define BYTEORDER_IS_BIG_ENDIAN
80#else
81# define BYTEORDER_IS_LITTLE_ENDIAN
82#endif
83
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000084/* --- Globals ------------------------------------------------------------
85
86 The globals are initialized by the _PyUnicode_Init() API and should
87 not be used before calling that API.
88
89*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
104static PyObject *interned;
105
Guido van Rossumd57fd912000-03-10 22:53:23 +0000106/* Free list for Unicode objects */
Christian Heimes2202f872008-02-06 14:31:34 +0000107static PyUnicodeObject *free_list;
108static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000109
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000110/* The empty Unicode object is shared to improve performance. */
111static PyUnicodeObject *unicode_empty;
112
113/* Single character Unicode strings in the Latin-1 range are being
114 shared as well. */
115static PyUnicodeObject *unicode_latin1[256];
116
Fred Drakee4315f52000-05-09 19:53:39 +0000117/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000118 parameter; it is fixed to "utf-8". Always use the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000119 PyUnicode_GetDefaultEncoding() API to access this global.
120
Alexandre Vassalotti3d2fd7f2007-10-16 00:26:33 +0000121 Don't forget to alter Py_FileSystemDefaultEncoding if you change the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000122 hard coded default!
123*/
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000124static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000125
/* Fast detection of the most frequent whitespace characters.

   Lookup table indexed by an ASCII code point (0x00 - 0x7F); an entry is 1
   when the code point is treated as whitespace and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x0F
       0x09 CHARACTER TABULATION, 0x0A LINE FEED, 0x0B LINE TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F
       0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x2F
       0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
156
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000157static PyObject *unicode_encode_call_errorhandler(const char *errors,
158 PyObject **errorHandler,const char *encoding, const char *reason,
159 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
160 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
161
Victor Stinner31be90b2010-04-22 19:38:16 +0000162static void raise_encode_exception(PyObject **exceptionObject,
163 const char *encoding,
164 const Py_UNICODE *unicode, Py_ssize_t size,
165 Py_ssize_t startpos, Py_ssize_t endpos,
166 const char *reason);
167
/* Same for linebreaks.

   Lookup table indexed by an ASCII code point (0x00 - 0x7F); an entry is 1
   when the code point is treated as a line break and 0 otherwise. */
static unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x0F
       0x0A LINE FEED, 0x0B LINE TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F
       0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR, 0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
195
196
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000197Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000198PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000199{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000200#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000201 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000202#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000203 /* This is actually an illegal character, so it should
204 not be passed to unichr. */
205 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000206#endif
207}
208
Thomas Wouters477c8d52006-05-27 19:21:47 +0000209/* --- Bloom Filters ----------------------------------------------------- */
210
/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the least
   significant bits of each Unicode character (ch & (BLOOM_WIDTH - 1))
   as the bit index. */

/* the linebreak mask is set up by _PyUnicode_Init below */
216
Antoine Pitrouf068f942010-01-13 14:19:12 +0000217#if LONG_BIT >= 128
218#define BLOOM_WIDTH 128
219#elif LONG_BIT >= 64
220#define BLOOM_WIDTH 64
221#elif LONG_BIT >= 32
222#define BLOOM_WIDTH 32
223#else
224#error "LONG_BIT is smaller than 32"
225#endif
226
/* Integer type that holds one bloom bitmask. */
#define BLOOM_MASK unsigned long

/* Bloom mask used by BLOOM_LINEBREAK for non-ASCII characters. */
static BLOOM_MASK bloom_linebreak;

/* BLOOM_ADD sets, and BLOOM tests, the bit selected by the low bits of ch
   (ch & (BLOOM_WIDTH - 1)).  Both the mask and the character arguments are
   parenthesized so that compound expressions such as BLOOM(a | b, ch)
   expand with the intended precedence. */
#define BLOOM_ADD(mask, ch) ((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))
#define BLOOM(mask, ch)     ((mask) &  (1UL << ((ch) & (BLOOM_WIDTH - 1))))

/* Line break test: ASCII characters are looked up directly in
   ascii_linebreak; everything else is first screened through the bloom
   mask and only then checked with Py_UNICODE_ISLINEBREAK. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
238Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
239{
240 /* calculate simple bloom-style bitmask for a given unicode string */
241
Antoine Pitrouf068f942010-01-13 14:19:12 +0000242 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000243 Py_ssize_t i;
244
245 mask = 0;
246 for (i = 0; i < len; i++)
Antoine Pitrouf2c54842010-01-13 08:07:53 +0000247 BLOOM_ADD(mask, ptr[i]);
Thomas Wouters477c8d52006-05-27 19:21:47 +0000248
249 return mask;
250}
251
252Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
253{
254 Py_ssize_t i;
255
256 for (i = 0; i < setlen; i++)
257 if (set[i] == chr)
258 return 1;
259
260 return 0;
261}
262
/* Membership test: the cheap bloom check runs first and the exact linear
   search (unicode_member) only when the bloom bit is set.  The whole
   expansion is parenthesized so the macro behaves as a single expression
   under operators such as `!` or `||`. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
265
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266/* --- Unicode Object ----------------------------------------------------- */
267
268static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000269int unicode_resize(register PyUnicodeObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +0000270 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000271{
272 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000273
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000274 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000275 if (unicode->length == length)
Benjamin Peterson29060642009-01-31 22:14:21 +0000276 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000278 /* Resizing shared object (unicode_empty or single character
279 objects) in-place is not allowed. Use PyUnicode_Resize()
280 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000281
Benjamin Peterson14339b62009-01-31 16:36:08 +0000282 if (unicode == unicode_empty ||
Benjamin Peterson29060642009-01-31 22:14:21 +0000283 (unicode->length == 1 &&
284 unicode->str[0] < 256U &&
285 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000286 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000287 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000288 return -1;
289 }
290
Thomas Wouters477c8d52006-05-27 19:21:47 +0000291 /* We allocate one more byte to make sure the string is Ux0000 terminated.
292 The overallocation is also used by fastsearch, which assumes that it's
293 safe to look at str[length] (without making any assumptions about what
294 it contains). */
295
Guido van Rossumd57fd912000-03-10 22:53:23 +0000296 oldstr = unicode->str;
Christian Heimesb186d002008-03-18 15:15:01 +0000297 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Peterson29060642009-01-31 22:14:21 +0000298 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000299 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000300 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000301 PyErr_NoMemory();
302 return -1;
303 }
304 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000305 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000306
Benjamin Peterson29060642009-01-31 22:14:21 +0000307 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000308 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000309 if (unicode->defenc) {
310 Py_DECREF(unicode->defenc);
311 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000312 }
313 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000314
Guido van Rossumd57fd912000-03-10 22:53:23 +0000315 return 0;
316}
317
/* We allocate one more Py_UNICODE element than requested to make sure
   the string is Ux0000 terminated; some code (e.g. new_identifier)
   relies on that.

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
326
327static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000328PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000329{
330 register PyUnicodeObject *unicode;
331
Thomas Wouters477c8d52006-05-27 19:21:47 +0000332 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000333 if (length == 0 && unicode_empty != NULL) {
334 Py_INCREF(unicode_empty);
335 return unicode_empty;
336 }
337
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000338 /* Ensure we won't overflow the size. */
339 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
340 return (PyUnicodeObject *)PyErr_NoMemory();
341 }
342
Guido van Rossumd57fd912000-03-10 22:53:23 +0000343 /* Unicode freelist & memory allocation */
Christian Heimes2202f872008-02-06 14:31:34 +0000344 if (free_list) {
345 unicode = free_list;
346 free_list = *(PyUnicodeObject **)unicode;
347 numfree--;
Benjamin Peterson29060642009-01-31 22:14:21 +0000348 if (unicode->str) {
349 /* Keep-Alive optimization: we only upsize the buffer,
350 never downsize it. */
351 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000352 unicode_resize(unicode, length) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000353 PyObject_DEL(unicode->str);
354 unicode->str = NULL;
355 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000356 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000357 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000358 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
359 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000360 }
361 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000362 }
363 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000364 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000365 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000366 if (unicode == NULL)
367 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +0000368 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
369 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000370 }
371
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000372 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000373 PyErr_NoMemory();
374 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000375 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000376 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000377 * the caller fails before initializing str -- unicode_resize()
378 * reads str[0], and the Keep-Alive optimization can keep memory
379 * allocated for str alive across a call to unicode_dealloc(unicode).
380 * We don't want unicode_resize to read uninitialized memory in
381 * that case.
382 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000383 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000384 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000385 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000386 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000387 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000388 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000389 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000390
Benjamin Peterson29060642009-01-31 22:14:21 +0000391 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000392 /* XXX UNREF/NEWREF interface should be more symmetrical */
393 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000394 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000395 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000396 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000397}
398
399static
Guido van Rossum9475a232001-10-05 20:51:39 +0000400void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000401{
Walter Dörwald16807132007-05-25 13:52:07 +0000402 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000403 case SSTATE_NOT_INTERNED:
404 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000405
Benjamin Peterson29060642009-01-31 22:14:21 +0000406 case SSTATE_INTERNED_MORTAL:
407 /* revive dead object temporarily for DelItem */
408 Py_REFCNT(unicode) = 3;
409 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
410 Py_FatalError(
411 "deletion of interned string failed");
412 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000413
Benjamin Peterson29060642009-01-31 22:14:21 +0000414 case SSTATE_INTERNED_IMMORTAL:
415 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000416
Benjamin Peterson29060642009-01-31 22:14:21 +0000417 default:
418 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000419 }
420
Guido van Rossum604ddf82001-12-06 20:03:56 +0000421 if (PyUnicode_CheckExact(unicode) &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000422 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000423 /* Keep-Alive optimization */
Benjamin Peterson29060642009-01-31 22:14:21 +0000424 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
425 PyObject_DEL(unicode->str);
426 unicode->str = NULL;
427 unicode->length = 0;
428 }
429 if (unicode->defenc) {
430 Py_DECREF(unicode->defenc);
431 unicode->defenc = NULL;
432 }
433 /* Add to free list */
Christian Heimes2202f872008-02-06 14:31:34 +0000434 *(PyUnicodeObject **)unicode = free_list;
435 free_list = unicode;
436 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000437 }
438 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000439 PyObject_DEL(unicode->str);
440 Py_XDECREF(unicode->defenc);
441 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000442 }
443}
444
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000445static
446int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000447{
448 register PyUnicodeObject *v;
449
450 /* Argument checks */
451 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000452 PyErr_BadInternalCall();
453 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000454 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000455 v = *unicode;
Christian Heimes90aa7642007-12-19 02:45:37 +0000456 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000457 PyErr_BadInternalCall();
458 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000459 }
460
461 /* Resizing unicode_empty and single character objects is not
462 possible since these are being shared. We simply return a fresh
463 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000464 if (v->length != length &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000465 (v == unicode_empty || v->length == 1)) {
466 PyUnicodeObject *w = _PyUnicode_New(length);
467 if (w == NULL)
468 return -1;
469 Py_UNICODE_COPY(w->str, v->str,
470 length < v->length ? length : v->length);
471 Py_DECREF(*unicode);
472 *unicode = w;
473 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000474 }
475
476 /* Note that we don't have to modify *unicode for unshared Unicode
477 objects, since we can modify them in-place. */
478 return unicode_resize(v, length);
479}
480
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000481int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
482{
483 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
484}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000485
Guido van Rossumd57fd912000-03-10 22:53:23 +0000486PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Benjamin Peterson29060642009-01-31 22:14:21 +0000487 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000488{
489 PyUnicodeObject *unicode;
490
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000491 /* If the Unicode data is known at construction time, we can apply
492 some optimizations which share commonly used objects. */
493 if (u != NULL) {
494
Benjamin Peterson29060642009-01-31 22:14:21 +0000495 /* Optimization for empty strings */
496 if (size == 0 && unicode_empty != NULL) {
497 Py_INCREF(unicode_empty);
498 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000499 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000500
501 /* Single character Unicode objects in the Latin-1 range are
502 shared when using this constructor */
503 if (size == 1 && *u < 256) {
504 unicode = unicode_latin1[*u];
505 if (!unicode) {
506 unicode = _PyUnicode_New(1);
507 if (!unicode)
508 return NULL;
509 unicode->str[0] = *u;
510 unicode_latin1[*u] = unicode;
511 }
512 Py_INCREF(unicode);
513 return (PyObject *)unicode;
514 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000515 }
Tim Petersced69f82003-09-16 20:30:58 +0000516
Guido van Rossumd57fd912000-03-10 22:53:23 +0000517 unicode = _PyUnicode_New(size);
518 if (!unicode)
519 return NULL;
520
521 /* Copy the Unicode data into the new object */
522 if (u != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +0000523 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000524
525 return (PyObject *)unicode;
526}
527
Walter Dörwaldd2034312007-05-18 16:29:38 +0000528PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000529{
530 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +0000531
Benjamin Peterson14339b62009-01-31 16:36:08 +0000532 if (size < 0) {
533 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +0000534 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +0000535 return NULL;
536 }
Christian Heimes33fe8092008-04-13 13:53:33 +0000537
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000538 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000539 some optimizations which share commonly used objects.
540 Also, this means the input must be UTF-8, so fall back to the
541 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000542 if (u != NULL) {
543
Benjamin Peterson29060642009-01-31 22:14:21 +0000544 /* Optimization for empty strings */
545 if (size == 0 && unicode_empty != NULL) {
546 Py_INCREF(unicode_empty);
547 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000548 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000549
550 /* Single characters are shared when using this constructor.
551 Restrict to ASCII, since the input must be UTF-8. */
552 if (size == 1 && Py_CHARMASK(*u) < 128) {
553 unicode = unicode_latin1[Py_CHARMASK(*u)];
554 if (!unicode) {
555 unicode = _PyUnicode_New(1);
556 if (!unicode)
557 return NULL;
558 unicode->str[0] = Py_CHARMASK(*u);
559 unicode_latin1[Py_CHARMASK(*u)] = unicode;
560 }
561 Py_INCREF(unicode);
562 return (PyObject *)unicode;
563 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000564
565 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000566 }
567
Walter Dörwald55507312007-05-18 13:12:10 +0000568 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000569 if (!unicode)
570 return NULL;
571
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000572 return (PyObject *)unicode;
573}
574
Walter Dörwaldd2034312007-05-18 16:29:38 +0000575PyObject *PyUnicode_FromString(const char *u)
576{
577 size_t size = strlen(u);
578 if (size > PY_SSIZE_T_MAX) {
579 PyErr_SetString(PyExc_OverflowError, "input too long");
580 return NULL;
581 }
582
583 return PyUnicode_FromStringAndSize(u, size);
584}
585
Guido van Rossumd57fd912000-03-10 22:53:23 +0000586#ifdef HAVE_WCHAR_H
587
Mark Dickinson081dfee2009-03-18 14:47:41 +0000588#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
589# define CONVERT_WCHAR_TO_SURROGATES
590#endif
591
592#ifdef CONVERT_WCHAR_TO_SURROGATES
593
594/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
595 to convert from UTF32 to UTF16. */
596
597PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
598 Py_ssize_t size)
599{
600 PyUnicodeObject *unicode;
601 register Py_ssize_t i;
602 Py_ssize_t alloc;
603 const wchar_t *orig_w;
604
605 if (w == NULL) {
606 if (size == 0)
607 return PyUnicode_FromStringAndSize(NULL, 0);
608 PyErr_BadInternalCall();
609 return NULL;
610 }
611
612 if (size == -1) {
613 size = wcslen(w);
614 }
615
616 alloc = size;
617 orig_w = w;
618 for (i = size; i > 0; i--) {
619 if (*w > 0xFFFF)
620 alloc++;
621 w++;
622 }
623 w = orig_w;
624 unicode = _PyUnicode_New(alloc);
625 if (!unicode)
626 return NULL;
627
628 /* Copy the wchar_t data into the new object */
629 {
630 register Py_UNICODE *u;
631 u = PyUnicode_AS_UNICODE(unicode);
632 for (i = size; i > 0; i--) {
633 if (*w > 0xFFFF) {
634 wchar_t ordinal = *w++;
635 ordinal -= 0x10000;
636 *u++ = 0xD800 | (ordinal >> 10);
637 *u++ = 0xDC00 | (ordinal & 0x3FF);
638 }
639 else
640 *u++ = *w++;
641 }
642 }
643 return (PyObject *)unicode;
644}
645
646#else
647
Guido van Rossumd57fd912000-03-10 22:53:23 +0000648PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Benjamin Peterson29060642009-01-31 22:14:21 +0000649 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000650{
651 PyUnicodeObject *unicode;
652
653 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +0000654 if (size == 0)
655 return PyUnicode_FromStringAndSize(NULL, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +0000656 PyErr_BadInternalCall();
657 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000658 }
659
Martin v. Löwis790465f2008-04-05 20:41:37 +0000660 if (size == -1) {
661 size = wcslen(w);
662 }
663
Guido van Rossumd57fd912000-03-10 22:53:23 +0000664 unicode = _PyUnicode_New(size);
665 if (!unicode)
666 return NULL;
667
668 /* Copy the wchar_t data into the new object */
669#ifdef HAVE_USABLE_WCHAR_T
670 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000671#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000672 {
Benjamin Peterson29060642009-01-31 22:14:21 +0000673 register Py_UNICODE *u;
674 register Py_ssize_t i;
675 u = PyUnicode_AS_UNICODE(unicode);
676 for (i = size; i > 0; i--)
677 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000678 }
679#endif
680
681 return (PyObject *)unicode;
682}
683
Mark Dickinson081dfee2009-03-18 14:47:41 +0000684#endif /* CONVERT_WCHAR_TO_SURROGATES */
685
686#undef CONVERT_WCHAR_TO_SURROGATES
687
Walter Dörwald346737f2007-05-31 10:44:43 +0000688static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000689makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
690 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +0000691{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000692 *fmt++ = '%';
693 if (width) {
694 if (zeropad)
695 *fmt++ = '0';
696 fmt += sprintf(fmt, "%d", width);
697 }
698 if (precision)
699 fmt += sprintf(fmt, ".%d", precision);
700 if (longflag)
701 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000702 else if (longlongflag) {
703 /* longlongflag should only ever be nonzero on machines with
704 HAVE_LONG_LONG defined */
705#ifdef HAVE_LONG_LONG
706 char *f = PY_FORMAT_LONG_LONG;
707 while (*f)
708 *fmt++ = *f++;
709#else
710 /* we shouldn't ever get here */
711 assert(0);
712 *fmt++ = 'l';
713#endif
714 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000715 else if (size_tflag) {
716 char *f = PY_FORMAT_SIZE_T;
717 while (*f)
718 *fmt++ = *f++;
719 }
720 *fmt++ = c;
721 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +0000722}
723
/* Append the NUL-terminated char string `string` at the output cursor,
   one output element per input char.

   The macro relies on two names being in scope where it is expanded:
     s    - the output cursor; it is advanced past the copied characters
     copy - a scratch `const char *` used as the read cursor

   It is wrapped in do { } while (0) so that `appendstring(x);` is exactly
   one statement and stays correct inside an unbraced if/else; the argument
   is parenthesized so that any expression can be passed. */
#define appendstring(string) \
    do { for (copy = (string); *copy;) *s++ = *copy++; } while (0)

/* size of fixed-size buffer for formatting single arguments */
#define ITEM_BUFFER_LEN 21
/* maximum number of characters required for output of %ld.  21 characters
   allows for 64-bit integers (in decimal) and an optional sign. */
#define MAX_LONG_CHARS 21
/* maximum number of characters required for output of %lld.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
735
Walter Dörwaldd2034312007-05-18 16:29:38 +0000736PyObject *
737PyUnicode_FromFormatV(const char *format, va_list vargs)
738{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000739 va_list count;
740 Py_ssize_t callcount = 0;
741 PyObject **callresults = NULL;
742 PyObject **callresult = NULL;
743 Py_ssize_t n = 0;
744 int width = 0;
745 int precision = 0;
746 int zeropad;
747 const char* f;
748 Py_UNICODE *s;
749 PyObject *string;
750 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000751 char buffer[ITEM_BUFFER_LEN+1];
Benjamin Peterson14339b62009-01-31 16:36:08 +0000752 /* use abuffer instead of buffer, if we need more space
753 * (which can happen if there's a format specifier with width). */
754 char *abuffer = NULL;
755 char *realbuffer;
756 Py_ssize_t abuffersize = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000757 char fmt[61]; /* should be enough for %0width.precisionlld */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000758 const char *copy;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000759
760#ifdef VA_LIST_IS_ARRAY
Benjamin Peterson14339b62009-01-31 16:36:08 +0000761 Py_MEMCPY(count, vargs, sizeof(va_list));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000762#else
763#ifdef __va_copy
Benjamin Peterson14339b62009-01-31 16:36:08 +0000764 __va_copy(count, vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000765#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000766 count = vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000767#endif
768#endif
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000769 /* step 1: count the number of %S/%R/%A/%s format specifications
770 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
771 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
772 * result in an array) */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000773 for (f = format; *f; f++) {
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000774 if (*f == '%') {
775 if (*(f+1)=='%')
776 continue;
777 if (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A')
778 ++callcount;
779 while (ISDIGIT((unsigned)*f))
780 width = (width*10) + *f++ - '0';
781 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
782 ;
783 if (*f == 's')
784 ++callcount;
785 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000786 }
787 /* step 2: allocate memory for the results of
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000788 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000789 if (callcount) {
790 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
791 if (!callresults) {
792 PyErr_NoMemory();
793 return NULL;
794 }
795 callresult = callresults;
796 }
797 /* step 3: figure out how large a buffer we need */
798 for (f = format; *f; f++) {
799 if (*f == '%') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000800#ifdef HAVE_LONG_LONG
801 int longlongflag = 0;
802#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +0000803 const char* p = f;
804 width = 0;
805 while (ISDIGIT((unsigned)*f))
806 width = (width*10) + *f++ - '0';
807 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
808 ;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000809
Benjamin Peterson14339b62009-01-31 16:36:08 +0000810 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
811 * they don't affect the amount of space we reserve.
812 */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000813 if (*f == 'l') {
814 if (f[1] == 'd' || f[1] == 'u') {
815 ++f;
816 }
817#ifdef HAVE_LONG_LONG
818 else if (f[1] == 'l' &&
819 (f[2] == 'd' || f[2] == 'u')) {
820 longlongflag = 1;
821 f += 2;
822 }
823#endif
824 }
825 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000826 ++f;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000827 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000828
Benjamin Peterson14339b62009-01-31 16:36:08 +0000829 switch (*f) {
830 case 'c':
831 (void)va_arg(count, int);
832 /* fall through... */
833 case '%':
834 n++;
835 break;
836 case 'd': case 'u': case 'i': case 'x':
837 (void) va_arg(count, int);
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000838#ifdef HAVE_LONG_LONG
839 if (longlongflag) {
840 if (width < MAX_LONG_LONG_CHARS)
841 width = MAX_LONG_LONG_CHARS;
842 }
843 else
844#endif
845 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
846 including sign. Decimal takes the most space. This
847 isn't enough for octal. If a width is specified we
848 need more (which we allocate later). */
849 if (width < MAX_LONG_CHARS)
850 width = MAX_LONG_CHARS;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000851 n += width;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000852 /* XXX should allow for large precision here too. */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000853 if (abuffersize < width)
854 abuffersize = width;
855 break;
856 case 's':
857 {
858 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +0000859 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000860 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
861 if (!str)
862 goto fail;
863 n += PyUnicode_GET_SIZE(str);
864 /* Remember the str and switch to the next slot */
865 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000866 break;
867 }
868 case 'U':
869 {
870 PyObject *obj = va_arg(count, PyObject *);
871 assert(obj && PyUnicode_Check(obj));
872 n += PyUnicode_GET_SIZE(obj);
873 break;
874 }
875 case 'V':
876 {
877 PyObject *obj = va_arg(count, PyObject *);
878 const char *str = va_arg(count, const char *);
879 assert(obj || str);
880 assert(!obj || PyUnicode_Check(obj));
881 if (obj)
882 n += PyUnicode_GET_SIZE(obj);
883 else
884 n += strlen(str);
885 break;
886 }
887 case 'S':
888 {
889 PyObject *obj = va_arg(count, PyObject *);
890 PyObject *str;
891 assert(obj);
892 str = PyObject_Str(obj);
893 if (!str)
894 goto fail;
895 n += PyUnicode_GET_SIZE(str);
896 /* Remember the str and switch to the next slot */
897 *callresult++ = str;
898 break;
899 }
900 case 'R':
901 {
902 PyObject *obj = va_arg(count, PyObject *);
903 PyObject *repr;
904 assert(obj);
905 repr = PyObject_Repr(obj);
906 if (!repr)
907 goto fail;
908 n += PyUnicode_GET_SIZE(repr);
909 /* Remember the repr and switch to the next slot */
910 *callresult++ = repr;
911 break;
912 }
913 case 'A':
914 {
915 PyObject *obj = va_arg(count, PyObject *);
916 PyObject *ascii;
917 assert(obj);
918 ascii = PyObject_ASCII(obj);
919 if (!ascii)
920 goto fail;
921 n += PyUnicode_GET_SIZE(ascii);
922 /* Remember the repr and switch to the next slot */
923 *callresult++ = ascii;
924 break;
925 }
926 case 'p':
927 (void) va_arg(count, int);
928 /* maximum 64-bit pointer representation:
929 * 0xffffffffffffffff
930 * so 19 characters is enough.
931 * XXX I count 18 -- what's the extra for?
932 */
933 n += 19;
934 break;
935 default:
936 /* if we stumble upon an unknown
937 formatting code, copy the rest of
938 the format string to the output
939 string. (we cannot just skip the
940 code, since there's no way to know
941 what's in the argument list) */
942 n += strlen(p);
943 goto expand;
944 }
945 } else
946 n++;
947 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000948 expand:
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000949 if (abuffersize > ITEM_BUFFER_LEN) {
950 /* add 1 for sprintf's trailing null byte */
951 abuffer = PyObject_Malloc(abuffersize + 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +0000952 if (!abuffer) {
953 PyErr_NoMemory();
954 goto fail;
955 }
956 realbuffer = abuffer;
957 }
958 else
959 realbuffer = buffer;
960 /* step 4: fill the buffer */
961 /* Since we've analyzed how much space we need for the worst case,
962 we don't have to resize the string.
963 There can be no errors beyond this point. */
964 string = PyUnicode_FromUnicode(NULL, n);
965 if (!string)
966 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000967
Benjamin Peterson14339b62009-01-31 16:36:08 +0000968 s = PyUnicode_AS_UNICODE(string);
969 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000970
Benjamin Peterson14339b62009-01-31 16:36:08 +0000971 for (f = format; *f; f++) {
972 if (*f == '%') {
973 const char* p = f++;
974 int longflag = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000975 int longlongflag = 0;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000976 int size_tflag = 0;
977 zeropad = (*f == '0');
978 /* parse the width.precision part */
979 width = 0;
980 while (ISDIGIT((unsigned)*f))
981 width = (width*10) + *f++ - '0';
982 precision = 0;
983 if (*f == '.') {
984 f++;
985 while (ISDIGIT((unsigned)*f))
986 precision = (precision*10) + *f++ - '0';
987 }
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000988 /* Handle %ld, %lu, %lld and %llu. */
989 if (*f == 'l') {
990 if (f[1] == 'd' || f[1] == 'u') {
991 longflag = 1;
992 ++f;
993 }
994#ifdef HAVE_LONG_LONG
995 else if (f[1] == 'l' &&
996 (f[2] == 'd' || f[2] == 'u')) {
997 longlongflag = 1;
998 f += 2;
999 }
1000#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001001 }
1002 /* handle the size_t flag. */
1003 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
1004 size_tflag = 1;
1005 ++f;
1006 }
Walter Dörwaldd2034312007-05-18 16:29:38 +00001007
Benjamin Peterson14339b62009-01-31 16:36:08 +00001008 switch (*f) {
1009 case 'c':
1010 *s++ = va_arg(vargs, int);
1011 break;
1012 case 'd':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001013 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1014 width, precision, 'd');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001015 if (longflag)
1016 sprintf(realbuffer, fmt, va_arg(vargs, long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001017#ifdef HAVE_LONG_LONG
1018 else if (longlongflag)
1019 sprintf(realbuffer, fmt, va_arg(vargs, PY_LONG_LONG));
1020#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001021 else if (size_tflag)
1022 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
1023 else
1024 sprintf(realbuffer, fmt, va_arg(vargs, int));
1025 appendstring(realbuffer);
1026 break;
1027 case 'u':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001028 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1029 width, precision, 'u');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001030 if (longflag)
1031 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001032#ifdef HAVE_LONG_LONG
1033 else if (longlongflag)
1034 sprintf(realbuffer, fmt, va_arg(vargs,
1035 unsigned PY_LONG_LONG));
1036#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001037 else if (size_tflag)
1038 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
1039 else
1040 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
1041 appendstring(realbuffer);
1042 break;
1043 case 'i':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001044 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'i');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001045 sprintf(realbuffer, fmt, va_arg(vargs, int));
1046 appendstring(realbuffer);
1047 break;
1048 case 'x':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001049 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001050 sprintf(realbuffer, fmt, va_arg(vargs, int));
1051 appendstring(realbuffer);
1052 break;
1053 case 's':
1054 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001055 /* unused, since we already have the result */
1056 (void) va_arg(vargs, char *);
1057 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
1058 PyUnicode_GET_SIZE(*callresult));
1059 s += PyUnicode_GET_SIZE(*callresult);
1060 /* We're done with the unicode()/repr() => forget it */
1061 Py_DECREF(*callresult);
1062 /* switch to next unicode()/repr() result */
1063 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001064 break;
1065 }
1066 case 'U':
1067 {
1068 PyObject *obj = va_arg(vargs, PyObject *);
1069 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1070 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1071 s += size;
1072 break;
1073 }
1074 case 'V':
1075 {
1076 PyObject *obj = va_arg(vargs, PyObject *);
1077 const char *str = va_arg(vargs, const char *);
1078 if (obj) {
1079 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1080 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1081 s += size;
1082 } else {
1083 appendstring(str);
1084 }
1085 break;
1086 }
1087 case 'S':
1088 case 'R':
1089 {
1090 Py_UNICODE *ucopy;
1091 Py_ssize_t usize;
1092 Py_ssize_t upos;
1093 /* unused, since we already have the result */
1094 (void) va_arg(vargs, PyObject *);
1095 ucopy = PyUnicode_AS_UNICODE(*callresult);
1096 usize = PyUnicode_GET_SIZE(*callresult);
1097 for (upos = 0; upos<usize;)
1098 *s++ = ucopy[upos++];
1099 /* We're done with the unicode()/repr() => forget it */
1100 Py_DECREF(*callresult);
1101 /* switch to next unicode()/repr() result */
1102 ++callresult;
1103 break;
1104 }
1105 case 'p':
1106 sprintf(buffer, "%p", va_arg(vargs, void*));
1107 /* %p is ill-defined: ensure leading 0x. */
1108 if (buffer[1] == 'X')
1109 buffer[1] = 'x';
1110 else if (buffer[1] != 'x') {
1111 memmove(buffer+2, buffer, strlen(buffer)+1);
1112 buffer[0] = '0';
1113 buffer[1] = 'x';
1114 }
1115 appendstring(buffer);
1116 break;
1117 case '%':
1118 *s++ = '%';
1119 break;
1120 default:
1121 appendstring(p);
1122 goto end;
1123 }
1124 } else
1125 *s++ = *f;
1126 }
Walter Dörwaldd2034312007-05-18 16:29:38 +00001127
Benjamin Peterson29060642009-01-31 22:14:21 +00001128 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001129 if (callresults)
1130 PyObject_Free(callresults);
1131 if (abuffer)
1132 PyObject_Free(abuffer);
1133 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1134 return string;
Benjamin Peterson29060642009-01-31 22:14:21 +00001135 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001136 if (callresults) {
1137 PyObject **callresult2 = callresults;
1138 while (callresult2 < callresult) {
1139 Py_DECREF(*callresult2);
1140 ++callresult2;
1141 }
1142 PyObject_Free(callresults);
1143 }
1144 if (abuffer)
1145 PyObject_Free(abuffer);
1146 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001147}
1148
1149#undef appendstring
1150
1151PyObject *
1152PyUnicode_FromFormat(const char *format, ...)
1153{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001154 PyObject* ret;
1155 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001156
1157#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00001158 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001159#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001160 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001161#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001162 ret = PyUnicode_FromFormatV(format, vargs);
1163 va_end(vargs);
1164 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001165}
1166
Martin v. Löwis18e16552006-02-15 17:27:45 +00001167Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00001168 wchar_t *w,
1169 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001170{
1171 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001172 PyErr_BadInternalCall();
1173 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001174 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001175
1176 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001177 if (size > PyUnicode_GET_SIZE(unicode))
Benjamin Peterson29060642009-01-31 22:14:21 +00001178 size = PyUnicode_GET_SIZE(unicode) + 1;
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001179
Guido van Rossumd57fd912000-03-10 22:53:23 +00001180#ifdef HAVE_USABLE_WCHAR_T
1181 memcpy(w, unicode->str, size * sizeof(wchar_t));
1182#else
1183 {
Benjamin Peterson29060642009-01-31 22:14:21 +00001184 register Py_UNICODE *u;
1185 register Py_ssize_t i;
1186 u = PyUnicode_AS_UNICODE(unicode);
1187 for (i = size; i > 0; i--)
1188 *w++ = *u++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001189 }
1190#endif
1191
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001192 if (size > PyUnicode_GET_SIZE(unicode))
1193 return PyUnicode_GET_SIZE(unicode);
1194 else
Benjamin Peterson29060642009-01-31 22:14:21 +00001195 return size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001196}
1197
1198#endif
1199
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001200PyObject *PyUnicode_FromOrdinal(int ordinal)
1201{
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001202 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001203
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001204 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001205 PyErr_SetString(PyExc_ValueError,
1206 "chr() arg not in range(0x110000)");
1207 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001208 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001209
1210#ifndef Py_UNICODE_WIDE
1211 if (ordinal > 0xffff) {
1212 ordinal -= 0x10000;
1213 s[0] = 0xD800 | (ordinal >> 10);
1214 s[1] = 0xDC00 | (ordinal & 0x3FF);
1215 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001216 }
1217#endif
1218
Hye-Shik Chang40574832004-04-06 07:24:51 +00001219 s[0] = (Py_UNICODE)ordinal;
1220 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001221}
1222
Guido van Rossumd57fd912000-03-10 22:53:23 +00001223PyObject *PyUnicode_FromObject(register PyObject *obj)
1224{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001225 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00001226 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001227 if (PyUnicode_CheckExact(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001228 Py_INCREF(obj);
1229 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001230 }
1231 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001232 /* For a Unicode subtype that's not a Unicode object,
1233 return a true Unicode object with the same data. */
1234 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1235 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001236 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001237 PyErr_Format(PyExc_TypeError,
1238 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00001239 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001240 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001241}
1242
1243PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00001244 const char *encoding,
1245 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001246{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001247 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001248 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001249 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001250
Guido van Rossumd57fd912000-03-10 22:53:23 +00001251 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001252 PyErr_BadInternalCall();
1253 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001254 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001255
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001256 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001257 PyErr_SetString(PyExc_TypeError,
1258 "decoding str is not supported");
1259 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001260 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001261
1262 /* Coerce object */
Christian Heimes72b710a2008-05-26 13:28:38 +00001263 if (PyBytes_Check(obj)) {
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001264 s = PyBytes_AS_STRING(obj);
1265 len = PyBytes_GET_SIZE(obj);
1266 }
1267 else if (PyByteArray_Check(obj)) {
1268 s = PyByteArray_AS_STRING(obj);
1269 len = PyByteArray_GET_SIZE(obj);
1270 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001271 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001272 /* Overwrite the error message with something more useful in
1273 case of a TypeError. */
1274 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001275 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001276 "coercing to str: need string or buffer, "
1277 "%.80s found",
1278 Py_TYPE(obj)->tp_name);
1279 goto onError;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001280 }
Tim Petersced69f82003-09-16 20:30:58 +00001281
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001282 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001283 if (len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001284 Py_INCREF(unicode_empty);
1285 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001286 }
Tim Petersced69f82003-09-16 20:30:58 +00001287 else
Benjamin Peterson29060642009-01-31 22:14:21 +00001288 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001289
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001290 return v;
1291
Benjamin Peterson29060642009-01-31 22:14:21 +00001292 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001293 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001294}
1295
1296PyObject *PyUnicode_Decode(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001297 Py_ssize_t size,
1298 const char *encoding,
1299 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001300{
1301 PyObject *buffer = NULL, *unicode;
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001302 Py_buffer info;
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001303 char lower[20]; /* Enough for any encoding name we recognize */
1304 char *l;
1305 const char *e;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001306
1307 if (encoding == NULL)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001308 encoding = PyUnicode_GetDefaultEncoding();
1309
1310 /* Convert encoding to lower case and replace '_' with '-' in order to
1311 catch e.g. UTF_8 */
1312 e = encoding;
1313 l = lower;
1314 while (*e && l < &lower[(sizeof lower) - 2]) {
1315 if (ISUPPER(*e)) {
1316 *l++ = TOLOWER(*e++);
1317 }
1318 else if (*e == '_') {
1319 *l++ = '-';
1320 e++;
1321 }
1322 else {
1323 *l++ = *e++;
1324 }
1325 }
1326 *l = '\0';
Fred Drakee4315f52000-05-09 19:53:39 +00001327
1328 /* Shortcuts for common default encodings */
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001329 if (strcmp(lower, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001330 return PyUnicode_DecodeUTF8(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001331 else if ((strcmp(lower, "latin-1") == 0) ||
1332 (strcmp(lower, "iso-8859-1") == 0))
Fred Drakee4315f52000-05-09 19:53:39 +00001333 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001334#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001335 else if (strcmp(lower, "mbcs") == 0)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001336 return PyUnicode_DecodeMBCS(s, size, errors);
1337#endif
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001338 else if (strcmp(lower, "ascii") == 0)
Fred Drakee4315f52000-05-09 19:53:39 +00001339 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001340 else if (strcmp(lower, "utf-16") == 0)
1341 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1342 else if (strcmp(lower, "utf-32") == 0)
1343 return PyUnicode_DecodeUTF32(s, size, errors, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001344
1345 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001346 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00001347 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001348 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00001349 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001350 if (buffer == NULL)
1351 goto onError;
1352 unicode = PyCodec_Decode(buffer, encoding, errors);
1353 if (unicode == NULL)
1354 goto onError;
1355 if (!PyUnicode_Check(unicode)) {
1356 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001357 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00001358 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001359 Py_DECREF(unicode);
1360 goto onError;
1361 }
1362 Py_DECREF(buffer);
1363 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001364
Benjamin Peterson29060642009-01-31 22:14:21 +00001365 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001366 Py_XDECREF(buffer);
1367 return NULL;
1368}
1369
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001370PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1371 const char *encoding,
1372 const char *errors)
1373{
1374 PyObject *v;
1375
1376 if (!PyUnicode_Check(unicode)) {
1377 PyErr_BadArgument();
1378 goto onError;
1379 }
1380
1381 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001382 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001383
1384 /* Decode via the codec registry */
1385 v = PyCodec_Decode(unicode, encoding, errors);
1386 if (v == NULL)
1387 goto onError;
1388 return v;
1389
Benjamin Peterson29060642009-01-31 22:14:21 +00001390 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001391 return NULL;
1392}
1393
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001394PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode,
1395 const char *encoding,
1396 const char *errors)
1397{
1398 PyObject *v;
1399
1400 if (!PyUnicode_Check(unicode)) {
1401 PyErr_BadArgument();
1402 goto onError;
1403 }
1404
1405 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001406 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001407
1408 /* Decode via the codec registry */
1409 v = PyCodec_Decode(unicode, encoding, errors);
1410 if (v == NULL)
1411 goto onError;
1412 if (!PyUnicode_Check(v)) {
1413 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001414 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001415 Py_TYPE(v)->tp_name);
1416 Py_DECREF(v);
1417 goto onError;
1418 }
1419 return v;
1420
Benjamin Peterson29060642009-01-31 22:14:21 +00001421 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001422 return NULL;
1423}
1424
Guido van Rossumd57fd912000-03-10 22:53:23 +00001425PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001426 Py_ssize_t size,
1427 const char *encoding,
1428 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001429{
1430 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001431
Guido van Rossumd57fd912000-03-10 22:53:23 +00001432 unicode = PyUnicode_FromUnicode(s, size);
1433 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001434 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001435 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1436 Py_DECREF(unicode);
1437 return v;
1438}
1439
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001440PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1441 const char *encoding,
1442 const char *errors)
1443{
1444 PyObject *v;
1445
1446 if (!PyUnicode_Check(unicode)) {
1447 PyErr_BadArgument();
1448 goto onError;
1449 }
1450
1451 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001452 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001453
1454 /* Encode via the codec registry */
1455 v = PyCodec_Encode(unicode, encoding, errors);
1456 if (v == NULL)
1457 goto onError;
1458 return v;
1459
Benjamin Peterson29060642009-01-31 22:14:21 +00001460 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001461 return NULL;
1462}
1463
Guido van Rossumd57fd912000-03-10 22:53:23 +00001464PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1465 const char *encoding,
1466 const char *errors)
1467{
1468 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001469
Guido van Rossumd57fd912000-03-10 22:53:23 +00001470 if (!PyUnicode_Check(unicode)) {
1471 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001472 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001473 }
Fred Drakee4315f52000-05-09 19:53:39 +00001474
Tim Petersced69f82003-09-16 20:30:58 +00001475 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001476 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001477
1478 /* Shortcuts for common default encodings */
Victor Stinner59e62db2010-05-15 13:14:32 +00001479 if (strcmp(encoding, "utf-8") == 0)
1480 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1481 PyUnicode_GET_SIZE(unicode),
1482 errors);
1483 else if (strcmp(encoding, "latin-1") == 0)
1484 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1485 PyUnicode_GET_SIZE(unicode),
1486 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001487#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinner59e62db2010-05-15 13:14:32 +00001488 else if (strcmp(encoding, "mbcs") == 0)
1489 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1490 PyUnicode_GET_SIZE(unicode),
1491 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001492#endif
Victor Stinner59e62db2010-05-15 13:14:32 +00001493 else if (strcmp(encoding, "ascii") == 0)
1494 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1495 PyUnicode_GET_SIZE(unicode),
1496 errors);
1497 /* During bootstrap, we may need to find the encodings
1498 package, to load the file system encoding, and require the
1499 file system encoding in order to load the encodings
1500 package.
Christian Heimes6a27efa2008-10-30 21:48:26 +00001501
Victor Stinner59e62db2010-05-15 13:14:32 +00001502 Break out of this dependency by assuming that the path to
1503 the encodings module is ASCII-only. XXX could try wcstombs
1504 instead, if the file system encoding is the locale's
1505 encoding. */
1506 else if (Py_FileSystemDefaultEncoding &&
1507 strcmp(encoding, Py_FileSystemDefaultEncoding) == 0 &&
1508 !PyThreadState_GET()->interp->codecs_initialized)
1509 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1510 PyUnicode_GET_SIZE(unicode),
1511 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001512
1513 /* Encode via the codec registry */
1514 v = PyCodec_Encode(unicode, encoding, errors);
1515 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001516 return NULL;
1517
1518 /* The normal path */
1519 if (PyBytes_Check(v))
1520 return v;
1521
1522 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001523 if (PyByteArray_Check(v)) {
1524 char msg[100];
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001525 PyObject *b;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001526 PyOS_snprintf(msg, sizeof(msg),
1527 "encoder %s returned buffer instead of bytes",
1528 encoding);
1529 if (PyErr_WarnEx(PyExc_RuntimeWarning, msg, 1) < 0) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001530 Py_DECREF(v);
1531 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001532 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001533
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001534 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1535 Py_DECREF(v);
1536 return b;
1537 }
1538
1539 PyErr_Format(PyExc_TypeError,
1540 "encoder did not return a bytes object (type=%.400s)",
1541 Py_TYPE(v)->tp_name);
1542 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001543 return NULL;
1544}
1545
1546PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode,
1547 const char *encoding,
1548 const char *errors)
1549{
1550 PyObject *v;
1551
1552 if (!PyUnicode_Check(unicode)) {
1553 PyErr_BadArgument();
1554 goto onError;
1555 }
1556
1557 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001558 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001559
1560 /* Encode via the codec registry */
1561 v = PyCodec_Encode(unicode, encoding, errors);
1562 if (v == NULL)
1563 goto onError;
1564 if (!PyUnicode_Check(v)) {
1565 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001566 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001567 Py_TYPE(v)->tp_name);
1568 Py_DECREF(v);
1569 goto onError;
1570 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001571 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001572
Benjamin Peterson29060642009-01-31 22:14:21 +00001573 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001574 return NULL;
1575}
1576
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001577PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00001578 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001579{
1580 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001581 if (v)
1582 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001583 if (errors != NULL)
1584 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum98297ee2007-11-06 21:34:58 +00001585 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Guido van Rossum06610092007-08-16 21:02:22 +00001586 PyUnicode_GET_SIZE(unicode),
1587 NULL);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001588 if (!v)
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001589 return NULL;
Guido van Rossume7a0d392007-07-12 07:53:00 +00001590 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001591 return v;
1592}
1593
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001594PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00001595PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001596 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00001597 return PyUnicode_DecodeFSDefaultAndSize(s, size);
1598}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001599
Christian Heimes5894ba72007-11-04 11:43:14 +00001600PyObject*
1601PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1602{
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001603 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1604 can be undefined. If it is case, decode using UTF-8. The following assumes
1605 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1606 bootstrapping process where the codecs aren't ready yet.
1607 */
1608 if (Py_FileSystemDefaultEncoding) {
1609#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Christian Heimes5894ba72007-11-04 11:43:14 +00001610 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0) {
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00001611 return PyUnicode_DecodeMBCS(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001612 }
1613#elif defined(__APPLE__)
Christian Heimes5894ba72007-11-04 11:43:14 +00001614 if (strcmp(Py_FileSystemDefaultEncoding, "utf-8") == 0) {
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00001615 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001616 }
1617#endif
1618 return PyUnicode_Decode(s, size,
1619 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00001620 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001621 }
1622 else {
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00001623 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001624 }
1625}
1626
Martin v. Löwis011e8422009-05-05 04:43:17 +00001627/* Convert the argument to a bytes object, according to the file
Gregory P. Smithcc47d8c2010-02-27 08:33:11 +00001628 system encoding. The addr param must be a PyObject**.
1629 This is designed to be used with "O&" in PyArg_Parse APIs. */
Martin v. Löwis011e8422009-05-05 04:43:17 +00001630
1631int
1632PyUnicode_FSConverter(PyObject* arg, void* addr)
1633{
1634 PyObject *output = NULL;
1635 Py_ssize_t size;
1636 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001637 if (arg == NULL) {
1638 Py_DECREF(*(PyObject**)addr);
1639 return 1;
1640 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00001641 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00001642 output = arg;
1643 Py_INCREF(output);
1644 }
1645 else {
1646 arg = PyUnicode_FromObject(arg);
1647 if (!arg)
1648 return 0;
Victor Stinner0ea2a462010-04-30 00:22:08 +00001649 output = PyUnicode_AsEncodedObject(arg,
Martin v. Löwis011e8422009-05-05 04:43:17 +00001650 Py_FileSystemDefaultEncoding,
Martin v. Löwis43c57782009-05-10 08:15:24 +00001651 "surrogateescape");
Martin v. Löwis011e8422009-05-05 04:43:17 +00001652 Py_DECREF(arg);
1653 if (!output)
1654 return 0;
1655 if (!PyBytes_Check(output)) {
1656 Py_DECREF(output);
1657 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
1658 return 0;
1659 }
1660 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00001661 size = PyBytes_GET_SIZE(output);
1662 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001663 if (size != strlen(data)) {
1664 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1665 Py_DECREF(output);
1666 return 0;
1667 }
1668 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001669 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001670}
1671
1672
Martin v. Löwis5b222132007-06-10 09:51:05 +00001673char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001674_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001675{
Christian Heimesf3863112007-11-22 07:46:41 +00001676 PyObject *bytes;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001677 if (!PyUnicode_Check(unicode)) {
1678 PyErr_BadArgument();
1679 return NULL;
1680 }
Christian Heimesf3863112007-11-22 07:46:41 +00001681 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1682 if (bytes == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001683 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001684 if (psize != NULL)
Christian Heimes72b710a2008-05-26 13:28:38 +00001685 *psize = PyBytes_GET_SIZE(bytes);
1686 return PyBytes_AS_STRING(bytes);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001687}
1688
1689char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001690_PyUnicode_AsString(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001691{
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001692 return _PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001693}
1694
Guido van Rossumd57fd912000-03-10 22:53:23 +00001695Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1696{
1697 if (!PyUnicode_Check(unicode)) {
1698 PyErr_BadArgument();
1699 goto onError;
1700 }
1701 return PyUnicode_AS_UNICODE(unicode);
1702
Benjamin Peterson29060642009-01-31 22:14:21 +00001703 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001704 return NULL;
1705}
1706
Martin v. Löwis18e16552006-02-15 17:27:45 +00001707Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001708{
1709 if (!PyUnicode_Check(unicode)) {
1710 PyErr_BadArgument();
1711 goto onError;
1712 }
1713 return PyUnicode_GET_SIZE(unicode);
1714
Benjamin Peterson29060642009-01-31 22:14:21 +00001715 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001716 return -1;
1717}
1718
Thomas Wouters78890102000-07-22 19:25:51 +00001719const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001720{
1721 return unicode_default_encoding;
1722}
1723
1724int PyUnicode_SetDefaultEncoding(const char *encoding)
1725{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001726 if (strcmp(encoding, unicode_default_encoding) != 0) {
1727 PyErr_Format(PyExc_ValueError,
1728 "Can only set default encoding to %s",
1729 unicode_default_encoding);
1730 return -1;
1731 }
Fred Drakee4315f52000-05-09 19:53:39 +00001732 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +00001733}
1734
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001735/* error handling callback helper:
1736 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001737 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001738 and adjust various state variables.
1739 return 0 on success, -1 on error
1740*/
1741
1742static
1743int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00001744 const char *encoding, const char *reason,
1745 const char **input, const char **inend, Py_ssize_t *startinpos,
1746 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1747 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001748{
Benjamin Peterson142957c2008-07-04 19:55:29 +00001749 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001750
1751 PyObject *restuple = NULL;
1752 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001753 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001754 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001755 Py_ssize_t requiredsize;
1756 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001757 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001758 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001759 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001760 int res = -1;
1761
1762 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001763 *errorHandler = PyCodec_LookupError(errors);
1764 if (*errorHandler == NULL)
1765 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001766 }
1767
1768 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00001769 *exceptionObject = PyUnicodeDecodeError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00001770 encoding, *input, *inend-*input, *startinpos, *endinpos, reason);
1771 if (*exceptionObject == NULL)
1772 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001773 }
1774 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00001775 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1776 goto onError;
1777 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1778 goto onError;
1779 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1780 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001781 }
1782
1783 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1784 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001785 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001786 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00001787 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00001788 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001789 }
1790 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00001791 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001792
1793 /* Copy back the bytes variables, which might have been modified by the
1794 callback */
1795 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
1796 if (!inputobj)
1797 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00001798 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001799 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00001800 }
Christian Heimes72b710a2008-05-26 13:28:38 +00001801 *input = PyBytes_AS_STRING(inputobj);
1802 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001803 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00001804 /* we can DECREF safely, as the exception has another reference,
1805 so the object won't go away. */
1806 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001807
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001808 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001809 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001810 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001811 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1812 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001813 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001814
1815 /* need more space? (at least enough for what we
1816 have+the replacement+the rest of the string (starting
1817 at the new input position), so we won't have to check space
1818 when there are no errors in the rest of the string) */
1819 repptr = PyUnicode_AS_UNICODE(repunicode);
1820 repsize = PyUnicode_GET_SIZE(repunicode);
1821 requiredsize = *outpos + repsize + insize-newpos;
1822 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001823 if (requiredsize<2*outsize)
1824 requiredsize = 2*outsize;
1825 if (_PyUnicode_Resize(output, requiredsize) < 0)
1826 goto onError;
1827 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001828 }
1829 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001830 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001831 Py_UNICODE_COPY(*outptr, repptr, repsize);
1832 *outptr += repsize;
1833 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001834
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001835 /* we made it! */
1836 res = 0;
1837
Benjamin Peterson29060642009-01-31 22:14:21 +00001838 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001839 Py_XDECREF(restuple);
1840 return res;
1841}
1842
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001843/* --- UTF-7 Codec -------------------------------------------------------- */
1844
Antoine Pitrou244651a2009-05-04 18:56:13 +00001845/* See RFC2152 for details. We encode conservatively and decode liberally. */
1846
1847/* Three simple macros defining base-64. */
1848
/* True if c is one of the 64 base-64 alphabet characters
   (A-Z, a-z, 0-9, '+', '/').  Note: c is evaluated more than once. */

#define IS_BASE64(c)                    \
    ((c) == '+' || (c) == '/' ||        \
     ((c) >= '0' && (c) <= '9') ||      \
     ((c) >= 'a' && (c) <= 'z') ||      \
     ((c) >= 'A' && (c) <= 'Z'))
1856
/* Map a base-64 character to its 6-bit value ('A'..'Z' -> 0..25,
   'a'..'z' -> 26..51, '0'..'9' -> 52..61, '+' -> 62, anything else -> 63).
   Only meaningful when IS_BASE64(c) holds; c is evaluated more than once. */

#define FROM_BASE64(c)                                  \
    ((c) == '+' ? 62 :                                  \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : 63)
1864
/* Base-64 character for the low 6 bits of n (higher bits are ignored). */

#define TO_BASE64(n)                    \
    (("ABCDEFGHIJKLMNOPQRSTUVWXYZ"      \
      "abcdefghijklmnopqrstuvwxyz"      \
      "0123456789+/")[(n) & 63])
1869
/* DECODE_DIRECT: a byte found in UTF-7 input that decodes as itself.
 * Decoding is permissive: every ASCII byte (0..127) qualifies except '+',
 * which introduces a base-64 section. */

#define DECODE_DIRECT(c)                \
    ((c) != '+' && (c) < 128)
1877
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by ASCII code (0..127) and is read-only, hence const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
1911
/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c        : character to classify; must be in 1..127 to qualify at all.
 * directO  : non-zero if Set O characters (category 1) are written directly.
 * directWS : non-zero if whitespace (category 2) is written directly.
 *
 * All three arguments are parenthesized in the expansion so that compound
 * expressions can be passed safely; c is evaluated more than once. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001923
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001924PyObject *PyUnicode_DecodeUTF7(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001925 Py_ssize_t size,
1926 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001927{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001928 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1929}
1930
Antoine Pitrou244651a2009-05-04 18:56:13 +00001931/* The decoder. The only state we preserve is our read position,
1932 * i.e. how many characters we have consumed. So if we end in the
1933 * middle of a shift sequence we have to back off the read position
1934 * and the output to the beginning of the sequence, otherwise we lose
1935 * all the shift state (seen bits, number of bits seen, high
1936 * surrogate). */
1937
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001938PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001939 Py_ssize_t size,
1940 const char *errors,
1941 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001942{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001943 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001944 Py_ssize_t startinpos;
1945 Py_ssize_t endinpos;
1946 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001947 const char *e;
1948 PyUnicodeObject *unicode;
1949 Py_UNICODE *p;
1950 const char *errmsg = "";
1951 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00001952 Py_UNICODE *shiftOutStart;
1953 unsigned int base64bits = 0;
1954 unsigned long base64buffer = 0;
1955 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001956 PyObject *errorHandler = NULL;
1957 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001958
1959 unicode = _PyUnicode_New(size);
1960 if (!unicode)
1961 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001962 if (size == 0) {
1963 if (consumed)
1964 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001965 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001966 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001967
1968 p = unicode->str;
Antoine Pitrou244651a2009-05-04 18:56:13 +00001969 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001970 e = s + size;
1971
1972 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001973 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00001974 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00001975 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001976
Antoine Pitrou244651a2009-05-04 18:56:13 +00001977 if (inShift) { /* in a base-64 section */
1978 if (IS_BASE64(ch)) { /* consume a base-64 character */
1979 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
1980 base64bits += 6;
1981 s++;
1982 if (base64bits >= 16) {
1983 /* we have enough bits for a UTF-16 value */
1984 Py_UNICODE outCh = (Py_UNICODE)
1985 (base64buffer >> (base64bits-16));
1986 base64bits -= 16;
1987 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
1988 if (surrogate) {
1989 /* expecting a second surrogate */
1990 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
1991#ifdef Py_UNICODE_WIDE
1992 *p++ = (((surrogate & 0x3FF)<<10)
1993 | (outCh & 0x3FF)) + 0x10000;
1994#else
1995 *p++ = surrogate;
1996 *p++ = outCh;
1997#endif
1998 surrogate = 0;
1999 }
2000 else {
2001 surrogate = 0;
2002 errmsg = "second surrogate missing";
2003 goto utf7Error;
2004 }
2005 }
2006 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
2007 /* first surrogate */
2008 surrogate = outCh;
2009 }
2010 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2011 errmsg = "unexpected second surrogate";
2012 goto utf7Error;
2013 }
2014 else {
2015 *p++ = outCh;
2016 }
2017 }
2018 }
2019 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002020 inShift = 0;
2021 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002022 if (surrogate) {
2023 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00002024 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002025 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002026 if (base64bits > 0) { /* left-over bits */
2027 if (base64bits >= 6) {
2028 /* We've seen at least one base-64 character */
2029 errmsg = "partial character in shift sequence";
2030 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002031 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002032 else {
2033 /* Some bits remain; they should be zero */
2034 if (base64buffer != 0) {
2035 errmsg = "non-zero padding bits in shift sequence";
2036 goto utf7Error;
2037 }
2038 }
2039 }
2040 if (ch != '-') {
2041 /* '-' is absorbed; other terminating
2042 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002043 *p++ = ch;
2044 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002045 }
2046 }
2047 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002048 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002049 s++; /* consume '+' */
2050 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002051 s++;
2052 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00002053 }
2054 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002055 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002056 shiftOutStart = p;
2057 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002058 }
2059 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002060 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002061 *p++ = ch;
2062 s++;
2063 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002064 else {
2065 startinpos = s-starts;
2066 s++;
2067 errmsg = "unexpected special character";
2068 goto utf7Error;
2069 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002070 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002071utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002072 outpos = p-PyUnicode_AS_UNICODE(unicode);
2073 endinpos = s-starts;
2074 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00002075 errors, &errorHandler,
2076 "utf7", errmsg,
2077 &starts, &e, &startinpos, &endinpos, &exc, &s,
2078 &unicode, &outpos, &p))
2079 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002080 }
2081
Antoine Pitrou244651a2009-05-04 18:56:13 +00002082 /* end of string */
2083
2084 if (inShift && !consumed) { /* in shift sequence, no more to follow */
2085 /* if we're in an inconsistent state, that's an error */
2086 if (surrogate ||
2087 (base64bits >= 6) ||
2088 (base64bits > 0 && base64buffer != 0)) {
2089 outpos = p-PyUnicode_AS_UNICODE(unicode);
2090 endinpos = size;
2091 if (unicode_decode_call_errorhandler(
2092 errors, &errorHandler,
2093 "utf7", "unterminated shift sequence",
2094 &starts, &e, &startinpos, &endinpos, &exc, &s,
2095 &unicode, &outpos, &p))
2096 goto onError;
2097 if (s < e)
2098 goto restart;
2099 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002100 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002101
2102 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002103 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00002104 if (inShift) {
2105 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002106 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002107 }
2108 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002109 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002110 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002111 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002112
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002113 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002114 goto onError;
2115
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002116 Py_XDECREF(errorHandler);
2117 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002118 return (PyObject *)unicode;
2119
Benjamin Peterson29060642009-01-31 22:14:21 +00002120 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002121 Py_XDECREF(errorHandler);
2122 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002123 Py_DECREF(unicode);
2124 return NULL;
2125}
2126
2127
2128PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002129 Py_ssize_t size,
Antoine Pitrou244651a2009-05-04 18:56:13 +00002130 int base64SetO,
2131 int base64WhiteSpace,
Benjamin Peterson29060642009-01-31 22:14:21 +00002132 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002133{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002134 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002135 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002136 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002137 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002138 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002139 unsigned int base64bits = 0;
2140 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002141 char * out;
2142 char * start;
2143
2144 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002145 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002146
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002147 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002148 return PyErr_NoMemory();
2149
Antoine Pitrou244651a2009-05-04 18:56:13 +00002150 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002151 if (v == NULL)
2152 return NULL;
2153
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002154 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002155 for (;i < size; ++i) {
2156 Py_UNICODE ch = s[i];
2157
Antoine Pitrou244651a2009-05-04 18:56:13 +00002158 if (inShift) {
2159 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2160 /* shifting out */
2161 if (base64bits) { /* output remaining bits */
2162 *out++ = TO_BASE64(base64buffer << (6-base64bits));
2163 base64buffer = 0;
2164 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002165 }
2166 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002167 /* Characters not in the BASE64 set implicitly unshift the sequence
2168 so no '-' is required, except if the character is itself a '-' */
2169 if (IS_BASE64(ch) || ch == '-') {
2170 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002171 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002172 *out++ = (char) ch;
2173 }
2174 else {
2175 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00002176 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002177 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002178 else { /* not in a shift sequence */
2179 if (ch == '+') {
2180 *out++ = '+';
2181 *out++ = '-';
2182 }
2183 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2184 *out++ = (char) ch;
2185 }
2186 else {
2187 *out++ = '+';
2188 inShift = 1;
2189 goto encode_char;
2190 }
2191 }
2192 continue;
2193encode_char:
2194#ifdef Py_UNICODE_WIDE
2195 if (ch >= 0x10000) {
2196 /* code first surrogate */
2197 base64bits += 16;
2198 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
2199 while (base64bits >= 6) {
2200 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2201 base64bits -= 6;
2202 }
2203 /* prepare second surrogate */
2204 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
2205 }
2206#endif
2207 base64bits += 16;
2208 base64buffer = (base64buffer << 16) | ch;
2209 while (base64bits >= 6) {
2210 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2211 base64bits -= 6;
2212 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00002213 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002214 if (base64bits)
2215 *out++= TO_BASE64(base64buffer << (6-base64bits) );
2216 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002217 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002218 if (_PyBytes_Resize(&v, out - start) < 0)
2219 return NULL;
2220 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002221}
2222
Antoine Pitrou244651a2009-05-04 18:56:13 +00002223#undef IS_BASE64
2224#undef FROM_BASE64
2225#undef TO_BASE64
2226#undef DECODE_DIRECT
2227#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002228
/* --- UTF-8 Codec -------------------------------------------------------- */
2230
/* Map a UTF-8 lead (prefix) byte to the total length, in bytes, of the
   sequence it introduces.  Zero marks a byte that cannot start a sequence
   (continuation bytes 0x80..0xBF and 0xFE/0xFF).  The 5- and 6-byte
   entries follow the original RFC 2279 layout; the decoder in this file
   rejects those lengths with "unsupported Unicode code range".

   The table is only ever read, so it is declared const. */
static const char utf8_code_length[256] = {
    /* 0x00..0x7F: single-byte (ASCII) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80..0xBF: continuation bytes, illegal as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0..0xDF: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0..0xEF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0..0xF7: four; 0xF8..0xFB: five; 0xFC..0xFD: six; 0xFE..0xFF: illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
2252
Guido van Rossumd57fd912000-03-10 22:53:23 +00002253PyObject *PyUnicode_DecodeUTF8(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002254 Py_ssize_t size,
2255 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002256{
Walter Dörwald69652032004-09-07 20:24:22 +00002257 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
2258}
2259
Antoine Pitrouab868312009-01-10 15:40:25 +00002260/* Mask to check or force alignment of a pointer to C 'long' boundaries */
2261#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
2262
2263/* Mask to quickly check whether a C 'long' contains a
2264 non-ASCII, UTF8-encoded char. */
2265#if (SIZEOF_LONG == 8)
2266# define ASCII_CHAR_MASK 0x8080808080808080L
2267#elif (SIZEOF_LONG == 4)
2268# define ASCII_CHAR_MASK 0x80808080L
2269#else
2270# error C 'long' size should be either 4 or 8!
2271#endif
2272
Walter Dörwald69652032004-09-07 20:24:22 +00002273PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002274 Py_ssize_t size,
2275 const char *errors,
2276 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002277{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002278 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002279 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002280 Py_ssize_t startinpos;
2281 Py_ssize_t endinpos;
2282 Py_ssize_t outpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00002283 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002284 PyUnicodeObject *unicode;
2285 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002286 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002287 PyObject *errorHandler = NULL;
2288 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002289
2290 /* Note: size will always be longer than the resulting Unicode
2291 character count */
2292 unicode = _PyUnicode_New(size);
2293 if (!unicode)
2294 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00002295 if (size == 0) {
2296 if (consumed)
2297 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002298 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00002299 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002300
2301 /* Unpack UTF-8 encoded data */
2302 p = unicode->str;
2303 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00002304 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002305
2306 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002307 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002308
2309 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00002310 /* Fast path for runs of ASCII characters. Given that common UTF-8
2311 input will consist of an overwhelming majority of ASCII
2312 characters, we try to optimize for this case by checking
2313 as many characters as a C 'long' can contain.
2314 First, check if we can do an aligned read, as most CPUs have
2315 a penalty for unaligned reads.
2316 */
2317 if (!((size_t) s & LONG_PTR_MASK)) {
2318 /* Help register allocation */
2319 register const char *_s = s;
2320 register Py_UNICODE *_p = p;
2321 while (_s < aligned_end) {
2322 /* Read a whole long at a time (either 4 or 8 bytes),
2323 and do a fast unrolled copy if it only contains ASCII
2324 characters. */
2325 unsigned long data = *(unsigned long *) _s;
2326 if (data & ASCII_CHAR_MASK)
2327 break;
2328 _p[0] = (unsigned char) _s[0];
2329 _p[1] = (unsigned char) _s[1];
2330 _p[2] = (unsigned char) _s[2];
2331 _p[3] = (unsigned char) _s[3];
2332#if (SIZEOF_LONG == 8)
2333 _p[4] = (unsigned char) _s[4];
2334 _p[5] = (unsigned char) _s[5];
2335 _p[6] = (unsigned char) _s[6];
2336 _p[7] = (unsigned char) _s[7];
2337#endif
2338 _s += SIZEOF_LONG;
2339 _p += SIZEOF_LONG;
2340 }
2341 s = _s;
2342 p = _p;
2343 if (s == e)
2344 break;
2345 ch = (unsigned char)*s;
2346 }
2347 }
2348
2349 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002350 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002351 s++;
2352 continue;
2353 }
2354
2355 n = utf8_code_length[ch];
2356
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002357 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002358 if (consumed)
2359 break;
2360 else {
2361 errmsg = "unexpected end of data";
2362 startinpos = s-starts;
2363 endinpos = size;
2364 goto utf8Error;
2365 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002366 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002367
2368 switch (n) {
2369
2370 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002371 errmsg = "unexpected code byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002372 startinpos = s-starts;
2373 endinpos = startinpos+1;
2374 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002375
2376 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002377 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00002378 startinpos = s-starts;
2379 endinpos = startinpos+1;
2380 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002381
2382 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002383 if ((s[1] & 0xc0) != 0x80) {
2384 errmsg = "invalid data";
Benjamin Peterson29060642009-01-31 22:14:21 +00002385 startinpos = s-starts;
2386 endinpos = startinpos+2;
2387 goto utf8Error;
2388 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002389 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002390 if (ch < 0x80) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002391 startinpos = s-starts;
2392 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002393 errmsg = "illegal encoding";
Benjamin Peterson29060642009-01-31 22:14:21 +00002394 goto utf8Error;
2395 }
2396 else
2397 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002398 break;
2399
2400 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00002401 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002402 (s[2] & 0xc0) != 0x80) {
2403 errmsg = "invalid data";
Benjamin Peterson29060642009-01-31 22:14:21 +00002404 startinpos = s-starts;
2405 endinpos = startinpos+3;
2406 goto utf8Error;
2407 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002408 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002409 if (ch < 0x0800 || (ch >= 0xd800 && ch <= 0xDFFF)) {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002410 errmsg = "illegal encoding";
Benjamin Peterson29060642009-01-31 22:14:21 +00002411 startinpos = s-starts;
2412 endinpos = startinpos+3;
2413 goto utf8Error;
2414 }
2415 else
2416 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002417 break;
2418
2419 case 4:
2420 if ((s[1] & 0xc0) != 0x80 ||
2421 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002422 (s[3] & 0xc0) != 0x80) {
2423 errmsg = "invalid data";
Benjamin Peterson29060642009-01-31 22:14:21 +00002424 startinpos = s-starts;
2425 endinpos = startinpos+4;
2426 goto utf8Error;
2427 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002428 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Benjamin Peterson29060642009-01-31 22:14:21 +00002429 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002430 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002431 if ((ch < 0x10000) /* minimum value allowed for 4
Benjamin Peterson29060642009-01-31 22:14:21 +00002432 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002433 || (ch > 0x10ffff)) /* maximum value allowed for
Benjamin Peterson29060642009-01-31 22:14:21 +00002434 UTF-16 */
2435 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002436 errmsg = "illegal encoding";
Benjamin Peterson29060642009-01-31 22:14:21 +00002437 startinpos = s-starts;
2438 endinpos = startinpos+4;
2439 goto utf8Error;
2440 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00002441#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002442 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002443#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002444 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002445
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002446 /* translate from 10000..10FFFF to 0..FFFF */
2447 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002448
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002449 /* high surrogate = top 10 bits added to D800 */
2450 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002451
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002452 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002453 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002454#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002455 break;
2456
2457 default:
2458 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002459 errmsg = "unsupported Unicode code range";
Benjamin Peterson29060642009-01-31 22:14:21 +00002460 startinpos = s-starts;
2461 endinpos = startinpos+n;
2462 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002463 }
2464 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00002465 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002466
Benjamin Peterson29060642009-01-31 22:14:21 +00002467 utf8Error:
2468 outpos = p-PyUnicode_AS_UNICODE(unicode);
2469 if (unicode_decode_call_errorhandler(
2470 errors, &errorHandler,
2471 "utf8", errmsg,
2472 &starts, &e, &startinpos, &endinpos, &exc, &s,
2473 &unicode, &outpos, &p))
2474 goto onError;
2475 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002476 }
Walter Dörwald69652032004-09-07 20:24:22 +00002477 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002478 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002479
2480 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002481 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002482 goto onError;
2483
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002484 Py_XDECREF(errorHandler);
2485 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002486 return (PyObject *)unicode;
2487
Benjamin Peterson29060642009-01-31 22:14:21 +00002488 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002489 Py_XDECREF(errorHandler);
2490 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002491 Py_DECREF(unicode);
2492 return NULL;
2493}
2494
Antoine Pitrouab868312009-01-10 15:40:25 +00002495#undef ASCII_CHAR_MASK
2496
2497
Tim Peters602f7402002-04-27 18:03:26 +00002498/* Allocation strategy: if the string is short, convert into a stack buffer
2499 and allocate exactly as much space needed at the end. Else allocate the
2500 maximum possible needed (4 result bytes per Unicode character), and return
2501 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002502*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002503PyObject *
2504PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002505 Py_ssize_t size,
2506 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002507{
Tim Peters602f7402002-04-27 18:03:26 +00002508#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002509
Guido van Rossum98297ee2007-11-06 21:34:58 +00002510 Py_ssize_t i; /* index into s of next input byte */
2511 PyObject *result; /* result string object */
2512 char *p; /* next free byte in output buffer */
2513 Py_ssize_t nallocated; /* number of result bytes allocated */
2514 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002515 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002516 PyObject *errorHandler = NULL;
2517 PyObject *exc = NULL;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002518
Tim Peters602f7402002-04-27 18:03:26 +00002519 assert(s != NULL);
2520 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002521
Tim Peters602f7402002-04-27 18:03:26 +00002522 if (size <= MAX_SHORT_UNICHARS) {
2523 /* Write into the stack buffer; nallocated can't overflow.
2524 * At the end, we'll allocate exactly as much heap space as it
2525 * turns out we need.
2526 */
2527 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002528 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00002529 p = stackbuf;
2530 }
2531 else {
2532 /* Overallocate on the heap, and give the excess back at the end. */
2533 nallocated = size * 4;
2534 if (nallocated / 4 != size) /* overflow! */
2535 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00002536 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002537 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00002538 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00002539 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002540 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002541
Tim Peters602f7402002-04-27 18:03:26 +00002542 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002543 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002544
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002545 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002546 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002547 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002548
Guido van Rossumd57fd912000-03-10 22:53:23 +00002549 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002550 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002551 *p++ = (char)(0xc0 | (ch >> 6));
2552 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002553 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002554#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00002555 /* Special case: check for high and low surrogate */
2556 if (ch <= 0xDBFF && i != size && 0xDC00 <= s[i] && s[i] <= 0xDFFF) {
2557 Py_UCS4 ch2 = s[i];
2558 /* Combine the two surrogates to form a UCS4 value */
2559 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2560 i++;
2561
2562 /* Encode UCS4 Unicode ordinals */
2563 *p++ = (char)(0xf0 | (ch >> 18));
2564 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
Tim Peters602f7402002-04-27 18:03:26 +00002565 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2566 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002567 } else {
Victor Stinner445a6232010-04-22 20:01:57 +00002568#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00002569 Py_ssize_t newpos;
2570 PyObject *rep;
2571 Py_ssize_t repsize, k;
2572 rep = unicode_encode_call_errorhandler
2573 (errors, &errorHandler, "utf-8", "surrogates not allowed",
2574 s, size, &exc, i-1, i, &newpos);
2575 if (!rep)
2576 goto error;
2577
2578 if (PyBytes_Check(rep))
2579 repsize = PyBytes_GET_SIZE(rep);
2580 else
2581 repsize = PyUnicode_GET_SIZE(rep);
2582
2583 if (repsize > 4) {
2584 Py_ssize_t offset;
2585
2586 if (result == NULL)
2587 offset = p - stackbuf;
2588 else
2589 offset = p - PyBytes_AS_STRING(result);
2590
2591 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
2592 /* integer overflow */
2593 PyErr_NoMemory();
2594 goto error;
2595 }
2596 nallocated += repsize - 4;
2597 if (result != NULL) {
2598 if (_PyBytes_Resize(&result, nallocated) < 0)
2599 goto error;
2600 } else {
2601 result = PyBytes_FromStringAndSize(NULL, nallocated);
2602 if (result == NULL)
2603 goto error;
2604 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
2605 }
2606 p = PyBytes_AS_STRING(result) + offset;
2607 }
2608
2609 if (PyBytes_Check(rep)) {
2610 char *prep = PyBytes_AS_STRING(rep);
2611 for(k = repsize; k > 0; k--)
2612 *p++ = *prep++;
2613 } else /* rep is unicode */ {
2614 Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
2615 Py_UNICODE c;
2616
2617 for(k=0; k<repsize; k++) {
2618 c = prep[k];
2619 if (0x80 <= c) {
2620 raise_encode_exception(&exc, "utf-8", s, size,
2621 i-1, i, "surrogates not allowed");
2622 goto error;
2623 }
2624 *p++ = (char)prep[k];
2625 }
2626 }
2627 Py_DECREF(rep);
Victor Stinner445a6232010-04-22 20:01:57 +00002628#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00002629 }
Victor Stinner445a6232010-04-22 20:01:57 +00002630#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00002631 } else if (ch < 0x10000) {
2632 *p++ = (char)(0xe0 | (ch >> 12));
2633 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2634 *p++ = (char)(0x80 | (ch & 0x3f));
2635 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00002636 /* Encode UCS4 Unicode ordinals */
2637 *p++ = (char)(0xf0 | (ch >> 18));
2638 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2639 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2640 *p++ = (char)(0x80 | (ch & 0x3f));
2641 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002642 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002643
Guido van Rossum98297ee2007-11-06 21:34:58 +00002644 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00002645 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002646 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002647 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002648 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002649 }
2650 else {
Christian Heimesf3863112007-11-22 07:46:41 +00002651 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00002652 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002653 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002654 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002655 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002656 Py_XDECREF(errorHandler);
2657 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002658 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002659 error:
2660 Py_XDECREF(errorHandler);
2661 Py_XDECREF(exc);
2662 Py_XDECREF(result);
2663 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002664
Tim Peters602f7402002-04-27 18:03:26 +00002665#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002666}
2667
Guido van Rossumd57fd912000-03-10 22:53:23 +00002668PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2669{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002670 if (!PyUnicode_Check(unicode)) {
2671 PyErr_BadArgument();
2672 return NULL;
2673 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002674 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00002675 PyUnicode_GET_SIZE(unicode),
2676 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002677}
2678
/* --- UTF-32 Codec ------------------------------------------------------- */
2680
2681PyObject *
2682PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002683 Py_ssize_t size,
2684 const char *errors,
2685 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002686{
2687 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2688}
2689
2690PyObject *
2691PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002692 Py_ssize_t size,
2693 const char *errors,
2694 int *byteorder,
2695 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002696{
2697 const char *starts = s;
2698 Py_ssize_t startinpos;
2699 Py_ssize_t endinpos;
2700 Py_ssize_t outpos;
2701 PyUnicodeObject *unicode;
2702 Py_UNICODE *p;
2703#ifndef Py_UNICODE_WIDE
2704 int i, pairs;
2705#else
2706 const int pairs = 0;
2707#endif
2708 const unsigned char *q, *e;
2709 int bo = 0; /* assume native ordering by default */
2710 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00002711 /* Offsets from q for retrieving bytes in the right order. */
2712#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2713 int iorder[] = {0, 1, 2, 3};
2714#else
2715 int iorder[] = {3, 2, 1, 0};
2716#endif
2717 PyObject *errorHandler = NULL;
2718 PyObject *exc = NULL;
Guido van Rossum8d991ed2007-08-17 15:41:00 +00002719 /* On narrow builds we split characters outside the BMP into two
2720 codepoints => count how much extra space we need. */
2721#ifndef Py_UNICODE_WIDE
2722 for (i = pairs = 0; i < size/4; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00002723 if (((Py_UCS4 *)s)[i] >= 0x10000)
2724 pairs++;
Guido van Rossum8d991ed2007-08-17 15:41:00 +00002725#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00002726
2727 /* This might be one to much, because of a BOM */
2728 unicode = _PyUnicode_New((size+3)/4+pairs);
2729 if (!unicode)
2730 return NULL;
2731 if (size == 0)
2732 return (PyObject *)unicode;
2733
2734 /* Unpack UTF-32 encoded data */
2735 p = unicode->str;
2736 q = (unsigned char *)s;
2737 e = q + size;
2738
2739 if (byteorder)
2740 bo = *byteorder;
2741
2742 /* Check for BOM marks (U+FEFF) in the input and adjust current
2743 byte order setting accordingly. In native mode, the leading BOM
2744 mark is skipped, in all other modes, it is copied to the output
2745 stream as-is (giving a ZWNBSP character). */
2746 if (bo == 0) {
2747 if (size >= 4) {
2748 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00002749 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00002750#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00002751 if (bom == 0x0000FEFF) {
2752 q += 4;
2753 bo = -1;
2754 }
2755 else if (bom == 0xFFFE0000) {
2756 q += 4;
2757 bo = 1;
2758 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002759#else
Benjamin Peterson29060642009-01-31 22:14:21 +00002760 if (bom == 0x0000FEFF) {
2761 q += 4;
2762 bo = 1;
2763 }
2764 else if (bom == 0xFFFE0000) {
2765 q += 4;
2766 bo = -1;
2767 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002768#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002769 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002770 }
2771
2772 if (bo == -1) {
2773 /* force LE */
2774 iorder[0] = 0;
2775 iorder[1] = 1;
2776 iorder[2] = 2;
2777 iorder[3] = 3;
2778 }
2779 else if (bo == 1) {
2780 /* force BE */
2781 iorder[0] = 3;
2782 iorder[1] = 2;
2783 iorder[2] = 1;
2784 iorder[3] = 0;
2785 }
2786
2787 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002788 Py_UCS4 ch;
2789 /* remaining bytes at the end? (size should be divisible by 4) */
2790 if (e-q<4) {
2791 if (consumed)
2792 break;
2793 errmsg = "truncated data";
2794 startinpos = ((const char *)q)-starts;
2795 endinpos = ((const char *)e)-starts;
2796 goto utf32Error;
2797 /* The remaining input chars are ignored if the callback
2798 chooses to skip the input */
2799 }
2800 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2801 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00002802
Benjamin Peterson29060642009-01-31 22:14:21 +00002803 if (ch >= 0x110000)
2804 {
2805 errmsg = "codepoint not in range(0x110000)";
2806 startinpos = ((const char *)q)-starts;
2807 endinpos = startinpos+4;
2808 goto utf32Error;
2809 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002810#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002811 if (ch >= 0x10000)
2812 {
2813 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2814 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2815 }
2816 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00002817#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002818 *p++ = ch;
2819 q += 4;
2820 continue;
2821 utf32Error:
2822 outpos = p-PyUnicode_AS_UNICODE(unicode);
2823 if (unicode_decode_call_errorhandler(
2824 errors, &errorHandler,
2825 "utf32", errmsg,
2826 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
2827 &unicode, &outpos, &p))
2828 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002829 }
2830
2831 if (byteorder)
2832 *byteorder = bo;
2833
2834 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002835 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002836
2837 /* Adjust length */
2838 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2839 goto onError;
2840
2841 Py_XDECREF(errorHandler);
2842 Py_XDECREF(exc);
2843 return (PyObject *)unicode;
2844
Benjamin Peterson29060642009-01-31 22:14:21 +00002845 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00002846 Py_DECREF(unicode);
2847 Py_XDECREF(errorHandler);
2848 Py_XDECREF(exc);
2849 return NULL;
2850}
2851
2852PyObject *
2853PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002854 Py_ssize_t size,
2855 const char *errors,
2856 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002857{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002858 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002859 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002860 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002861#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002862 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002863#else
2864 const int pairs = 0;
2865#endif
2866 /* Offsets from p for storing byte pairs in the right order. */
2867#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2868 int iorder[] = {0, 1, 2, 3};
2869#else
2870 int iorder[] = {3, 2, 1, 0};
2871#endif
2872
Benjamin Peterson29060642009-01-31 22:14:21 +00002873#define STORECHAR(CH) \
2874 do { \
2875 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2876 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2877 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2878 p[iorder[0]] = (CH) & 0xff; \
2879 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00002880 } while(0)
2881
2882 /* In narrow builds we can output surrogate pairs as one codepoint,
2883 so we need less space. */
2884#ifndef Py_UNICODE_WIDE
2885 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00002886 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2887 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2888 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002889#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002890 nsize = (size - pairs + (byteorder == 0));
2891 bytesize = nsize * 4;
2892 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00002893 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002894 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002895 if (v == NULL)
2896 return NULL;
2897
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002898 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002899 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002900 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002901 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00002902 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002903
2904 if (byteorder == -1) {
2905 /* force LE */
2906 iorder[0] = 0;
2907 iorder[1] = 1;
2908 iorder[2] = 2;
2909 iorder[3] = 3;
2910 }
2911 else if (byteorder == 1) {
2912 /* force BE */
2913 iorder[0] = 3;
2914 iorder[1] = 2;
2915 iorder[2] = 1;
2916 iorder[3] = 0;
2917 }
2918
2919 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002920 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002921#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002922 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2923 Py_UCS4 ch2 = *s;
2924 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2925 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2926 s++;
2927 size--;
2928 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002929 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002930#endif
2931 STORECHAR(ch);
2932 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002933
2934 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002935 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002936#undef STORECHAR
2937}
2938
2939PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2940{
2941 if (!PyUnicode_Check(unicode)) {
2942 PyErr_BadArgument();
2943 return NULL;
2944 }
2945 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00002946 PyUnicode_GET_SIZE(unicode),
2947 NULL,
2948 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002949}
2950
/* --- UTF-16 Codec ------------------------------------------------------- */
2952
Tim Peters772747b2001-08-09 22:21:55 +00002953PyObject *
2954PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002955 Py_ssize_t size,
2956 const char *errors,
2957 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002958{
Walter Dörwald69652032004-09-07 20:24:22 +00002959 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2960}
2961
Antoine Pitrouab868312009-01-10 15:40:25 +00002962/* Two masks for fast checking of whether a C 'long' may contain
2963 UTF16-encoded surrogate characters. This is an efficient heuristic,
2964 assuming that non-surrogate characters with a code point >= 0x8000 are
2965 rare in most input.
2966 FAST_CHAR_MASK is used when the input is in native byte ordering,
2967 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00002968*/
Antoine Pitrouab868312009-01-10 15:40:25 +00002969#if (SIZEOF_LONG == 8)
2970# define FAST_CHAR_MASK 0x8000800080008000L
2971# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
2972#elif (SIZEOF_LONG == 4)
2973# define FAST_CHAR_MASK 0x80008000L
2974# define SWAPPED_FAST_CHAR_MASK 0x00800080L
2975#else
2976# error C 'long' size should be either 4 or 8!
2977#endif
2978
Walter Dörwald69652032004-09-07 20:24:22 +00002979PyObject *
2980PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002981 Py_ssize_t size,
2982 const char *errors,
2983 int *byteorder,
2984 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002985{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002986 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002987 Py_ssize_t startinpos;
2988 Py_ssize_t endinpos;
2989 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002990 PyUnicodeObject *unicode;
2991 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00002992 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00002993 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00002994 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002995 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002996 /* Offsets from q for retrieving byte pairs in the right order. */
2997#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2998 int ihi = 1, ilo = 0;
2999#else
3000 int ihi = 0, ilo = 1;
3001#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003002 PyObject *errorHandler = NULL;
3003 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003004
3005 /* Note: size will always be longer than the resulting Unicode
3006 character count */
3007 unicode = _PyUnicode_New(size);
3008 if (!unicode)
3009 return NULL;
3010 if (size == 0)
3011 return (PyObject *)unicode;
3012
3013 /* Unpack UTF-16 encoded data */
3014 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00003015 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00003016 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003017
3018 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00003019 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003020
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003021 /* Check for BOM marks (U+FEFF) in the input and adjust current
3022 byte order setting accordingly. In native mode, the leading BOM
3023 mark is skipped, in all other modes, it is copied to the output
3024 stream as-is (giving a ZWNBSP character). */
3025 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00003026 if (size >= 2) {
3027 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003028#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00003029 if (bom == 0xFEFF) {
3030 q += 2;
3031 bo = -1;
3032 }
3033 else if (bom == 0xFFFE) {
3034 q += 2;
3035 bo = 1;
3036 }
Tim Petersced69f82003-09-16 20:30:58 +00003037#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003038 if (bom == 0xFEFF) {
3039 q += 2;
3040 bo = 1;
3041 }
3042 else if (bom == 0xFFFE) {
3043 q += 2;
3044 bo = -1;
3045 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003046#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003047 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003048 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003049
Tim Peters772747b2001-08-09 22:21:55 +00003050 if (bo == -1) {
3051 /* force LE */
3052 ihi = 1;
3053 ilo = 0;
3054 }
3055 else if (bo == 1) {
3056 /* force BE */
3057 ihi = 0;
3058 ilo = 1;
3059 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003060#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3061 native_ordering = ilo < ihi;
3062#else
3063 native_ordering = ilo > ihi;
3064#endif
Tim Peters772747b2001-08-09 22:21:55 +00003065
Antoine Pitrouab868312009-01-10 15:40:25 +00003066 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00003067 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003068 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00003069 /* First check for possible aligned read of a C 'long'. Unaligned
3070 reads are more expensive, better to defer to another iteration. */
3071 if (!((size_t) q & LONG_PTR_MASK)) {
3072 /* Fast path for runs of non-surrogate chars. */
3073 register const unsigned char *_q = q;
3074 Py_UNICODE *_p = p;
3075 if (native_ordering) {
3076 /* Native ordering is simple: as long as the input cannot
3077 possibly contain a surrogate char, do an unrolled copy
3078 of several 16-bit code points to the target object.
3079 The non-surrogate check is done on several input bytes
3080 at a time (as many as a C 'long' can contain). */
3081 while (_q < aligned_end) {
3082 unsigned long data = * (unsigned long *) _q;
3083 if (data & FAST_CHAR_MASK)
3084 break;
3085 _p[0] = ((unsigned short *) _q)[0];
3086 _p[1] = ((unsigned short *) _q)[1];
3087#if (SIZEOF_LONG == 8)
3088 _p[2] = ((unsigned short *) _q)[2];
3089 _p[3] = ((unsigned short *) _q)[3];
3090#endif
3091 _q += SIZEOF_LONG;
3092 _p += SIZEOF_LONG / 2;
3093 }
3094 }
3095 else {
3096 /* Byteswapped ordering is similar, but we must decompose
3097 the copy bytewise, and take care of zero'ing out the
3098 upper bytes if the target object is in 32-bit units
3099 (that is, in UCS-4 builds). */
3100 while (_q < aligned_end) {
3101 unsigned long data = * (unsigned long *) _q;
3102 if (data & SWAPPED_FAST_CHAR_MASK)
3103 break;
3104 /* Zero upper bytes in UCS-4 builds */
3105#if (Py_UNICODE_SIZE > 2)
3106 _p[0] = 0;
3107 _p[1] = 0;
3108#if (SIZEOF_LONG == 8)
3109 _p[2] = 0;
3110 _p[3] = 0;
3111#endif
3112#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003113 /* Issue #4916; UCS-4 builds on big endian machines must
3114 fill the two last bytes of each 4-byte unit. */
3115#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
3116# define OFF 2
3117#else
3118# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00003119#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003120 ((unsigned char *) _p)[OFF + 1] = _q[0];
3121 ((unsigned char *) _p)[OFF + 0] = _q[1];
3122 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
3123 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
3124#if (SIZEOF_LONG == 8)
3125 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
3126 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
3127 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
3128 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
3129#endif
3130#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00003131 _q += SIZEOF_LONG;
3132 _p += SIZEOF_LONG / 2;
3133 }
3134 }
3135 p = _p;
3136 q = _q;
3137 if (q >= e)
3138 break;
3139 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003140 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003141
Benjamin Peterson14339b62009-01-31 16:36:08 +00003142 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00003143
3144 if (ch < 0xD800 || ch > 0xDFFF) {
3145 *p++ = ch;
3146 continue;
3147 }
3148
3149 /* UTF-16 code pair: */
3150 if (q > e) {
3151 errmsg = "unexpected end of data";
3152 startinpos = (((const char *)q) - 2) - starts;
3153 endinpos = ((const char *)e) + 1 - starts;
3154 goto utf16Error;
3155 }
3156 if (0xD800 <= ch && ch <= 0xDBFF) {
3157 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
3158 q += 2;
3159 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00003160#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003161 *p++ = ch;
3162 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003163#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003164 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003165#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003166 continue;
3167 }
3168 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003169 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00003170 startinpos = (((const char *)q)-4)-starts;
3171 endinpos = startinpos+2;
3172 goto utf16Error;
3173 }
3174
Benjamin Peterson14339b62009-01-31 16:36:08 +00003175 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003176 errmsg = "illegal encoding";
3177 startinpos = (((const char *)q)-2)-starts;
3178 endinpos = startinpos+2;
3179 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003180
Benjamin Peterson29060642009-01-31 22:14:21 +00003181 utf16Error:
3182 outpos = p - PyUnicode_AS_UNICODE(unicode);
3183 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00003184 errors,
3185 &errorHandler,
3186 "utf16", errmsg,
3187 &starts,
3188 (const char **)&e,
3189 &startinpos,
3190 &endinpos,
3191 &exc,
3192 (const char **)&q,
3193 &unicode,
3194 &outpos,
3195 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00003196 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003197 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003198 /* remaining byte at the end? (size should be even) */
3199 if (e == q) {
3200 if (!consumed) {
3201 errmsg = "truncated data";
3202 startinpos = ((const char *)q) - starts;
3203 endinpos = ((const char *)e) + 1 - starts;
3204 outpos = p - PyUnicode_AS_UNICODE(unicode);
3205 if (unicode_decode_call_errorhandler(
3206 errors,
3207 &errorHandler,
3208 "utf16", errmsg,
3209 &starts,
3210 (const char **)&e,
3211 &startinpos,
3212 &endinpos,
3213 &exc,
3214 (const char **)&q,
3215 &unicode,
3216 &outpos,
3217 &p))
3218 goto onError;
3219 /* The remaining input chars are ignored if the callback
3220 chooses to skip the input */
3221 }
3222 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003223
3224 if (byteorder)
3225 *byteorder = bo;
3226
Walter Dörwald69652032004-09-07 20:24:22 +00003227 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003228 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00003229
Guido van Rossumd57fd912000-03-10 22:53:23 +00003230 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003231 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003232 goto onError;
3233
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003234 Py_XDECREF(errorHandler);
3235 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003236 return (PyObject *)unicode;
3237
Benjamin Peterson29060642009-01-31 22:14:21 +00003238 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003239 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003240 Py_XDECREF(errorHandler);
3241 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003242 return NULL;
3243}
3244
Antoine Pitrouab868312009-01-10 15:40:25 +00003245#undef FAST_CHAR_MASK
3246#undef SWAPPED_FAST_CHAR_MASK
3247
Tim Peters772747b2001-08-09 22:21:55 +00003248PyObject *
3249PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003250 Py_ssize_t size,
3251 const char *errors,
3252 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003253{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003254 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00003255 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003256 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003257#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003258 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003259#else
3260 const int pairs = 0;
3261#endif
Tim Peters772747b2001-08-09 22:21:55 +00003262 /* Offsets from p for storing byte pairs in the right order. */
3263#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3264 int ihi = 1, ilo = 0;
3265#else
3266 int ihi = 0, ilo = 1;
3267#endif
3268
Benjamin Peterson29060642009-01-31 22:14:21 +00003269#define STORECHAR(CH) \
3270 do { \
3271 p[ihi] = ((CH) >> 8) & 0xff; \
3272 p[ilo] = (CH) & 0xff; \
3273 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00003274 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003275
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003276#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003277 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003278 if (s[i] >= 0x10000)
3279 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003280#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003281 /* 2 * (size + pairs + (byteorder == 0)) */
3282 if (size > PY_SSIZE_T_MAX ||
3283 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00003284 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003285 nsize = size + pairs + (byteorder == 0);
3286 bytesize = nsize * 2;
3287 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003288 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003289 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003290 if (v == NULL)
3291 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003292
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003293 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003294 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003295 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003296 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003297 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00003298
3299 if (byteorder == -1) {
3300 /* force LE */
3301 ihi = 1;
3302 ilo = 0;
3303 }
3304 else if (byteorder == 1) {
3305 /* force BE */
3306 ihi = 0;
3307 ilo = 1;
3308 }
3309
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003310 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003311 Py_UNICODE ch = *s++;
3312 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003313#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003314 if (ch >= 0x10000) {
3315 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
3316 ch = 0xD800 | ((ch-0x10000) >> 10);
3317 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003318#endif
Tim Peters772747b2001-08-09 22:21:55 +00003319 STORECHAR(ch);
3320 if (ch2)
3321 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003322 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003323
3324 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003325 return v;
Tim Peters772747b2001-08-09 22:21:55 +00003326#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00003327}
3328
3329PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
3330{
3331 if (!PyUnicode_Check(unicode)) {
3332 PyErr_BadArgument();
3333 return NULL;
3334 }
3335 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003336 PyUnicode_GET_SIZE(unicode),
3337 NULL,
3338 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003339}
3340
3341/* --- Unicode Escape Codec ----------------------------------------------- */
3342
Fredrik Lundh06d12682001-01-24 07:59:11 +00003343static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00003344
Guido van Rossumd57fd912000-03-10 22:53:23 +00003345PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003346 Py_ssize_t size,
3347 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003348{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003349 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003350 Py_ssize_t startinpos;
3351 Py_ssize_t endinpos;
3352 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003353 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003354 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003355 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003356 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003357 char* message;
3358 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003359 PyObject *errorHandler = NULL;
3360 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003361
Guido van Rossumd57fd912000-03-10 22:53:23 +00003362 /* Escaped strings will always be longer than the resulting
3363 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003364 length after conversion to the true value.
3365 (but if the error callback returns a long replacement string
3366 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003367 v = _PyUnicode_New(size);
3368 if (v == NULL)
3369 goto onError;
3370 if (size == 0)
3371 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003372
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003373 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003374 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003375
Guido van Rossumd57fd912000-03-10 22:53:23 +00003376 while (s < end) {
3377 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003378 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003379 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003380
3381 /* Non-escape characters are interpreted as Unicode ordinals */
3382 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003383 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003384 continue;
3385 }
3386
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003387 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003388 /* \ - Escapes */
3389 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003390 c = *s++;
3391 if (s > end)
3392 c = '\0'; /* Invalid after \ */
3393 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003394
Benjamin Peterson29060642009-01-31 22:14:21 +00003395 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003396 case '\n': break;
3397 case '\\': *p++ = '\\'; break;
3398 case '\'': *p++ = '\''; break;
3399 case '\"': *p++ = '\"'; break;
3400 case 'b': *p++ = '\b'; break;
3401 case 'f': *p++ = '\014'; break; /* FF */
3402 case 't': *p++ = '\t'; break;
3403 case 'n': *p++ = '\n'; break;
3404 case 'r': *p++ = '\r'; break;
3405 case 'v': *p++ = '\013'; break; /* VT */
3406 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
3407
Benjamin Peterson29060642009-01-31 22:14:21 +00003408 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003409 case '0': case '1': case '2': case '3':
3410 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003411 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003412 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003413 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003414 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003415 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00003416 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003417 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003418 break;
3419
Benjamin Peterson29060642009-01-31 22:14:21 +00003420 /* hex escapes */
3421 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003422 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003423 digits = 2;
3424 message = "truncated \\xXX escape";
3425 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003426
Benjamin Peterson29060642009-01-31 22:14:21 +00003427 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003428 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003429 digits = 4;
3430 message = "truncated \\uXXXX escape";
3431 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003432
Benjamin Peterson29060642009-01-31 22:14:21 +00003433 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00003434 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003435 digits = 8;
3436 message = "truncated \\UXXXXXXXX escape";
3437 hexescape:
3438 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003439 outpos = p-PyUnicode_AS_UNICODE(v);
3440 if (s+digits>end) {
3441 endinpos = size;
3442 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003443 errors, &errorHandler,
3444 "unicodeescape", "end of string in escape sequence",
3445 &starts, &end, &startinpos, &endinpos, &exc, &s,
3446 &v, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003447 goto onError;
3448 goto nextByte;
3449 }
3450 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003451 c = (unsigned char) s[i];
Guido van Rossumdaa251c2007-10-25 23:47:33 +00003452 if (!ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003453 endinpos = (s+i+1)-starts;
3454 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003455 errors, &errorHandler,
3456 "unicodeescape", message,
3457 &starts, &end, &startinpos, &endinpos, &exc, &s,
3458 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003459 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003460 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00003461 }
3462 chr = (chr<<4) & ~0xF;
3463 if (c >= '0' && c <= '9')
3464 chr += c - '0';
3465 else if (c >= 'a' && c <= 'f')
3466 chr += 10 + c - 'a';
3467 else
3468 chr += 10 + c - 'A';
3469 }
3470 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00003471 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003472 /* _decoding_error will have already written into the
3473 target buffer. */
3474 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003475 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00003476 /* when we get here, chr is a 32-bit unicode character */
3477 if (chr <= 0xffff)
3478 /* UCS-2 character */
3479 *p++ = (Py_UNICODE) chr;
3480 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003481 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00003482 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00003483#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003484 *p++ = chr;
3485#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00003486 chr -= 0x10000L;
3487 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00003488 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003489#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00003490 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003491 endinpos = s-starts;
3492 outpos = p-PyUnicode_AS_UNICODE(v);
3493 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003494 errors, &errorHandler,
3495 "unicodeescape", "illegal Unicode character",
3496 &starts, &end, &startinpos, &endinpos, &exc, &s,
3497 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003498 goto onError;
3499 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003500 break;
3501
Benjamin Peterson29060642009-01-31 22:14:21 +00003502 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00003503 case 'N':
3504 message = "malformed \\N character escape";
3505 if (ucnhash_CAPI == NULL) {
3506 /* load the unicode data module */
Benjamin Petersonb173f782009-05-05 22:31:58 +00003507 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00003508 if (ucnhash_CAPI == NULL)
3509 goto ucnhashError;
3510 }
3511 if (*s == '{') {
3512 const char *start = s+1;
3513 /* look for the closing brace */
3514 while (*s != '}' && s < end)
3515 s++;
3516 if (s > start && s < end && *s == '}') {
3517 /* found a name. look it up in the unicode database */
3518 message = "unknown Unicode character name";
3519 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00003520 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003521 goto store;
3522 }
3523 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003524 endinpos = s-starts;
3525 outpos = p-PyUnicode_AS_UNICODE(v);
3526 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003527 errors, &errorHandler,
3528 "unicodeescape", message,
3529 &starts, &end, &startinpos, &endinpos, &exc, &s,
3530 &v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003531 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003532 break;
3533
3534 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00003535 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003536 message = "\\ at end of string";
3537 s--;
3538 endinpos = s-starts;
3539 outpos = p-PyUnicode_AS_UNICODE(v);
3540 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003541 errors, &errorHandler,
3542 "unicodeescape", message,
3543 &starts, &end, &startinpos, &endinpos, &exc, &s,
3544 &v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00003545 goto onError;
3546 }
3547 else {
3548 *p++ = '\\';
3549 *p++ = (unsigned char)s[-1];
3550 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003551 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003552 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003553 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003554 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003555 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003556 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003557 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00003558 Py_XDECREF(errorHandler);
3559 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003560 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00003561
Benjamin Peterson29060642009-01-31 22:14:21 +00003562 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00003563 PyErr_SetString(
3564 PyExc_UnicodeError,
3565 "\\N escapes not supported (can't load unicodedata module)"
3566 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003567 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003568 Py_XDECREF(errorHandler);
3569 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00003570 return NULL;
3571
Benjamin Peterson29060642009-01-31 22:14:21 +00003572 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003573 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003574 Py_XDECREF(errorHandler);
3575 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003576 return NULL;
3577}
3578
/* Return a Unicode-Escape string version of the Unicode object.

   The encoder defined below, PyUnicode_EncodeUnicodeEscape(), has no
   `quotes` parameter; its output is never enclosed in u"" or u'' quotes.

*/
3585
Thomas Wouters477c8d52006-05-27 19:21:47 +00003586Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003587 Py_ssize_t size,
3588 Py_UNICODE ch)
Thomas Wouters477c8d52006-05-27 19:21:47 +00003589{
3590 /* like wcschr, but doesn't stop at NULL characters */
3591
3592 while (size-- > 0) {
3593 if (*s == ch)
3594 return s;
3595 s++;
3596 }
3597
3598 return NULL;
3599}
Barry Warsaw51ac5802000-03-20 16:36:48 +00003600
Walter Dörwald79e913e2007-05-12 11:08:06 +00003601static const char *hexdigits = "0123456789abcdef";
3602
3603PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003604 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003605{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003606 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003607 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003608
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003609#ifdef Py_UNICODE_WIDE
3610 const Py_ssize_t expandsize = 10;
3611#else
3612 const Py_ssize_t expandsize = 6;
3613#endif
3614
Thomas Wouters89f507f2006-12-13 04:49:30 +00003615 /* XXX(nnorwitz): rather than over-allocating, it would be
3616 better to choose a different scheme. Perhaps scan the
3617 first N-chars of the string and allocate based on that size.
3618 */
3619 /* Initial allocation is based on the longest-possible unichr
3620 escape.
3621
3622 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3623 unichr, so in this case it's the longest unichr escape. In
3624 narrow (UTF-16) builds this is five chars per source unichr
3625 since there are two unichrs in the surrogate pair, so in narrow
3626 (UTF-16) builds it's not the longest unichr escape.
3627
3628 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3629 so in the narrow (UTF-16) build case it's the longest unichr
3630 escape.
3631 */
3632
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003633 if (size == 0)
3634 return PyBytes_FromStringAndSize(NULL, 0);
3635
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003636 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003637 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003638
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003639 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00003640 2
3641 + expandsize*size
3642 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003643 if (repr == NULL)
3644 return NULL;
3645
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003646 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003647
Guido van Rossumd57fd912000-03-10 22:53:23 +00003648 while (size-- > 0) {
3649 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003650
Walter Dörwald79e913e2007-05-12 11:08:06 +00003651 /* Escape backslashes */
3652 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003653 *p++ = '\\';
3654 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00003655 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003656 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003657
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00003658#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003659 /* Map 21-bit characters to '\U00xxxxxx' */
3660 else if (ch >= 0x10000) {
3661 *p++ = '\\';
3662 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003663 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
3664 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
3665 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
3666 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
3667 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
3668 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
3669 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
3670 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00003671 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003672 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003673#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003674 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3675 else if (ch >= 0xD800 && ch < 0xDC00) {
3676 Py_UNICODE ch2;
3677 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00003678
Benjamin Peterson29060642009-01-31 22:14:21 +00003679 ch2 = *s++;
3680 size--;
3681 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3682 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3683 *p++ = '\\';
3684 *p++ = 'U';
3685 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
3686 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
3687 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
3688 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
3689 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
3690 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
3691 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
3692 *p++ = hexdigits[ucs & 0x0000000F];
3693 continue;
3694 }
3695 /* Fall through: isolated surrogates are copied as-is */
3696 s--;
3697 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003698 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003699#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003700
Guido van Rossumd57fd912000-03-10 22:53:23 +00003701 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003702 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003703 *p++ = '\\';
3704 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003705 *p++ = hexdigits[(ch >> 12) & 0x000F];
3706 *p++ = hexdigits[(ch >> 8) & 0x000F];
3707 *p++ = hexdigits[(ch >> 4) & 0x000F];
3708 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003709 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003710
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003711 /* Map special whitespace to '\t', \n', '\r' */
3712 else if (ch == '\t') {
3713 *p++ = '\\';
3714 *p++ = 't';
3715 }
3716 else if (ch == '\n') {
3717 *p++ = '\\';
3718 *p++ = 'n';
3719 }
3720 else if (ch == '\r') {
3721 *p++ = '\\';
3722 *p++ = 'r';
3723 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003724
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003725 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003726 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003727 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003728 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003729 *p++ = hexdigits[(ch >> 4) & 0x000F];
3730 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003731 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003732
Guido van Rossumd57fd912000-03-10 22:53:23 +00003733 /* Copy everything else as-is */
3734 else
3735 *p++ = (char) ch;
3736 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003737
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003738 assert(p - PyBytes_AS_STRING(repr) > 0);
3739 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
3740 return NULL;
3741 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003742}
3743
Alexandre Vassalotti2056bed2008-12-27 19:46:35 +00003744PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003745{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003746 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003747 if (!PyUnicode_Check(unicode)) {
3748 PyErr_BadArgument();
3749 return NULL;
3750 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00003751 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3752 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003753 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003754}
3755
3756/* --- Raw Unicode Escape Codec ------------------------------------------- */
3757
3758PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003759 Py_ssize_t size,
3760 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003761{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003762 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003763 Py_ssize_t startinpos;
3764 Py_ssize_t endinpos;
3765 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003766 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003767 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003768 const char *end;
3769 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003770 PyObject *errorHandler = NULL;
3771 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003772
Guido van Rossumd57fd912000-03-10 22:53:23 +00003773 /* Escaped strings will always be longer than the resulting
3774 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003775 length after conversion to the true value. (But decoding error
3776 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003777 v = _PyUnicode_New(size);
3778 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003779 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003780 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003781 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003782 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003783 end = s + size;
3784 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003785 unsigned char c;
3786 Py_UCS4 x;
3787 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003788 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003789
Benjamin Peterson29060642009-01-31 22:14:21 +00003790 /* Non-escape characters are interpreted as Unicode ordinals */
3791 if (*s != '\\') {
3792 *p++ = (unsigned char)*s++;
3793 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003794 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003795 startinpos = s-starts;
3796
3797 /* \u-escapes are only interpreted iff the number of leading
3798 backslashes if odd */
3799 bs = s;
3800 for (;s < end;) {
3801 if (*s != '\\')
3802 break;
3803 *p++ = (unsigned char)*s++;
3804 }
3805 if (((s - bs) & 1) == 0 ||
3806 s >= end ||
3807 (*s != 'u' && *s != 'U')) {
3808 continue;
3809 }
3810 p--;
3811 count = *s=='u' ? 4 : 8;
3812 s++;
3813
3814 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3815 outpos = p-PyUnicode_AS_UNICODE(v);
3816 for (x = 0, i = 0; i < count; ++i, ++s) {
3817 c = (unsigned char)*s;
3818 if (!ISXDIGIT(c)) {
3819 endinpos = s-starts;
3820 if (unicode_decode_call_errorhandler(
3821 errors, &errorHandler,
3822 "rawunicodeescape", "truncated \\uXXXX",
3823 &starts, &end, &startinpos, &endinpos, &exc, &s,
3824 &v, &outpos, &p))
3825 goto onError;
3826 goto nextByte;
3827 }
3828 x = (x<<4) & ~0xF;
3829 if (c >= '0' && c <= '9')
3830 x += c - '0';
3831 else if (c >= 'a' && c <= 'f')
3832 x += 10 + c - 'a';
3833 else
3834 x += 10 + c - 'A';
3835 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00003836 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00003837 /* UCS-2 character */
3838 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003839 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003840 /* UCS-4 character. Either store directly, or as
3841 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00003842#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003843 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003844#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003845 x -= 0x10000L;
3846 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3847 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00003848#endif
3849 } else {
3850 endinpos = s-starts;
3851 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003852 if (unicode_decode_call_errorhandler(
3853 errors, &errorHandler,
3854 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00003855 &starts, &end, &startinpos, &endinpos, &exc, &s,
3856 &v, &outpos, &p))
3857 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003858 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003859 nextByte:
3860 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003861 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003862 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003863 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003864 Py_XDECREF(errorHandler);
3865 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003866 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003867
Benjamin Peterson29060642009-01-31 22:14:21 +00003868 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003869 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003870 Py_XDECREF(errorHandler);
3871 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003872 return NULL;
3873}
3874
3875PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003876 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003877{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003878 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003879 char *p;
3880 char *q;
3881
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003882#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003883 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003884#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003885 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003886#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00003887
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003888 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003889 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00003890
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003891 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003892 if (repr == NULL)
3893 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003894 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003895 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003896
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003897 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003898 while (size-- > 0) {
3899 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003900#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003901 /* Map 32-bit characters to '\Uxxxxxxxx' */
3902 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003903 *p++ = '\\';
3904 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003905 *p++ = hexdigits[(ch >> 28) & 0xf];
3906 *p++ = hexdigits[(ch >> 24) & 0xf];
3907 *p++ = hexdigits[(ch >> 20) & 0xf];
3908 *p++ = hexdigits[(ch >> 16) & 0xf];
3909 *p++ = hexdigits[(ch >> 12) & 0xf];
3910 *p++ = hexdigits[(ch >> 8) & 0xf];
3911 *p++ = hexdigits[(ch >> 4) & 0xf];
3912 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003913 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003914 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00003915#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003916 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3917 if (ch >= 0xD800 && ch < 0xDC00) {
3918 Py_UNICODE ch2;
3919 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003920
Benjamin Peterson29060642009-01-31 22:14:21 +00003921 ch2 = *s++;
3922 size--;
3923 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3924 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3925 *p++ = '\\';
3926 *p++ = 'U';
3927 *p++ = hexdigits[(ucs >> 28) & 0xf];
3928 *p++ = hexdigits[(ucs >> 24) & 0xf];
3929 *p++ = hexdigits[(ucs >> 20) & 0xf];
3930 *p++ = hexdigits[(ucs >> 16) & 0xf];
3931 *p++ = hexdigits[(ucs >> 12) & 0xf];
3932 *p++ = hexdigits[(ucs >> 8) & 0xf];
3933 *p++ = hexdigits[(ucs >> 4) & 0xf];
3934 *p++ = hexdigits[ucs & 0xf];
3935 continue;
3936 }
3937 /* Fall through: isolated surrogates are copied as-is */
3938 s--;
3939 size++;
3940 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003941#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003942 /* Map 16-bit characters to '\uxxxx' */
3943 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003944 *p++ = '\\';
3945 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003946 *p++ = hexdigits[(ch >> 12) & 0xf];
3947 *p++ = hexdigits[(ch >> 8) & 0xf];
3948 *p++ = hexdigits[(ch >> 4) & 0xf];
3949 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003950 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003951 /* Copy everything else as-is */
3952 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00003953 *p++ = (char) ch;
3954 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003955 size = p - q;
3956
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003957 assert(size > 0);
3958 if (_PyBytes_Resize(&repr, size) < 0)
3959 return NULL;
3960 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003961}
3962
3963PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3964{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003965 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003966 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00003967 PyErr_BadArgument();
3968 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003969 }
Walter Dörwald711005d2007-05-12 12:03:26 +00003970 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3971 PyUnicode_GET_SIZE(unicode));
3972
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003973 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003974}
3975
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003976/* --- Unicode Internal Codec ------------------------------------------- */
3977
3978PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003979 Py_ssize_t size,
3980 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003981{
3982 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003983 Py_ssize_t startinpos;
3984 Py_ssize_t endinpos;
3985 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003986 PyUnicodeObject *v;
3987 Py_UNICODE *p;
3988 const char *end;
3989 const char *reason;
3990 PyObject *errorHandler = NULL;
3991 PyObject *exc = NULL;
3992
Neal Norwitzd43069c2006-01-08 01:12:10 +00003993#ifdef Py_UNICODE_WIDE
3994 Py_UNICODE unimax = PyUnicode_GetMax();
3995#endif
3996
Thomas Wouters89f507f2006-12-13 04:49:30 +00003997 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003998 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3999 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004000 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004001 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004002 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004003 p = PyUnicode_AS_UNICODE(v);
4004 end = s + size;
4005
4006 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00004007 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004008 /* We have to sanity check the raw data, otherwise doom looms for
4009 some malformed UCS-4 data. */
4010 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00004011#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004012 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00004013#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004014 end-s < Py_UNICODE_SIZE
4015 )
Benjamin Peterson29060642009-01-31 22:14:21 +00004016 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004017 startinpos = s - starts;
4018 if (end-s < Py_UNICODE_SIZE) {
4019 endinpos = end-starts;
4020 reason = "truncated input";
4021 }
4022 else {
4023 endinpos = s - starts + Py_UNICODE_SIZE;
4024 reason = "illegal code point (> 0x10FFFF)";
4025 }
4026 outpos = p - PyUnicode_AS_UNICODE(v);
4027 if (unicode_decode_call_errorhandler(
4028 errors, &errorHandler,
4029 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00004030 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00004031 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004032 goto onError;
4033 }
4034 }
4035 else {
4036 p++;
4037 s += Py_UNICODE_SIZE;
4038 }
4039 }
4040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004041 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004042 goto onError;
4043 Py_XDECREF(errorHandler);
4044 Py_XDECREF(exc);
4045 return (PyObject *)v;
4046
Benjamin Peterson29060642009-01-31 22:14:21 +00004047 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004048 Py_XDECREF(v);
4049 Py_XDECREF(errorHandler);
4050 Py_XDECREF(exc);
4051 return NULL;
4052}
4053
Guido van Rossumd57fd912000-03-10 22:53:23 +00004054/* --- Latin-1 Codec ------------------------------------------------------ */
4055
4056PyObject *PyUnicode_DecodeLatin1(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004057 Py_ssize_t size,
4058 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004059{
4060 PyUnicodeObject *v;
4061 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004062 const char *e, *unrolled_end;
Tim Petersced69f82003-09-16 20:30:58 +00004063
Guido van Rossumd57fd912000-03-10 22:53:23 +00004064 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004065 if (size == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004066 Py_UNICODE r = *(unsigned char*)s;
4067 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004068 }
4069
Guido van Rossumd57fd912000-03-10 22:53:23 +00004070 v = _PyUnicode_New(size);
4071 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004072 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004073 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004074 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004075 p = PyUnicode_AS_UNICODE(v);
Antoine Pitrouab868312009-01-10 15:40:25 +00004076 e = s + size;
4077 /* Unrolling the copy makes it much faster by reducing the looping
4078 overhead. This is similar to what many memcpy() implementations do. */
4079 unrolled_end = e - 4;
4080 while (s < unrolled_end) {
4081 p[0] = (unsigned char) s[0];
4082 p[1] = (unsigned char) s[1];
4083 p[2] = (unsigned char) s[2];
4084 p[3] = (unsigned char) s[3];
4085 s += 4;
4086 p += 4;
4087 }
4088 while (s < e)
4089 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004090 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004091
Benjamin Peterson29060642009-01-31 22:14:21 +00004092 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004093 Py_XDECREF(v);
4094 return NULL;
4095}
4096
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004097/* create or adjust a UnicodeEncodeError */
4098static void make_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004099 const char *encoding,
4100 const Py_UNICODE *unicode, Py_ssize_t size,
4101 Py_ssize_t startpos, Py_ssize_t endpos,
4102 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004103{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004104 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004105 *exceptionObject = PyUnicodeEncodeError_Create(
4106 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004107 }
4108 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004109 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
4110 goto onError;
4111 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
4112 goto onError;
4113 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
4114 goto onError;
4115 return;
4116 onError:
4117 Py_DECREF(*exceptionObject);
4118 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004119 }
4120}
4121
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004122/* raises a UnicodeEncodeError */
4123static void raise_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004124 const char *encoding,
4125 const Py_UNICODE *unicode, Py_ssize_t size,
4126 Py_ssize_t startpos, Py_ssize_t endpos,
4127 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004128{
4129 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004130 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004131 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004132 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004133}
4134
4135/* error handling callback helper:
4136 build arguments, call the callback and check the arguments,
4137 put the result into newpos and return the replacement string, which
4138 has to be freed by the caller */
4139static PyObject *unicode_encode_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00004140 PyObject **errorHandler,
4141 const char *encoding, const char *reason,
4142 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4143 Py_ssize_t startpos, Py_ssize_t endpos,
4144 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004145{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004146 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004147
4148 PyObject *restuple;
4149 PyObject *resunicode;
4150
4151 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004152 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004153 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004154 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004155 }
4156
4157 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004158 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004159 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004160 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004161
4162 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00004163 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004164 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004165 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004166 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004167 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004168 Py_DECREF(restuple);
4169 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004170 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004171 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00004172 &resunicode, newpos)) {
4173 Py_DECREF(restuple);
4174 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004175 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004176 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
4177 PyErr_SetString(PyExc_TypeError, &argparse[3]);
4178 Py_DECREF(restuple);
4179 return NULL;
4180 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004181 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004182 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004183 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004184 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4185 Py_DECREF(restuple);
4186 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004187 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004188 Py_INCREF(resunicode);
4189 Py_DECREF(restuple);
4190 return resunicode;
4191}
4192
4193static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004194 Py_ssize_t size,
4195 const char *errors,
4196 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004197{
4198 /* output object */
4199 PyObject *res;
4200 /* pointers to the beginning and end+1 of input */
4201 const Py_UNICODE *startp = p;
4202 const Py_UNICODE *endp = p + size;
4203 /* pointer to the beginning of the unencodable characters */
4204 /* const Py_UNICODE *badp = NULL; */
4205 /* pointer into the output */
4206 char *str;
4207 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004208 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004209 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
4210 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004211 PyObject *errorHandler = NULL;
4212 PyObject *exc = NULL;
4213 /* the following variable is used for caching string comparisons
4214 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4215 int known_errorHandler = -1;
4216
4217 /* allocate enough for a simple encoding without
4218 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004219 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00004220 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004221 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004222 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004223 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004224 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004225 ressize = size;
4226
4227 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004228 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004229
Benjamin Peterson29060642009-01-31 22:14:21 +00004230 /* can we encode this? */
4231 if (c<limit) {
4232 /* no overflow check, because we know that the space is enough */
4233 *str++ = (char)c;
4234 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004235 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004236 else {
4237 Py_ssize_t unicodepos = p-startp;
4238 Py_ssize_t requiredsize;
4239 PyObject *repunicode;
4240 Py_ssize_t repsize;
4241 Py_ssize_t newpos;
4242 Py_ssize_t respos;
4243 Py_UNICODE *uni2;
4244 /* startpos for collecting unencodable chars */
4245 const Py_UNICODE *collstart = p;
4246 const Py_UNICODE *collend = p;
4247 /* find all unecodable characters */
4248 while ((collend < endp) && ((*collend)>=limit))
4249 ++collend;
4250 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
4251 if (known_errorHandler==-1) {
4252 if ((errors==NULL) || (!strcmp(errors, "strict")))
4253 known_errorHandler = 1;
4254 else if (!strcmp(errors, "replace"))
4255 known_errorHandler = 2;
4256 else if (!strcmp(errors, "ignore"))
4257 known_errorHandler = 3;
4258 else if (!strcmp(errors, "xmlcharrefreplace"))
4259 known_errorHandler = 4;
4260 else
4261 known_errorHandler = 0;
4262 }
4263 switch (known_errorHandler) {
4264 case 1: /* strict */
4265 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
4266 goto onError;
4267 case 2: /* replace */
4268 while (collstart++<collend)
4269 *str++ = '?'; /* fall through */
4270 case 3: /* ignore */
4271 p = collend;
4272 break;
4273 case 4: /* xmlcharrefreplace */
4274 respos = str - PyBytes_AS_STRING(res);
4275 /* determine replacement size (temporarily (mis)uses p) */
4276 for (p = collstart, repsize = 0; p < collend; ++p) {
4277 if (*p<10)
4278 repsize += 2+1+1;
4279 else if (*p<100)
4280 repsize += 2+2+1;
4281 else if (*p<1000)
4282 repsize += 2+3+1;
4283 else if (*p<10000)
4284 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004285#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004286 else
4287 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004288#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004289 else if (*p<100000)
4290 repsize += 2+5+1;
4291 else if (*p<1000000)
4292 repsize += 2+6+1;
4293 else
4294 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004295#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004296 }
4297 requiredsize = respos+repsize+(endp-collend);
4298 if (requiredsize > ressize) {
4299 if (requiredsize<2*ressize)
4300 requiredsize = 2*ressize;
4301 if (_PyBytes_Resize(&res, requiredsize))
4302 goto onError;
4303 str = PyBytes_AS_STRING(res) + respos;
4304 ressize = requiredsize;
4305 }
4306 /* generate replacement (temporarily (mis)uses p) */
4307 for (p = collstart; p < collend; ++p) {
4308 str += sprintf(str, "&#%d;", (int)*p);
4309 }
4310 p = collend;
4311 break;
4312 default:
4313 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4314 encoding, reason, startp, size, &exc,
4315 collstart-startp, collend-startp, &newpos);
4316 if (repunicode == NULL)
4317 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004318 if (PyBytes_Check(repunicode)) {
4319 /* Directly copy bytes result to output. */
4320 repsize = PyBytes_Size(repunicode);
4321 if (repsize > 1) {
4322 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004323 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004324 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
4325 Py_DECREF(repunicode);
4326 goto onError;
4327 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004328 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004329 ressize += repsize-1;
4330 }
4331 memcpy(str, PyBytes_AsString(repunicode), repsize);
4332 str += repsize;
4333 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004334 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004335 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004336 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004337 /* need more space? (at least enough for what we
4338 have+the replacement+the rest of the string, so
4339 we won't have to check space for encodable characters) */
4340 respos = str - PyBytes_AS_STRING(res);
4341 repsize = PyUnicode_GET_SIZE(repunicode);
4342 requiredsize = respos+repsize+(endp-collend);
4343 if (requiredsize > ressize) {
4344 if (requiredsize<2*ressize)
4345 requiredsize = 2*ressize;
4346 if (_PyBytes_Resize(&res, requiredsize)) {
4347 Py_DECREF(repunicode);
4348 goto onError;
4349 }
4350 str = PyBytes_AS_STRING(res) + respos;
4351 ressize = requiredsize;
4352 }
4353 /* check if there is anything unencodable in the replacement
4354 and copy it to the output */
4355 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
4356 c = *uni2;
4357 if (c >= limit) {
4358 raise_encode_exception(&exc, encoding, startp, size,
4359 unicodepos, unicodepos+1, reason);
4360 Py_DECREF(repunicode);
4361 goto onError;
4362 }
4363 *str = (char)c;
4364 }
4365 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004366 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004367 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004368 }
4369 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004370 /* Resize if we allocated to much */
4371 size = str - PyBytes_AS_STRING(res);
4372 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00004373 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004374 if (_PyBytes_Resize(&res, size) < 0)
4375 goto onError;
4376 }
4377
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004378 Py_XDECREF(errorHandler);
4379 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004380 return res;
4381
4382 onError:
4383 Py_XDECREF(res);
4384 Py_XDECREF(errorHandler);
4385 Py_XDECREF(exc);
4386 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004387}
4388
Guido van Rossumd57fd912000-03-10 22:53:23 +00004389PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004390 Py_ssize_t size,
4391 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004392{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004393 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004394}
4395
4396PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
4397{
4398 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004399 PyErr_BadArgument();
4400 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004401 }
4402 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004403 PyUnicode_GET_SIZE(unicode),
4404 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004405}
4406
4407/* --- 7-bit ASCII Codec -------------------------------------------------- */
4408
Guido van Rossumd57fd912000-03-10 22:53:23 +00004409PyObject *PyUnicode_DecodeASCII(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004410 Py_ssize_t size,
4411 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004412{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004413 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004414 PyUnicodeObject *v;
4415 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004416 Py_ssize_t startinpos;
4417 Py_ssize_t endinpos;
4418 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004419 const char *e;
4420 PyObject *errorHandler = NULL;
4421 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004422
Guido van Rossumd57fd912000-03-10 22:53:23 +00004423 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004424 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004425 Py_UNICODE r = *(unsigned char*)s;
4426 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004427 }
Tim Petersced69f82003-09-16 20:30:58 +00004428
Guido van Rossumd57fd912000-03-10 22:53:23 +00004429 v = _PyUnicode_New(size);
4430 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004431 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004432 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004433 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004434 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004435 e = s + size;
4436 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004437 register unsigned char c = (unsigned char)*s;
4438 if (c < 128) {
4439 *p++ = c;
4440 ++s;
4441 }
4442 else {
4443 startinpos = s-starts;
4444 endinpos = startinpos + 1;
4445 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
4446 if (unicode_decode_call_errorhandler(
4447 errors, &errorHandler,
4448 "ascii", "ordinal not in range(128)",
4449 &starts, &e, &startinpos, &endinpos, &exc, &s,
4450 &v, &outpos, &p))
4451 goto onError;
4452 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004453 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00004454 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004455 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4456 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004457 Py_XDECREF(errorHandler);
4458 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004459 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004460
Benjamin Peterson29060642009-01-31 22:14:21 +00004461 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004462 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004463 Py_XDECREF(errorHandler);
4464 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004465 return NULL;
4466}
4467
Guido van Rossumd57fd912000-03-10 22:53:23 +00004468PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004469 Py_ssize_t size,
4470 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004471{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004472 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004473}
4474
4475PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
4476{
4477 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004478 PyErr_BadArgument();
4479 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004480 }
4481 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004482 PyUnicode_GET_SIZE(unicode),
4483 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004484}
4485
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004486#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004487
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004488/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004489
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00004490#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004491#define NEED_RETRY
4492#endif
4493
4494/* XXX This code is limited to "true" double-byte encodings, as
4495 a) it assumes an incomplete character consists of a single byte, and
4496 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00004497 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004498
/* Return 1 if the byte at s[offset] should be treated as a DBCS lead
   byte, 0 otherwise.  A byte that IsDBCSLeadByte() accepts counts when
   CharPrev() cannot step back from it, when the previous character does
   not start with a lead byte, or when the previous character is two
   bytes long. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
4509
4510/*
4511 * Decode MBCS string into unicode object. If 'final' is set, converts
4512 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
4513 */
4514static int decode_mbcs(PyUnicodeObject **v,
Benjamin Peterson29060642009-01-31 22:14:21 +00004515 const char *s, /* MBCS string */
4516 int size, /* sizeof MBCS string */
4517 int final)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004518{
4519 Py_UNICODE *p;
4520 Py_ssize_t n = 0;
4521 int usize = 0;
4522
4523 assert(size >= 0);
4524
4525 /* Skip trailing lead-byte unless 'final' is set */
4526 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00004527 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004528
4529 /* First get the size of the result */
4530 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004531 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
4532 if (usize == 0) {
4533 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4534 return -1;
4535 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004536 }
4537
4538 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004539 /* Create unicode object */
4540 *v = _PyUnicode_New(usize);
4541 if (*v == NULL)
4542 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004543 }
4544 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004545 /* Extend unicode object */
4546 n = PyUnicode_GET_SIZE(*v);
4547 if (_PyUnicode_Resize(v, n + usize) < 0)
4548 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004549 }
4550
4551 /* Do the conversion */
4552 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004553 p = PyUnicode_AS_UNICODE(*v) + n;
4554 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
4555 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4556 return -1;
4557 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004558 }
4559
4560 return size;
4561}
4562
4563PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004564 Py_ssize_t size,
4565 const char *errors,
4566 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004567{
4568 PyUnicodeObject *v = NULL;
4569 int done;
4570
4571 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004572 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004573
4574#ifdef NEED_RETRY
4575 retry:
4576 if (size > INT_MAX)
Benjamin Peterson29060642009-01-31 22:14:21 +00004577 done = decode_mbcs(&v, s, INT_MAX, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004578 else
4579#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004580 done = decode_mbcs(&v, s, (int)size, !consumed);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004581
4582 if (done < 0) {
4583 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00004584 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004585 }
4586
4587 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004588 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004589
4590#ifdef NEED_RETRY
4591 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004592 s += done;
4593 size -= done;
4594 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004595 }
4596#endif
4597
4598 return (PyObject *)v;
4599}
4600
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004601PyObject *PyUnicode_DecodeMBCS(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004602 Py_ssize_t size,
4603 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004604{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004605 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
4606}
4607
4608/*
4609 * Convert unicode into string object (MBCS).
4610 * Returns 0 if succeed, -1 otherwise.
4611 */
4612static int encode_mbcs(PyObject **repr,
Benjamin Peterson29060642009-01-31 22:14:21 +00004613 const Py_UNICODE *p, /* unicode */
4614 int size) /* size of unicode */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004615{
4616 int mbcssize = 0;
4617 Py_ssize_t n = 0;
4618
4619 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004620
4621 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004622 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004623 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
4624 if (mbcssize == 0) {
4625 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4626 return -1;
4627 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004628 }
4629
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004630 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004631 /* Create string object */
4632 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
4633 if (*repr == NULL)
4634 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004635 }
4636 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004637 /* Extend string object */
4638 n = PyBytes_Size(*repr);
4639 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
4640 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004641 }
4642
4643 /* Do the conversion */
4644 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004645 char *s = PyBytes_AS_STRING(*repr) + n;
4646 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
4647 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4648 return -1;
4649 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004650 }
4651
4652 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004653}
4654
4655PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004656 Py_ssize_t size,
4657 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004658{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004659 PyObject *repr = NULL;
4660 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00004661
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004662#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00004663 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004664 if (size > INT_MAX)
Benjamin Peterson29060642009-01-31 22:14:21 +00004665 ret = encode_mbcs(&repr, p, INT_MAX);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004666 else
4667#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004668 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004669
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004670 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004671 Py_XDECREF(repr);
4672 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004673 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004674
4675#ifdef NEED_RETRY
4676 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004677 p += INT_MAX;
4678 size -= INT_MAX;
4679 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004680 }
4681#endif
4682
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004683 return repr;
4684}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004685
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004686PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4687{
4688 if (!PyUnicode_Check(unicode)) {
4689 PyErr_BadArgument();
4690 return NULL;
4691 }
4692 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004693 PyUnicode_GET_SIZE(unicode),
4694 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004695}
4696
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004697#undef NEED_RETRY
4698
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004699#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004700
Guido van Rossumd57fd912000-03-10 22:53:23 +00004701/* --- Character Mapping Codec -------------------------------------------- */
4702
Guido van Rossumd57fd912000-03-10 22:53:23 +00004703PyObject *PyUnicode_DecodeCharmap(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004704 Py_ssize_t size,
4705 PyObject *mapping,
4706 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004707{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004708 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004709 Py_ssize_t startinpos;
4710 Py_ssize_t endinpos;
4711 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004712 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004713 PyUnicodeObject *v;
4714 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004715 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004716 PyObject *errorHandler = NULL;
4717 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004718 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004719 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004720
Guido van Rossumd57fd912000-03-10 22:53:23 +00004721 /* Default to Latin-1 */
4722 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004723 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004724
4725 v = _PyUnicode_New(size);
4726 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004727 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004728 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004729 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004730 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004731 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004732 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004733 mapstring = PyUnicode_AS_UNICODE(mapping);
4734 maplen = PyUnicode_GET_SIZE(mapping);
4735 while (s < e) {
4736 unsigned char ch = *s;
4737 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004738
Benjamin Peterson29060642009-01-31 22:14:21 +00004739 if (ch < maplen)
4740 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004741
Benjamin Peterson29060642009-01-31 22:14:21 +00004742 if (x == 0xfffe) {
4743 /* undefined mapping */
4744 outpos = p-PyUnicode_AS_UNICODE(v);
4745 startinpos = s-starts;
4746 endinpos = startinpos+1;
4747 if (unicode_decode_call_errorhandler(
4748 errors, &errorHandler,
4749 "charmap", "character maps to <undefined>",
4750 &starts, &e, &startinpos, &endinpos, &exc, &s,
4751 &v, &outpos, &p)) {
4752 goto onError;
4753 }
4754 continue;
4755 }
4756 *p++ = x;
4757 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004758 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004759 }
4760 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004761 while (s < e) {
4762 unsigned char ch = *s;
4763 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004764
Benjamin Peterson29060642009-01-31 22:14:21 +00004765 /* Get mapping (char ordinal -> integer, Unicode char or None) */
4766 w = PyLong_FromLong((long)ch);
4767 if (w == NULL)
4768 goto onError;
4769 x = PyObject_GetItem(mapping, w);
4770 Py_DECREF(w);
4771 if (x == NULL) {
4772 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4773 /* No mapping found means: mapping is undefined. */
4774 PyErr_Clear();
4775 x = Py_None;
4776 Py_INCREF(x);
4777 } else
4778 goto onError;
4779 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004780
Benjamin Peterson29060642009-01-31 22:14:21 +00004781 /* Apply mapping */
4782 if (PyLong_Check(x)) {
4783 long value = PyLong_AS_LONG(x);
4784 if (value < 0 || value > 65535) {
4785 PyErr_SetString(PyExc_TypeError,
4786 "character mapping must be in range(65536)");
4787 Py_DECREF(x);
4788 goto onError;
4789 }
4790 *p++ = (Py_UNICODE)value;
4791 }
4792 else if (x == Py_None) {
4793 /* undefined mapping */
4794 outpos = p-PyUnicode_AS_UNICODE(v);
4795 startinpos = s-starts;
4796 endinpos = startinpos+1;
4797 if (unicode_decode_call_errorhandler(
4798 errors, &errorHandler,
4799 "charmap", "character maps to <undefined>",
4800 &starts, &e, &startinpos, &endinpos, &exc, &s,
4801 &v, &outpos, &p)) {
4802 Py_DECREF(x);
4803 goto onError;
4804 }
4805 Py_DECREF(x);
4806 continue;
4807 }
4808 else if (PyUnicode_Check(x)) {
4809 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004810
Benjamin Peterson29060642009-01-31 22:14:21 +00004811 if (targetsize == 1)
4812 /* 1-1 mapping */
4813 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004814
Benjamin Peterson29060642009-01-31 22:14:21 +00004815 else if (targetsize > 1) {
4816 /* 1-n mapping */
4817 if (targetsize > extrachars) {
4818 /* resize first */
4819 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4820 Py_ssize_t needed = (targetsize - extrachars) + \
4821 (targetsize << 2);
4822 extrachars += needed;
4823 /* XXX overflow detection missing */
4824 if (_PyUnicode_Resize(&v,
4825 PyUnicode_GET_SIZE(v) + needed) < 0) {
4826 Py_DECREF(x);
4827 goto onError;
4828 }
4829 p = PyUnicode_AS_UNICODE(v) + oldpos;
4830 }
4831 Py_UNICODE_COPY(p,
4832 PyUnicode_AS_UNICODE(x),
4833 targetsize);
4834 p += targetsize;
4835 extrachars -= targetsize;
4836 }
4837 /* 1-0 mapping: skip the character */
4838 }
4839 else {
4840 /* wrong return value */
4841 PyErr_SetString(PyExc_TypeError,
4842 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00004843 Py_DECREF(x);
4844 goto onError;
4845 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004846 Py_DECREF(x);
4847 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004848 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004849 }
4850 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004851 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4852 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004853 Py_XDECREF(errorHandler);
4854 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004855 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004856
Benjamin Peterson29060642009-01-31 22:14:21 +00004857 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004858 Py_XDECREF(errorHandler);
4859 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004860 Py_XDECREF(v);
4861 return NULL;
4862}
4863
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004864/* Charmap encoding: the lookup table */
4865
4866struct encoding_map{
Benjamin Peterson29060642009-01-31 22:14:21 +00004867 PyObject_HEAD
4868 unsigned char level1[32];
4869 int count2, count3;
4870 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004871};
4872
4873static PyObject*
4874encoding_map_size(PyObject *obj, PyObject* args)
4875{
4876 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004877 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00004878 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004879}
4880
4881static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00004882 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00004883 PyDoc_STR("Return the size (in bytes) of this object") },
4884 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004885};
4886
4887static void
4888encoding_map_dealloc(PyObject* o)
4889{
Benjamin Peterson14339b62009-01-31 16:36:08 +00004890 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004891}
4892
/* Type object for the "EncodingMap" lookup table.  Only three slots are
   populated: tp_dealloc (encoding_map_dealloc), tp_flags
   (Py_TPFLAGS_DEFAULT) and tp_methods (encoding_map_methods).  tp_new
   is 0, and the instances built in this file are allocated directly by
   PyUnicode_BuildEncodingMap(). */
static PyTypeObject EncodingMapType = {
    PyVarObject_HEAD_INIT(NULL, 0)
    "EncodingMap",          /*tp_name*/
    sizeof(struct encoding_map),   /*tp_basicsize*/
    0,                      /*tp_itemsize*/
    /* methods */
    encoding_map_dealloc,   /*tp_dealloc*/
    0,                      /*tp_print*/
    0,                      /*tp_getattr*/
    0,                      /*tp_setattr*/
    0,                      /*tp_reserved*/
    0,                      /*tp_repr*/
    0,                      /*tp_as_number*/
    0,                      /*tp_as_sequence*/
    0,                      /*tp_as_mapping*/
    0,                      /*tp_hash*/
    0,                      /*tp_call*/
    0,                      /*tp_str*/
    0,                      /*tp_getattro*/
    0,                      /*tp_setattro*/
    0,                      /*tp_as_buffer*/
    Py_TPFLAGS_DEFAULT,     /*tp_flags*/
    0,                      /*tp_doc*/
    0,                      /*tp_traverse*/
    0,                      /*tp_clear*/
    0,                      /*tp_richcompare*/
    0,                      /*tp_weaklistoffset*/
    0,                      /*tp_iter*/
    0,                      /*tp_iternext*/
    encoding_map_methods,   /*tp_methods*/
    0,                      /*tp_members*/
    0,                      /*tp_getset*/
    0,                      /*tp_base*/
    0,                      /*tp_dict*/
    0,                      /*tp_descr_get*/
    0,                      /*tp_descr_set*/
    0,                      /*tp_dictoffset*/
    0,                      /*tp_init*/
    0,                      /*tp_alloc*/
    0,                      /*tp_new*/
    0,                      /*tp_free*/
    0,                      /*tp_is_gc*/
};
4936
4937PyObject*
4938PyUnicode_BuildEncodingMap(PyObject* string)
4939{
4940 Py_UNICODE *decode;
4941 PyObject *result;
4942 struct encoding_map *mresult;
4943 int i;
4944 int need_dict = 0;
4945 unsigned char level1[32];
4946 unsigned char level2[512];
4947 unsigned char *mlevel1, *mlevel2, *mlevel3;
4948 int count2 = 0, count3 = 0;
4949
4950 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4951 PyErr_BadArgument();
4952 return NULL;
4953 }
4954 decode = PyUnicode_AS_UNICODE(string);
4955 memset(level1, 0xFF, sizeof level1);
4956 memset(level2, 0xFF, sizeof level2);
4957
4958 /* If there isn't a one-to-one mapping of NULL to \0,
4959 or if there are non-BMP characters, we need to use
4960 a mapping dictionary. */
4961 if (decode[0] != 0)
4962 need_dict = 1;
4963 for (i = 1; i < 256; i++) {
4964 int l1, l2;
4965 if (decode[i] == 0
Benjamin Peterson29060642009-01-31 22:14:21 +00004966#ifdef Py_UNICODE_WIDE
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004967 || decode[i] > 0xFFFF
Benjamin Peterson29060642009-01-31 22:14:21 +00004968#endif
4969 ) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004970 need_dict = 1;
4971 break;
4972 }
4973 if (decode[i] == 0xFFFE)
4974 /* unmapped character */
4975 continue;
4976 l1 = decode[i] >> 11;
4977 l2 = decode[i] >> 7;
4978 if (level1[l1] == 0xFF)
4979 level1[l1] = count2++;
4980 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00004981 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004982 }
4983
4984 if (count2 >= 0xFF || count3 >= 0xFF)
4985 need_dict = 1;
4986
4987 if (need_dict) {
4988 PyObject *result = PyDict_New();
4989 PyObject *key, *value;
4990 if (!result)
4991 return NULL;
4992 for (i = 0; i < 256; i++) {
4993 key = value = NULL;
Christian Heimes217cfd12007-12-02 14:31:20 +00004994 key = PyLong_FromLong(decode[i]);
4995 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004996 if (!key || !value)
4997 goto failed1;
4998 if (PyDict_SetItem(result, key, value) == -1)
4999 goto failed1;
5000 Py_DECREF(key);
5001 Py_DECREF(value);
5002 }
5003 return result;
5004 failed1:
5005 Py_XDECREF(key);
5006 Py_XDECREF(value);
5007 Py_DECREF(result);
5008 return NULL;
5009 }
5010
5011 /* Create a three-level trie */
5012 result = PyObject_MALLOC(sizeof(struct encoding_map) +
5013 16*count2 + 128*count3 - 1);
5014 if (!result)
5015 return PyErr_NoMemory();
5016 PyObject_Init(result, &EncodingMapType);
5017 mresult = (struct encoding_map*)result;
5018 mresult->count2 = count2;
5019 mresult->count3 = count3;
5020 mlevel1 = mresult->level1;
5021 mlevel2 = mresult->level23;
5022 mlevel3 = mresult->level23 + 16*count2;
5023 memcpy(mlevel1, level1, 32);
5024 memset(mlevel2, 0xFF, 16*count2);
5025 memset(mlevel3, 0, 128*count3);
5026 count3 = 0;
5027 for (i = 1; i < 256; i++) {
5028 int o1, o2, o3, i2, i3;
5029 if (decode[i] == 0xFFFE)
5030 /* unmapped character */
5031 continue;
5032 o1 = decode[i]>>11;
5033 o2 = (decode[i]>>7) & 0xF;
5034 i2 = 16*mlevel1[o1] + o2;
5035 if (mlevel2[i2] == 0xFF)
5036 mlevel2[i2] = count3++;
5037 o3 = decode[i] & 0x7F;
5038 i3 = 128*mlevel2[i2] + o3;
5039 mlevel3[i3] = i;
5040 }
5041 return result;
5042}
5043
5044static int
5045encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
5046{
5047 struct encoding_map *map = (struct encoding_map*)mapping;
5048 int l1 = c>>11;
5049 int l2 = (c>>7) & 0xF;
5050 int l3 = c & 0x7F;
5051 int i;
5052
5053#ifdef Py_UNICODE_WIDE
5054 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005055 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005056 }
5057#endif
5058 if (c == 0)
5059 return 0;
5060 /* level 1*/
5061 i = map->level1[l1];
5062 if (i == 0xFF) {
5063 return -1;
5064 }
5065 /* level 2*/
5066 i = map->level23[16*i+l2];
5067 if (i == 0xFF) {
5068 return -1;
5069 }
5070 /* level 3 */
5071 i = map->level23[16*map->count2 + 128*i + l3];
5072 if (i == 0) {
5073 return -1;
5074 }
5075 return i;
5076}
5077
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005078/* Lookup the character ch in the mapping. If the character
5079 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00005080 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005081static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005082{
Christian Heimes217cfd12007-12-02 14:31:20 +00005083 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005084 PyObject *x;
5085
5086 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005087 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005088 x = PyObject_GetItem(mapping, w);
5089 Py_DECREF(w);
5090 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005091 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5092 /* No mapping found means: mapping is undefined. */
5093 PyErr_Clear();
5094 x = Py_None;
5095 Py_INCREF(x);
5096 return x;
5097 } else
5098 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005099 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00005100 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005101 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00005102 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005103 long value = PyLong_AS_LONG(x);
5104 if (value < 0 || value > 255) {
5105 PyErr_SetString(PyExc_TypeError,
5106 "character mapping must be in range(256)");
5107 Py_DECREF(x);
5108 return NULL;
5109 }
5110 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005111 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005112 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00005113 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005114 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005115 /* wrong return value */
5116 PyErr_Format(PyExc_TypeError,
5117 "character mapping must return integer, bytes or None, not %.400s",
5118 x->ob_type->tp_name);
5119 Py_DECREF(x);
5120 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005121 }
5122}
5123
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005124static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00005125charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005126{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005127 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
5128 /* exponentially overallocate to minimize reallocations */
5129 if (requiredsize < 2*outsize)
5130 requiredsize = 2*outsize;
5131 if (_PyBytes_Resize(outobj, requiredsize))
5132 return -1;
5133 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005134}
5135
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* the lookup or the output resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005139/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00005140 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005141 space is available. Return a new reference to the object that
5142 was put in the output buffer, or Py_None, if the mapping was undefined
5143 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00005144 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005145static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005146charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00005147 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005148{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005149 PyObject *rep;
5150 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00005151 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005152
Christian Heimes90aa7642007-12-19 02:45:37 +00005153 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005154 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00005155 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005156 if (res == -1)
5157 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00005158 if (outsize<requiredsize)
5159 if (charmapencode_resize(outobj, outpos, requiredsize))
5160 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00005161 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005162 outstart[(*outpos)++] = (char)res;
5163 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005164 }
5165
5166 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005167 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005168 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005169 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005170 Py_DECREF(rep);
5171 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005172 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005173 if (PyLong_Check(rep)) {
5174 Py_ssize_t requiredsize = *outpos+1;
5175 if (outsize<requiredsize)
5176 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5177 Py_DECREF(rep);
5178 return enc_EXCEPTION;
5179 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005180 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005181 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005182 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005183 else {
5184 const char *repchars = PyBytes_AS_STRING(rep);
5185 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
5186 Py_ssize_t requiredsize = *outpos+repsize;
5187 if (outsize<requiredsize)
5188 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5189 Py_DECREF(rep);
5190 return enc_EXCEPTION;
5191 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005192 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005193 memcpy(outstart + *outpos, repchars, repsize);
5194 *outpos += repsize;
5195 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005196 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005197 Py_DECREF(rep);
5198 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005199}
5200
5201/* handle an error in PyUnicode_EncodeCharmap
5202 Return 0 on success, -1 on error */
5203static
5204int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00005205 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005206 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00005207 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00005208 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005209{
5210 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005211 Py_ssize_t repsize;
5212 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005213 Py_UNICODE *uni2;
5214 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005215 Py_ssize_t collstartpos = *inpos;
5216 Py_ssize_t collendpos = *inpos+1;
5217 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005218 char *encoding = "charmap";
5219 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005220 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005221
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005222 /* find all unencodable characters */
5223 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005224 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00005225 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005226 int res = encoding_map_lookup(p[collendpos], mapping);
5227 if (res != -1)
5228 break;
5229 ++collendpos;
5230 continue;
5231 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005232
Benjamin Peterson29060642009-01-31 22:14:21 +00005233 rep = charmapencode_lookup(p[collendpos], mapping);
5234 if (rep==NULL)
5235 return -1;
5236 else if (rep!=Py_None) {
5237 Py_DECREF(rep);
5238 break;
5239 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005240 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00005241 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005242 }
5243 /* cache callback name lookup
5244 * (if not done yet, i.e. it's the first error) */
5245 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005246 if ((errors==NULL) || (!strcmp(errors, "strict")))
5247 *known_errorHandler = 1;
5248 else if (!strcmp(errors, "replace"))
5249 *known_errorHandler = 2;
5250 else if (!strcmp(errors, "ignore"))
5251 *known_errorHandler = 3;
5252 else if (!strcmp(errors, "xmlcharrefreplace"))
5253 *known_errorHandler = 4;
5254 else
5255 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005256 }
5257 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005258 case 1: /* strict */
5259 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5260 return -1;
5261 case 2: /* replace */
5262 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005263 x = charmapencode_output('?', mapping, res, respos);
5264 if (x==enc_EXCEPTION) {
5265 return -1;
5266 }
5267 else if (x==enc_FAILED) {
5268 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5269 return -1;
5270 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005271 }
5272 /* fall through */
5273 case 3: /* ignore */
5274 *inpos = collendpos;
5275 break;
5276 case 4: /* xmlcharrefreplace */
5277 /* generate replacement (temporarily (mis)uses p) */
5278 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005279 char buffer[2+29+1+1];
5280 char *cp;
5281 sprintf(buffer, "&#%d;", (int)p[collpos]);
5282 for (cp = buffer; *cp; ++cp) {
5283 x = charmapencode_output(*cp, mapping, res, respos);
5284 if (x==enc_EXCEPTION)
5285 return -1;
5286 else if (x==enc_FAILED) {
5287 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5288 return -1;
5289 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005290 }
5291 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005292 *inpos = collendpos;
5293 break;
5294 default:
5295 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00005296 encoding, reason, p, size, exceptionObject,
5297 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005298 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005299 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005300 if (PyBytes_Check(repunicode)) {
5301 /* Directly copy bytes result to output. */
5302 Py_ssize_t outsize = PyBytes_Size(*res);
5303 Py_ssize_t requiredsize;
5304 repsize = PyBytes_Size(repunicode);
5305 requiredsize = *respos + repsize;
5306 if (requiredsize > outsize)
5307 /* Make room for all additional bytes. */
5308 if (charmapencode_resize(res, respos, requiredsize)) {
5309 Py_DECREF(repunicode);
5310 return -1;
5311 }
5312 memcpy(PyBytes_AsString(*res) + *respos,
5313 PyBytes_AsString(repunicode), repsize);
5314 *respos += repsize;
5315 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005316 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005317 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005318 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005319 /* generate replacement */
5320 repsize = PyUnicode_GET_SIZE(repunicode);
5321 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005322 x = charmapencode_output(*uni2, mapping, res, respos);
5323 if (x==enc_EXCEPTION) {
5324 return -1;
5325 }
5326 else if (x==enc_FAILED) {
5327 Py_DECREF(repunicode);
5328 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5329 return -1;
5330 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005331 }
5332 *inpos = newpos;
5333 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005334 }
5335 return 0;
5336}
5337
Guido van Rossumd57fd912000-03-10 22:53:23 +00005338PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005339 Py_ssize_t size,
5340 PyObject *mapping,
5341 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005342{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005343 /* output object */
5344 PyObject *res = NULL;
5345 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005346 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005347 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005348 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005349 PyObject *errorHandler = NULL;
5350 PyObject *exc = NULL;
5351 /* the following variable is used for caching string comparisons
5352 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5353 * 3=ignore, 4=xmlcharrefreplace */
5354 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005355
5356 /* Default to Latin-1 */
5357 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005358 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005359
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005360 /* allocate enough for a simple encoding without
5361 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00005362 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005363 if (res == NULL)
5364 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005365 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005366 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005367
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005368 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005369 /* try to encode it */
5370 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
5371 if (x==enc_EXCEPTION) /* error */
5372 goto onError;
5373 if (x==enc_FAILED) { /* unencodable character */
5374 if (charmap_encoding_error(p, size, &inpos, mapping,
5375 &exc,
5376 &known_errorHandler, &errorHandler, errors,
5377 &res, &respos)) {
5378 goto onError;
5379 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005380 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005381 else
5382 /* done with this character => adjust input position */
5383 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005384 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005385
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005386 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00005387 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005388 if (_PyBytes_Resize(&res, respos) < 0)
5389 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005390
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005391 Py_XDECREF(exc);
5392 Py_XDECREF(errorHandler);
5393 return res;
5394
Benjamin Peterson29060642009-01-31 22:14:21 +00005395 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005396 Py_XDECREF(res);
5397 Py_XDECREF(exc);
5398 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005399 return NULL;
5400}
5401
5402PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00005403 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005404{
5405 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005406 PyErr_BadArgument();
5407 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005408 }
5409 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005410 PyUnicode_GET_SIZE(unicode),
5411 mapping,
5412 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005413}
5414
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005415/* create or adjust a UnicodeTranslateError */
5416static void make_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005417 const Py_UNICODE *unicode, Py_ssize_t size,
5418 Py_ssize_t startpos, Py_ssize_t endpos,
5419 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005420{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005421 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005422 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00005423 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005424 }
5425 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005426 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
5427 goto onError;
5428 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
5429 goto onError;
5430 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
5431 goto onError;
5432 return;
5433 onError:
5434 Py_DECREF(*exceptionObject);
5435 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005436 }
5437}
5438
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005439/* raises a UnicodeTranslateError */
5440static void raise_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005441 const Py_UNICODE *unicode, Py_ssize_t size,
5442 Py_ssize_t startpos, Py_ssize_t endpos,
5443 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005444{
5445 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005446 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005447 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005448 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005449}
5450
5451/* error handling callback helper:
5452 build arguments, call the callback and check the arguments,
5453 put the result into newpos and return the replacement string, which
5454 has to be freed by the caller */
5455static PyObject *unicode_translate_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00005456 PyObject **errorHandler,
5457 const char *reason,
5458 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5459 Py_ssize_t startpos, Py_ssize_t endpos,
5460 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005461{
Benjamin Peterson142957c2008-07-04 19:55:29 +00005462 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005463
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005464 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005465 PyObject *restuple;
5466 PyObject *resunicode;
5467
5468 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005469 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005470 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005471 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005472 }
5473
5474 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005475 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005476 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005477 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005478
5479 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00005480 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005481 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005482 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005483 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00005484 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00005485 Py_DECREF(restuple);
5486 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005487 }
5488 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00005489 &resunicode, &i_newpos)) {
5490 Py_DECREF(restuple);
5491 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005492 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00005493 if (i_newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005494 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005495 else
5496 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005497 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005498 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
5499 Py_DECREF(restuple);
5500 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005501 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005502 Py_INCREF(resunicode);
5503 Py_DECREF(restuple);
5504 return resunicode;
5505}
5506
5507/* Lookup the character ch in the mapping and put the result in result,
5508 which must be decrefed by the caller.
5509 Return 0 on success, -1 on error */
5510static
5511int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
5512{
Christian Heimes217cfd12007-12-02 14:31:20 +00005513 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005514 PyObject *x;
5515
5516 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005517 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005518 x = PyObject_GetItem(mapping, w);
5519 Py_DECREF(w);
5520 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005521 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5522 /* No mapping found means: use 1:1 mapping. */
5523 PyErr_Clear();
5524 *result = NULL;
5525 return 0;
5526 } else
5527 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005528 }
5529 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005530 *result = x;
5531 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005532 }
Christian Heimes217cfd12007-12-02 14:31:20 +00005533 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005534 long value = PyLong_AS_LONG(x);
5535 long max = PyUnicode_GetMax();
5536 if (value < 0 || value > max) {
5537 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00005538 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00005539 Py_DECREF(x);
5540 return -1;
5541 }
5542 *result = x;
5543 return 0;
5544 }
5545 else if (PyUnicode_Check(x)) {
5546 *result = x;
5547 return 0;
5548 }
5549 else {
5550 /* wrong return value */
5551 PyErr_SetString(PyExc_TypeError,
5552 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00005553 Py_DECREF(x);
5554 return -1;
5555 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005556}
5557/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00005558 if not reallocate and adjust various state variables.
5559 Return 0 on success, -1 on error */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005560static
Walter Dörwald4894c302003-10-24 14:25:28 +00005561int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Peterson29060642009-01-31 22:14:21 +00005562 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005563{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005564 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00005565 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005566 /* remember old output position */
5567 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
5568 /* exponentially overallocate to minimize reallocations */
5569 if (requiredsize < 2 * oldsize)
5570 requiredsize = 2 * oldsize;
5571 if (PyUnicode_Resize(outobj, requiredsize) < 0)
5572 return -1;
5573 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005574 }
5575 return 0;
5576}
5577/* lookup the character, put the result in the output string and adjust
5578 various state variables. Return a new reference to the object that
5579 was put in the output buffer in *result, or Py_None, if the mapping was
5580 undefined (in which case no character was written).
5581 The called must decref result.
5582 Return 0 on success, -1 on error. */
5583static
Walter Dörwald4894c302003-10-24 14:25:28 +00005584int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Benjamin Peterson29060642009-01-31 22:14:21 +00005585 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
5586 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005587{
Walter Dörwald4894c302003-10-24 14:25:28 +00005588 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00005589 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005590 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005591 /* not found => default to 1:1 mapping */
5592 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005593 }
5594 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005595 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00005596 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005597 /* no overflow check, because we know that the space is enough */
5598 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005599 }
5600 else if (PyUnicode_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005601 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
5602 if (repsize==1) {
5603 /* no overflow check, because we know that the space is enough */
5604 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
5605 }
5606 else if (repsize!=0) {
5607 /* more than one character */
5608 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
5609 (insize - (curinp-startinp)) +
5610 repsize - 1;
5611 if (charmaptranslate_makespace(outobj, outp, requiredsize))
5612 return -1;
5613 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
5614 *outp += repsize;
5615 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005616 }
5617 else
Benjamin Peterson29060642009-01-31 22:14:21 +00005618 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005619 return 0;
5620}
5621
5622PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005623 Py_ssize_t size,
5624 PyObject *mapping,
5625 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005626{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005627 /* output object */
5628 PyObject *res = NULL;
5629 /* pointers to the beginning and end+1 of input */
5630 const Py_UNICODE *startp = p;
5631 const Py_UNICODE *endp = p + size;
5632 /* pointer into the output */
5633 Py_UNICODE *str;
5634 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005635 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005636 char *reason = "character maps to <undefined>";
5637 PyObject *errorHandler = NULL;
5638 PyObject *exc = NULL;
5639 /* the following variable is used for caching string comparisons
5640 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5641 * 3=ignore, 4=xmlcharrefreplace */
5642 int known_errorHandler = -1;
5643
Guido van Rossumd57fd912000-03-10 22:53:23 +00005644 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005645 PyErr_BadArgument();
5646 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005647 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005648
5649 /* allocate enough for a simple 1:1 translation without
5650 replacements, if we need more, we'll resize */
5651 res = PyUnicode_FromUnicode(NULL, size);
5652 if (res == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005653 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005654 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005655 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005656 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005657
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005658 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005659 /* try to encode it */
5660 PyObject *x = NULL;
5661 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
5662 Py_XDECREF(x);
5663 goto onError;
5664 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005665 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00005666 if (x!=Py_None) /* it worked => adjust input pointer */
5667 ++p;
5668 else { /* untranslatable character */
5669 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
5670 Py_ssize_t repsize;
5671 Py_ssize_t newpos;
5672 Py_UNICODE *uni2;
5673 /* startpos for collecting untranslatable chars */
5674 const Py_UNICODE *collstart = p;
5675 const Py_UNICODE *collend = p+1;
5676 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005677
Benjamin Peterson29060642009-01-31 22:14:21 +00005678 /* find all untranslatable characters */
5679 while (collend < endp) {
5680 if (charmaptranslate_lookup(*collend, mapping, &x))
5681 goto onError;
5682 Py_XDECREF(x);
5683 if (x!=Py_None)
5684 break;
5685 ++collend;
5686 }
5687 /* cache callback name lookup
5688 * (if not done yet, i.e. it's the first error) */
5689 if (known_errorHandler==-1) {
5690 if ((errors==NULL) || (!strcmp(errors, "strict")))
5691 known_errorHandler = 1;
5692 else if (!strcmp(errors, "replace"))
5693 known_errorHandler = 2;
5694 else if (!strcmp(errors, "ignore"))
5695 known_errorHandler = 3;
5696 else if (!strcmp(errors, "xmlcharrefreplace"))
5697 known_errorHandler = 4;
5698 else
5699 known_errorHandler = 0;
5700 }
5701 switch (known_errorHandler) {
5702 case 1: /* strict */
5703 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005704 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005705 case 2: /* replace */
5706 /* No need to check for space, this is a 1:1 replacement */
5707 for (coll = collstart; coll<collend; ++coll)
5708 *str++ = '?';
5709 /* fall through */
5710 case 3: /* ignore */
5711 p = collend;
5712 break;
5713 case 4: /* xmlcharrefreplace */
5714 /* generate replacement (temporarily (mis)uses p) */
5715 for (p = collstart; p < collend; ++p) {
5716 char buffer[2+29+1+1];
5717 char *cp;
5718 sprintf(buffer, "&#%d;", (int)*p);
5719 if (charmaptranslate_makespace(&res, &str,
5720 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5721 goto onError;
5722 for (cp = buffer; *cp; ++cp)
5723 *str++ = *cp;
5724 }
5725 p = collend;
5726 break;
5727 default:
5728 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5729 reason, startp, size, &exc,
5730 collstart-startp, collend-startp, &newpos);
5731 if (repunicode == NULL)
5732 goto onError;
5733 /* generate replacement */
5734 repsize = PyUnicode_GET_SIZE(repunicode);
5735 if (charmaptranslate_makespace(&res, &str,
5736 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5737 Py_DECREF(repunicode);
5738 goto onError;
5739 }
5740 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5741 *str++ = *uni2;
5742 p = startp + newpos;
5743 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005744 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005745 }
5746 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005747 /* Resize if we allocated to much */
5748 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00005749 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005750 if (PyUnicode_Resize(&res, respos) < 0)
5751 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005752 }
5753 Py_XDECREF(exc);
5754 Py_XDECREF(errorHandler);
5755 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005756
Benjamin Peterson29060642009-01-31 22:14:21 +00005757 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005758 Py_XDECREF(res);
5759 Py_XDECREF(exc);
5760 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005761 return NULL;
5762}
5763
5764PyObject *PyUnicode_Translate(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00005765 PyObject *mapping,
5766 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005767{
5768 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005769
Guido van Rossumd57fd912000-03-10 22:53:23 +00005770 str = PyUnicode_FromObject(str);
5771 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005772 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005773 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Peterson29060642009-01-31 22:14:21 +00005774 PyUnicode_GET_SIZE(str),
5775 mapping,
5776 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005777 Py_DECREF(str);
5778 return result;
Tim Petersced69f82003-09-16 20:30:58 +00005779
Benjamin Peterson29060642009-01-31 22:14:21 +00005780 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005781 Py_XDECREF(str);
5782 return NULL;
5783}
Tim Petersced69f82003-09-16 20:30:58 +00005784
Guido van Rossum9e896b32000-04-05 20:11:21 +00005785/* --- Decimal Encoder ---------------------------------------------------- */
5786
5787int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005788 Py_ssize_t length,
5789 char *output,
5790 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00005791{
5792 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005793 PyObject *errorHandler = NULL;
5794 PyObject *exc = NULL;
5795 const char *encoding = "decimal";
5796 const char *reason = "invalid decimal Unicode string";
5797 /* the following variable is used for caching string comparisons
5798 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5799 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005800
5801 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005802 PyErr_BadArgument();
5803 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005804 }
5805
5806 p = s;
5807 end = s + length;
5808 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005809 register Py_UNICODE ch = *p;
5810 int decimal;
5811 PyObject *repunicode;
5812 Py_ssize_t repsize;
5813 Py_ssize_t newpos;
5814 Py_UNICODE *uni2;
5815 Py_UNICODE *collstart;
5816 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005817
Benjamin Peterson29060642009-01-31 22:14:21 +00005818 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005819 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00005820 ++p;
5821 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005822 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005823 decimal = Py_UNICODE_TODECIMAL(ch);
5824 if (decimal >= 0) {
5825 *output++ = '0' + decimal;
5826 ++p;
5827 continue;
5828 }
5829 if (0 < ch && ch < 256) {
5830 *output++ = (char)ch;
5831 ++p;
5832 continue;
5833 }
5834 /* All other characters are considered unencodable */
5835 collstart = p;
5836 collend = p+1;
5837 while (collend < end) {
5838 if ((0 < *collend && *collend < 256) ||
5839 !Py_UNICODE_ISSPACE(*collend) ||
5840 Py_UNICODE_TODECIMAL(*collend))
5841 break;
5842 }
5843 /* cache callback name lookup
5844 * (if not done yet, i.e. it's the first error) */
5845 if (known_errorHandler==-1) {
5846 if ((errors==NULL) || (!strcmp(errors, "strict")))
5847 known_errorHandler = 1;
5848 else if (!strcmp(errors, "replace"))
5849 known_errorHandler = 2;
5850 else if (!strcmp(errors, "ignore"))
5851 known_errorHandler = 3;
5852 else if (!strcmp(errors, "xmlcharrefreplace"))
5853 known_errorHandler = 4;
5854 else
5855 known_errorHandler = 0;
5856 }
5857 switch (known_errorHandler) {
5858 case 1: /* strict */
5859 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5860 goto onError;
5861 case 2: /* replace */
5862 for (p = collstart; p < collend; ++p)
5863 *output++ = '?';
5864 /* fall through */
5865 case 3: /* ignore */
5866 p = collend;
5867 break;
5868 case 4: /* xmlcharrefreplace */
5869 /* generate replacement (temporarily (mis)uses p) */
5870 for (p = collstart; p < collend; ++p)
5871 output += sprintf(output, "&#%d;", (int)*p);
5872 p = collend;
5873 break;
5874 default:
5875 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5876 encoding, reason, s, length, &exc,
5877 collstart-s, collend-s, &newpos);
5878 if (repunicode == NULL)
5879 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005880 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00005881 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005882 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
5883 Py_DECREF(repunicode);
5884 goto onError;
5885 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005886 /* generate replacement */
5887 repsize = PyUnicode_GET_SIZE(repunicode);
5888 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5889 Py_UNICODE ch = *uni2;
5890 if (Py_UNICODE_ISSPACE(ch))
5891 *output++ = ' ';
5892 else {
5893 decimal = Py_UNICODE_TODECIMAL(ch);
5894 if (decimal >= 0)
5895 *output++ = '0' + decimal;
5896 else if (0 < ch && ch < 256)
5897 *output++ = (char)ch;
5898 else {
5899 Py_DECREF(repunicode);
5900 raise_encode_exception(&exc, encoding,
5901 s, length, collstart-s, collend-s, reason);
5902 goto onError;
5903 }
5904 }
5905 }
5906 p = s + newpos;
5907 Py_DECREF(repunicode);
5908 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00005909 }
5910 /* 0-terminate the output string */
5911 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005912 Py_XDECREF(exc);
5913 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005914 return 0;
5915
Benjamin Peterson29060642009-01-31 22:14:21 +00005916 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005917 Py_XDECREF(exc);
5918 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005919 return -1;
5920}
5921
Guido van Rossumd57fd912000-03-10 22:53:23 +00005922/* --- Helpers ------------------------------------------------------------ */
5923
Eric Smith8c663262007-08-25 02:26:07 +00005924#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005925#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00005926
Thomas Wouters477c8d52006-05-27 19:21:47 +00005927#include "stringlib/count.h"
5928#include "stringlib/find.h"
5929#include "stringlib/partition.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00005930#include "stringlib/split.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005931
Eric Smith5807c412008-05-11 21:00:57 +00005932#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
Eric Smitha3b1ac82009-04-03 14:45:06 +00005933#define _Py_InsertThousandsGroupingLocale _PyUnicode_InsertThousandsGroupingLocale
Eric Smith5807c412008-05-11 21:00:57 +00005934#include "stringlib/localeutil.h"
5935
/* Helper macro to fix up start/end slice values against a sequence of
 * length len.
 *
 *  - end greater than len is clamped to len;
 *  - a negative end has len added once and is then clamped to 0;
 *  - a negative start has len added once and is then clamped to 0.
 *
 * start is never compared against len or end, so callers still have to cope
 * with start > end.  start and end must be modifiable lvalues.  Every
 * argument is evaluated more than once, so none may have side effects.
 *
 * The body is wrapped in do { } while (0) so the expansion is exactly one
 * statement: it can be used as the unbraced body of an if/else without the
 * inner `if`s capturing the caller's `else`. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00005950
Martin v. Löwis18e16552006-02-15 17:27:45 +00005951Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005952 PyObject *substr,
5953 Py_ssize_t start,
5954 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005955{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005956 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005957 PyUnicodeObject* str_obj;
5958 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005959
Thomas Wouters477c8d52006-05-27 19:21:47 +00005960 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5961 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00005962 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005963 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5964 if (!sub_obj) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005965 Py_DECREF(str_obj);
5966 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005967 }
Tim Petersced69f82003-09-16 20:30:58 +00005968
Antoine Pitrouf2c54842010-01-13 08:07:53 +00005969 ADJUST_INDICES(start, end, str_obj->length);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005970 result = stringlib_count(
Antoine Pitrouf2c54842010-01-13 08:07:53 +00005971 str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
5972 PY_SSIZE_T_MAX
Thomas Wouters477c8d52006-05-27 19:21:47 +00005973 );
5974
5975 Py_DECREF(sub_obj);
5976 Py_DECREF(str_obj);
5977
Guido van Rossumd57fd912000-03-10 22:53:23 +00005978 return result;
5979}
5980
Martin v. Löwis18e16552006-02-15 17:27:45 +00005981Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005982 PyObject *sub,
5983 Py_ssize_t start,
5984 Py_ssize_t end,
5985 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005986{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005987 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005988
Guido van Rossumd57fd912000-03-10 22:53:23 +00005989 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005990 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00005991 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005992 sub = PyUnicode_FromObject(sub);
5993 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005994 Py_DECREF(str);
5995 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005996 }
Tim Petersced69f82003-09-16 20:30:58 +00005997
Thomas Wouters477c8d52006-05-27 19:21:47 +00005998 if (direction > 0)
5999 result = stringlib_find_slice(
6000 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6001 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6002 start, end
6003 );
6004 else
6005 result = stringlib_rfind_slice(
6006 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6007 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6008 start, end
6009 );
6010
Guido van Rossumd57fd912000-03-10 22:53:23 +00006011 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006012 Py_DECREF(sub);
6013
Guido van Rossumd57fd912000-03-10 22:53:23 +00006014 return result;
6015}
6016
Tim Petersced69f82003-09-16 20:30:58 +00006017static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006018int tailmatch(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006019 PyUnicodeObject *substring,
6020 Py_ssize_t start,
6021 Py_ssize_t end,
6022 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006023{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006024 if (substring->length == 0)
6025 return 1;
6026
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006027 ADJUST_INDICES(start, end, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006028 end -= substring->length;
6029 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00006030 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006031
6032 if (direction > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006033 if (Py_UNICODE_MATCH(self, end, substring))
6034 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006035 } else {
6036 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00006037 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006038 }
6039
6040 return 0;
6041}
6042
Martin v. Löwis18e16552006-02-15 17:27:45 +00006043Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006044 PyObject *substr,
6045 Py_ssize_t start,
6046 Py_ssize_t end,
6047 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006048{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006049 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006050
Guido van Rossumd57fd912000-03-10 22:53:23 +00006051 str = PyUnicode_FromObject(str);
6052 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006053 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006054 substr = PyUnicode_FromObject(substr);
6055 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006056 Py_DECREF(str);
6057 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006058 }
Tim Petersced69f82003-09-16 20:30:58 +00006059
Guido van Rossumd57fd912000-03-10 22:53:23 +00006060 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006061 (PyUnicodeObject *)substr,
6062 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006063 Py_DECREF(str);
6064 Py_DECREF(substr);
6065 return result;
6066}
6067
Guido van Rossumd57fd912000-03-10 22:53:23 +00006068/* Apply fixfct filter to the Unicode object self and return a
6069 reference to the modified object */
6070
Tim Petersced69f82003-09-16 20:30:58 +00006071static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006072PyObject *fixup(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006073 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006074{
6075
6076 PyUnicodeObject *u;
6077
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006078 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006079 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006080 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006081
6082 Py_UNICODE_COPY(u->str, self->str, self->length);
6083
Tim Peters7a29bd52001-09-12 03:03:31 +00006084 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006085 /* fixfct should return TRUE if it modified the buffer. If
6086 FALSE, return a reference to the original buffer instead
6087 (to save space, not time) */
6088 Py_INCREF(self);
6089 Py_DECREF(u);
6090 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006091 }
6092 return (PyObject*) u;
6093}
6094
Tim Petersced69f82003-09-16 20:30:58 +00006095static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006096int fixupper(PyUnicodeObject *self)
6097{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006098 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006099 Py_UNICODE *s = self->str;
6100 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006101
Guido van Rossumd57fd912000-03-10 22:53:23 +00006102 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006103 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006104
Benjamin Peterson29060642009-01-31 22:14:21 +00006105 ch = Py_UNICODE_TOUPPER(*s);
6106 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006107 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006108 *s = ch;
6109 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006110 s++;
6111 }
6112
6113 return status;
6114}
6115
Tim Petersced69f82003-09-16 20:30:58 +00006116static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006117int fixlower(PyUnicodeObject *self)
6118{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006119 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006120 Py_UNICODE *s = self->str;
6121 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006122
Guido van Rossumd57fd912000-03-10 22:53:23 +00006123 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006124 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006125
Benjamin Peterson29060642009-01-31 22:14:21 +00006126 ch = Py_UNICODE_TOLOWER(*s);
6127 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006128 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006129 *s = ch;
6130 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006131 s++;
6132 }
6133
6134 return status;
6135}
6136
Tim Petersced69f82003-09-16 20:30:58 +00006137static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006138int fixswapcase(PyUnicodeObject *self)
6139{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006140 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006141 Py_UNICODE *s = self->str;
6142 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006143
Guido van Rossumd57fd912000-03-10 22:53:23 +00006144 while (len-- > 0) {
6145 if (Py_UNICODE_ISUPPER(*s)) {
6146 *s = Py_UNICODE_TOLOWER(*s);
6147 status = 1;
6148 } else if (Py_UNICODE_ISLOWER(*s)) {
6149 *s = Py_UNICODE_TOUPPER(*s);
6150 status = 1;
6151 }
6152 s++;
6153 }
6154
6155 return status;
6156}
6157
Tim Petersced69f82003-09-16 20:30:58 +00006158static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006159int fixcapitalize(PyUnicodeObject *self)
6160{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006161 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006162 Py_UNICODE *s = self->str;
6163 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006164
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006165 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006166 return 0;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006167 if (Py_UNICODE_ISLOWER(*s)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006168 *s = Py_UNICODE_TOUPPER(*s);
6169 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006170 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006171 s++;
6172 while (--len > 0) {
6173 if (Py_UNICODE_ISUPPER(*s)) {
6174 *s = Py_UNICODE_TOLOWER(*s);
6175 status = 1;
6176 }
6177 s++;
6178 }
6179 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006180}
6181
6182static
6183int fixtitle(PyUnicodeObject *self)
6184{
6185 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6186 register Py_UNICODE *e;
6187 int previous_is_cased;
6188
6189 /* Shortcut for single character strings */
6190 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006191 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
6192 if (*p != ch) {
6193 *p = ch;
6194 return 1;
6195 }
6196 else
6197 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006198 }
Tim Petersced69f82003-09-16 20:30:58 +00006199
Guido van Rossumd57fd912000-03-10 22:53:23 +00006200 e = p + PyUnicode_GET_SIZE(self);
6201 previous_is_cased = 0;
6202 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006203 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006204
Benjamin Peterson29060642009-01-31 22:14:21 +00006205 if (previous_is_cased)
6206 *p = Py_UNICODE_TOLOWER(ch);
6207 else
6208 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00006209
Benjamin Peterson29060642009-01-31 22:14:21 +00006210 if (Py_UNICODE_ISLOWER(ch) ||
6211 Py_UNICODE_ISUPPER(ch) ||
6212 Py_UNICODE_ISTITLE(ch))
6213 previous_is_cased = 1;
6214 else
6215 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006216 }
6217 return 1;
6218}
6219
Tim Peters8ce9f162004-08-27 01:49:32 +00006220PyObject *
6221PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006222{
Skip Montanaro6543b452004-09-16 03:28:13 +00006223 const Py_UNICODE blank = ' ';
6224 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006225 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006226 PyUnicodeObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00006227 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
6228 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006229 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
6230 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00006231 PyObject *item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006232 Py_ssize_t sz, i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006233
Tim Peters05eba1f2004-08-27 21:32:02 +00006234 fseq = PySequence_Fast(seq, "");
6235 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006236 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00006237 }
6238
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006239 /* NOTE: the following code can't call back into Python code,
6240 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00006241 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006242
Tim Peters05eba1f2004-08-27 21:32:02 +00006243 seqlen = PySequence_Fast_GET_SIZE(fseq);
6244 /* If empty sequence, return u"". */
6245 if (seqlen == 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006246 res = _PyUnicode_New(0); /* empty sequence; return u"" */
6247 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00006248 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006249 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00006250 /* If singleton sequence with an exact Unicode, return that. */
6251 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006252 item = items[0];
6253 if (PyUnicode_CheckExact(item)) {
6254 Py_INCREF(item);
6255 res = (PyUnicodeObject *)item;
6256 goto Done;
6257 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006258 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006259 else {
6260 /* Set up sep and seplen */
6261 if (separator == NULL) {
6262 sep = &blank;
6263 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006264 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006265 else {
6266 if (!PyUnicode_Check(separator)) {
6267 PyErr_Format(PyExc_TypeError,
6268 "separator: expected str instance,"
6269 " %.80s found",
6270 Py_TYPE(separator)->tp_name);
6271 goto onError;
6272 }
6273 sep = PyUnicode_AS_UNICODE(separator);
6274 seplen = PyUnicode_GET_SIZE(separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00006275 }
6276 }
6277
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006278 /* There are at least two things to join, or else we have a subclass
6279 * of str in the sequence.
6280 * Do a pre-pass to figure out the total amount of space we'll
6281 * need (sz), and see whether all argument are strings.
6282 */
6283 sz = 0;
6284 for (i = 0; i < seqlen; i++) {
6285 const Py_ssize_t old_sz = sz;
6286 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00006287 if (!PyUnicode_Check(item)) {
6288 PyErr_Format(PyExc_TypeError,
6289 "sequence item %zd: expected str instance,"
6290 " %.80s found",
6291 i, Py_TYPE(item)->tp_name);
6292 goto onError;
6293 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006294 sz += PyUnicode_GET_SIZE(item);
6295 if (i != 0)
6296 sz += seplen;
6297 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
6298 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006299 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006300 goto onError;
6301 }
6302 }
Tim Petersced69f82003-09-16 20:30:58 +00006303
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006304 res = _PyUnicode_New(sz);
6305 if (res == NULL)
6306 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00006307
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006308 /* Catenate everything. */
6309 res_p = PyUnicode_AS_UNICODE(res);
6310 for (i = 0; i < seqlen; ++i) {
6311 Py_ssize_t itemlen;
6312 item = items[i];
6313 itemlen = PyUnicode_GET_SIZE(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00006314 /* Copy item, and maybe the separator. */
6315 if (i) {
6316 Py_UNICODE_COPY(res_p, sep, seplen);
6317 res_p += seplen;
6318 }
6319 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
6320 res_p += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00006321 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006322
Benjamin Peterson29060642009-01-31 22:14:21 +00006323 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00006324 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006325 return (PyObject *)res;
6326
Benjamin Peterson29060642009-01-31 22:14:21 +00006327 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00006328 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00006329 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006330 return NULL;
6331}
6332
Tim Petersced69f82003-09-16 20:30:58 +00006333static
6334PyUnicodeObject *pad(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006335 Py_ssize_t left,
6336 Py_ssize_t right,
6337 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006338{
6339 PyUnicodeObject *u;
6340
6341 if (left < 0)
6342 left = 0;
6343 if (right < 0)
6344 right = 0;
6345
Tim Peters7a29bd52001-09-12 03:03:31 +00006346 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006347 Py_INCREF(self);
6348 return self;
6349 }
6350
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006351 if (left > PY_SSIZE_T_MAX - self->length ||
6352 right > PY_SSIZE_T_MAX - (left + self->length)) {
6353 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
6354 return NULL;
6355 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006356 u = _PyUnicode_New(left + self->length + right);
6357 if (u) {
6358 if (left)
6359 Py_UNICODE_FILL(u->str, fill, left);
6360 Py_UNICODE_COPY(u->str + left, self->str, self->length);
6361 if (right)
6362 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
6363 }
6364
6365 return u;
6366}
6367
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006368PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006369{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006370 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006371
6372 string = PyUnicode_FromObject(string);
6373 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006374 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006375
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006376 list = stringlib_splitlines(
6377 (PyObject*) string, PyUnicode_AS_UNICODE(string),
6378 PyUnicode_GET_SIZE(string), keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006379
6380 Py_DECREF(string);
6381 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006382}
6383
Tim Petersced69f82003-09-16 20:30:58 +00006384static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006385PyObject *split(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006386 PyUnicodeObject *substring,
6387 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006388{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006389 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006390 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006391
Guido van Rossumd57fd912000-03-10 22:53:23 +00006392 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006393 return stringlib_split_whitespace(
6394 (PyObject*) self, self->str, self->length, maxcount
6395 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006396
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006397 return stringlib_split(
6398 (PyObject*) self, self->str, self->length,
6399 substring->str, substring->length,
6400 maxcount
6401 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006402}
6403
Tim Petersced69f82003-09-16 20:30:58 +00006404static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006405PyObject *rsplit(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006406 PyUnicodeObject *substring,
6407 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006408{
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006409 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006410 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006411
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006412 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006413 return stringlib_rsplit_whitespace(
6414 (PyObject*) self, self->str, self->length, maxcount
6415 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006416
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006417 return stringlib_rsplit(
6418 (PyObject*) self, self->str, self->length,
6419 substring->str, substring->length,
6420 maxcount
6421 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006422}
6423
6424static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006425PyObject *replace(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006426 PyUnicodeObject *str1,
6427 PyUnicodeObject *str2,
6428 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006429{
6430 PyUnicodeObject *u;
6431
6432 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006433 maxcount = PY_SSIZE_T_MAX;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006434 else if (maxcount == 0 || self->length == 0)
6435 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006436
Thomas Wouters477c8d52006-05-27 19:21:47 +00006437 if (str1->length == str2->length) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00006438 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006439 /* same length */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006440 if (str1->length == 0)
6441 goto nothing;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006442 if (str1->length == 1) {
6443 /* replace characters */
6444 Py_UNICODE u1, u2;
6445 if (!findchar(self->str, self->length, str1->str[0]))
6446 goto nothing;
6447 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6448 if (!u)
6449 return NULL;
6450 Py_UNICODE_COPY(u->str, self->str, self->length);
6451 u1 = str1->str[0];
6452 u2 = str2->str[0];
6453 for (i = 0; i < u->length; i++)
6454 if (u->str[i] == u1) {
6455 if (--maxcount < 0)
6456 break;
6457 u->str[i] = u2;
6458 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006459 } else {
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006460 i = stringlib_find(
6461 self->str, self->length, str1->str, str1->length, 0
Guido van Rossumd57fd912000-03-10 22:53:23 +00006462 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00006463 if (i < 0)
6464 goto nothing;
6465 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6466 if (!u)
6467 return NULL;
6468 Py_UNICODE_COPY(u->str, self->str, self->length);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006469
6470 /* change everything in-place, starting with this one */
6471 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6472 i += str1->length;
6473
6474 while ( --maxcount > 0) {
6475 i = stringlib_find(self->str+i, self->length-i,
6476 str1->str, str1->length,
6477 i);
6478 if (i == -1)
6479 break;
6480 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6481 i += str1->length;
6482 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006483 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006484 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006485
6486 Py_ssize_t n, i, j, e;
6487 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006488 Py_UNICODE *p;
6489
6490 /* replace strings */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006491 n = stringlib_count(self->str, self->length, str1->str, str1->length,
6492 maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006493 if (n == 0)
6494 goto nothing;
6495 /* new_size = self->length + n * (str2->length - str1->length)); */
6496 delta = (str2->length - str1->length);
6497 if (delta == 0) {
6498 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006499 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006500 product = n * (str2->length - str1->length);
6501 if ((product / (str2->length - str1->length)) != n) {
6502 PyErr_SetString(PyExc_OverflowError,
6503 "replace string is too long");
6504 return NULL;
6505 }
6506 new_size = self->length + product;
6507 if (new_size < 0) {
6508 PyErr_SetString(PyExc_OverflowError,
6509 "replace string is too long");
6510 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006511 }
6512 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006513 u = _PyUnicode_New(new_size);
6514 if (!u)
6515 return NULL;
6516 i = 0;
6517 p = u->str;
6518 e = self->length - str1->length;
6519 if (str1->length > 0) {
6520 while (n-- > 0) {
6521 /* look for next match */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006522 j = stringlib_find(self->str+i, self->length-i,
6523 str1->str, str1->length,
6524 i);
6525 if (j == -1)
6526 break;
6527 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006528 /* copy unchanged part [i:j] */
6529 Py_UNICODE_COPY(p, self->str+i, j-i);
6530 p += j - i;
6531 }
6532 /* copy substitution string */
6533 if (str2->length > 0) {
6534 Py_UNICODE_COPY(p, str2->str, str2->length);
6535 p += str2->length;
6536 }
6537 i = j + str1->length;
6538 }
6539 if (i < self->length)
6540 /* copy tail [i:] */
6541 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6542 } else {
6543 /* interleave */
6544 while (n > 0) {
6545 Py_UNICODE_COPY(p, str2->str, str2->length);
6546 p += str2->length;
6547 if (--n <= 0)
6548 break;
6549 *p++ = self->str[i++];
6550 }
6551 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6552 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006553 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006554 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006555
Benjamin Peterson29060642009-01-31 22:14:21 +00006556 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00006557 /* nothing to replace; return original string (when possible) */
6558 if (PyUnicode_CheckExact(self)) {
6559 Py_INCREF(self);
6560 return (PyObject *) self;
6561 }
6562 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006563}
6564
6565/* --- Unicode Object Methods --------------------------------------------- */
6566
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006567PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006568 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006569\n\
6570Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006571characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006572
6573static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006574unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006575{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006576 return fixup(self, fixtitle);
6577}
6578
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006579PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006580 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006581\n\
6582Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006583have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006584
6585static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006586unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006587{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006588 return fixup(self, fixcapitalize);
6589}
6590
#if 0
/* Disabled: this whole section is compiled out. */
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Split self on whitespace, capitalize each word via fixup(), and join the
   words again with the default separator of PyUnicode_Join().  Returns NULL
   if splitting or capitalizing fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping it into the list in place */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *capitalized;

        capitalized = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                            fixcapitalize);
        if (capitalized == NULL)
            goto done;
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

  done:
    Py_DECREF(words);
    return result;
}
#endif
6628
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006629/* Argument converter. Coerces to a single unicode character */
6630
6631static int
6632convert_uc(PyObject *obj, void *addr)
6633{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006634 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6635 PyObject *uniobj;
6636 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006637
Benjamin Peterson14339b62009-01-31 16:36:08 +00006638 uniobj = PyUnicode_FromObject(obj);
6639 if (uniobj == NULL) {
6640 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006641 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006642 return 0;
6643 }
6644 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6645 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006646 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006647 Py_DECREF(uniobj);
6648 return 0;
6649 }
6650 unistr = PyUnicode_AS_UNICODE(uniobj);
6651 *fillcharloc = unistr[0];
6652 Py_DECREF(uniobj);
6653 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006654}
6655
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006656PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006657 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006658\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00006659Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006660done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006661
6662static PyObject *
6663unicode_center(PyUnicodeObject *self, PyObject *args)
6664{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006665 Py_ssize_t marg, left;
6666 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006667 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006668
Thomas Woutersde017742006-02-16 19:34:37 +00006669 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006670 return NULL;
6671
Tim Peters7a29bd52001-09-12 03:03:31 +00006672 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006673 Py_INCREF(self);
6674 return (PyObject*) self;
6675 }
6676
6677 marg = width - self->length;
6678 left = marg / 2 + (marg & width & 1);
6679
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006680 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006681}
6682
Marc-André Lemburge5034372000-08-08 08:04:29 +00006683#if 0
6684
6685/* This code should go into some future Unicode collation support
6686 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00006687 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006688
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006689/* speedy UTF-16 code point order comparison */
6690/* gleaned from: */
6691/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6692
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006693static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006694{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006695 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006696 0, 0, 0, 0, 0, 0, 0, 0,
6697 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006698 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006699};
6700
Guido van Rossumd57fd912000-03-10 22:53:23 +00006701static int
6702unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6703{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006704 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006705
Guido van Rossumd57fd912000-03-10 22:53:23 +00006706 Py_UNICODE *s1 = str1->str;
6707 Py_UNICODE *s2 = str2->str;
6708
6709 len1 = str1->length;
6710 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006711
Guido van Rossumd57fd912000-03-10 22:53:23 +00006712 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006713 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006714
6715 c1 = *s1++;
6716 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006717
Benjamin Peterson29060642009-01-31 22:14:21 +00006718 if (c1 > (1<<11) * 26)
6719 c1 += utf16Fixup[c1>>11];
6720 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006721 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006722 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006723
6724 if (c1 != c2)
6725 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006726
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006727 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006728 }
6729
6730 return (len1 < len2) ? -1 : (len1 != len2);
6731}
6732
Marc-André Lemburge5034372000-08-08 08:04:29 +00006733#else
6734
6735static int
6736unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6737{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006738 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006739
6740 Py_UNICODE *s1 = str1->str;
6741 Py_UNICODE *s2 = str2->str;
6742
6743 len1 = str1->length;
6744 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006745
Marc-André Lemburge5034372000-08-08 08:04:29 +00006746 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006747 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006748
Fredrik Lundh45714e92001-06-26 16:39:36 +00006749 c1 = *s1++;
6750 c2 = *s2++;
6751
6752 if (c1 != c2)
6753 return (c1 < c2) ? -1 : 1;
6754
Marc-André Lemburge5034372000-08-08 08:04:29 +00006755 len1--; len2--;
6756 }
6757
6758 return (len1 < len2) ? -1 : (len1 != len2);
6759}
6760
6761#endif
6762
Guido van Rossumd57fd912000-03-10 22:53:23 +00006763int PyUnicode_Compare(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00006764 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006765{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006766 if (PyUnicode_Check(left) && PyUnicode_Check(right))
6767 return unicode_compare((PyUnicodeObject *)left,
6768 (PyUnicodeObject *)right);
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006769 PyErr_Format(PyExc_TypeError,
6770 "Can't compare %.100s and %.100s",
6771 left->ob_type->tp_name,
6772 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006773 return -1;
6774}
6775
Martin v. Löwis5b222132007-06-10 09:51:05 +00006776int
6777PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
6778{
6779 int i;
6780 Py_UNICODE *id;
6781 assert(PyUnicode_Check(uni));
6782 id = PyUnicode_AS_UNICODE(uni);
6783 /* Compare Unicode string and source character set string */
6784 for (i = 0; id[i] && str[i]; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00006785 if (id[i] != str[i])
6786 return ((int)id[i] < (int)str[i]) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00006787 /* This check keeps Python strings that end in '\0' from comparing equal
6788 to C strings identical up to that point. */
Benjamin Petersona23831f2010-04-25 21:54:00 +00006789 if (PyUnicode_GET_SIZE(uni) != i || id[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00006790 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00006791 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00006792 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00006793 return 0;
6794}
6795
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006796
Benjamin Peterson29060642009-01-31 22:14:21 +00006797#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00006798 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006799
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006800PyObject *PyUnicode_RichCompare(PyObject *left,
6801 PyObject *right,
6802 int op)
6803{
6804 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006805
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006806 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
6807 PyObject *v;
6808 if (((PyUnicodeObject *) left)->length !=
6809 ((PyUnicodeObject *) right)->length) {
6810 if (op == Py_EQ) {
6811 Py_INCREF(Py_False);
6812 return Py_False;
6813 }
6814 if (op == Py_NE) {
6815 Py_INCREF(Py_True);
6816 return Py_True;
6817 }
6818 }
6819 if (left == right)
6820 result = 0;
6821 else
6822 result = unicode_compare((PyUnicodeObject *)left,
6823 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006824
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006825 /* Convert the return value to a Boolean */
6826 switch (op) {
6827 case Py_EQ:
6828 v = TEST_COND(result == 0);
6829 break;
6830 case Py_NE:
6831 v = TEST_COND(result != 0);
6832 break;
6833 case Py_LE:
6834 v = TEST_COND(result <= 0);
6835 break;
6836 case Py_GE:
6837 v = TEST_COND(result >= 0);
6838 break;
6839 case Py_LT:
6840 v = TEST_COND(result == -1);
6841 break;
6842 case Py_GT:
6843 v = TEST_COND(result == 1);
6844 break;
6845 default:
6846 PyErr_BadArgument();
6847 return NULL;
6848 }
6849 Py_INCREF(v);
6850 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006851 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006852
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006853 Py_INCREF(Py_NotImplemented);
6854 return Py_NotImplemented;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006855}
6856
Guido van Rossum403d68b2000-03-13 15:55:09 +00006857int PyUnicode_Contains(PyObject *container,
Benjamin Peterson29060642009-01-31 22:14:21 +00006858 PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00006859{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006860 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006861 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006862
6863 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006864 sub = PyUnicode_FromObject(element);
6865 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006866 PyErr_Format(PyExc_TypeError,
6867 "'in <string>' requires string as left operand, not %s",
6868 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006869 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006870 }
6871
Thomas Wouters477c8d52006-05-27 19:21:47 +00006872 str = PyUnicode_FromObject(container);
6873 if (!str) {
6874 Py_DECREF(sub);
6875 return -1;
6876 }
6877
6878 result = stringlib_contains_obj(str, sub);
6879
6880 Py_DECREF(str);
6881 Py_DECREF(sub);
6882
Guido van Rossum403d68b2000-03-13 15:55:09 +00006883 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006884}
6885
Guido van Rossumd57fd912000-03-10 22:53:23 +00006886/* Concat to string or Unicode object giving a new Unicode object. */
6887
6888PyObject *PyUnicode_Concat(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00006889 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006890{
6891 PyUnicodeObject *u = NULL, *v = NULL, *w;
6892
6893 /* Coerce the two arguments */
6894 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6895 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006896 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006897 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6898 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006899 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006900
6901 /* Shortcuts */
6902 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006903 Py_DECREF(v);
6904 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006905 }
6906 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006907 Py_DECREF(u);
6908 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006909 }
6910
6911 /* Concat the two Unicode strings */
6912 w = _PyUnicode_New(u->length + v->length);
6913 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006914 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006915 Py_UNICODE_COPY(w->str, u->str, u->length);
6916 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6917
6918 Py_DECREF(u);
6919 Py_DECREF(v);
6920 return (PyObject *)w;
6921
Benjamin Peterson29060642009-01-31 22:14:21 +00006922 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006923 Py_XDECREF(u);
6924 Py_XDECREF(v);
6925 return NULL;
6926}
6927
Walter Dörwald1ab83302007-05-18 17:15:44 +00006928void
6929PyUnicode_Append(PyObject **pleft, PyObject *right)
6930{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006931 PyObject *new;
6932 if (*pleft == NULL)
6933 return;
6934 if (right == NULL || !PyUnicode_Check(*pleft)) {
6935 Py_DECREF(*pleft);
6936 *pleft = NULL;
6937 return;
6938 }
6939 new = PyUnicode_Concat(*pleft, right);
6940 Py_DECREF(*pleft);
6941 *pleft = new;
Walter Dörwald1ab83302007-05-18 17:15:44 +00006942}
6943
6944void
6945PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
6946{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006947 PyUnicode_Append(pleft, right);
6948 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00006949}
6950
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006951PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006952 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006953\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00006954Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00006955string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006956interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006957
6958static PyObject *
6959unicode_count(PyUnicodeObject *self, PyObject *args)
6960{
6961 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006962 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006963 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006964 PyObject *result;
6965
Guido van Rossumb8872e62000-05-09 14:14:27 +00006966 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
Benjamin Peterson29060642009-01-31 22:14:21 +00006967 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006968 return NULL;
6969
6970 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006971 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006972 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006973 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006974
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006975 ADJUST_INDICES(start, end, self->length);
Christian Heimes217cfd12007-12-02 14:31:20 +00006976 result = PyLong_FromSsize_t(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006977 stringlib_count(self->str + start, end - start,
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006978 substring->str, substring->length,
6979 PY_SSIZE_T_MAX)
Thomas Wouters477c8d52006-05-27 19:21:47 +00006980 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006981
6982 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006983
Guido van Rossumd57fd912000-03-10 22:53:23 +00006984 return result;
6985}
6986
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006987PyDoc_STRVAR(encode__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006988 "S.encode([encoding[, errors]]) -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006989\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00006990Encode S using the codec registered for encoding. encoding defaults\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006991to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006992handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006993a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6994'xmlcharrefreplace' as well as any other name registered with\n\
6995codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006996
6997static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +00006998unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006999{
Benjamin Peterson308d6372009-09-18 21:42:35 +00007000 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00007001 char *encoding = NULL;
7002 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007003 PyObject *v;
Guido van Rossum35d94282007-08-27 18:20:11 +00007004
Benjamin Peterson308d6372009-09-18 21:42:35 +00007005 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
7006 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007007 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00007008 v = PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007009 if (v == NULL)
7010 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00007011 if (!PyBytes_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007012 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00007013 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007014 "(type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00007015 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007016 Py_DECREF(v);
7017 return NULL;
7018 }
7019 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007020
Benjamin Peterson29060642009-01-31 22:14:21 +00007021 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007022 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007023}
7024
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007025PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007026 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007027\n\
7028Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007029If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007030
7031static PyObject*
7032unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
7033{
7034 Py_UNICODE *e;
7035 Py_UNICODE *p;
7036 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007037 Py_UNICODE *qe;
7038 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007039 PyUnicodeObject *u;
7040 int tabsize = 8;
7041
7042 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007043 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007044
Thomas Wouters7e474022000-07-16 12:04:32 +00007045 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007046 i = 0; /* chars up to and including most recent \n or \r */
7047 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
7048 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007049 for (p = self->str; p < e; p++)
7050 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007051 if (tabsize > 0) {
7052 incr = tabsize - (j % tabsize); /* cannot overflow */
7053 if (j > PY_SSIZE_T_MAX - incr)
7054 goto overflow1;
7055 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007056 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007057 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007058 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007059 if (j > PY_SSIZE_T_MAX - 1)
7060 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007061 j++;
7062 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007063 if (i > PY_SSIZE_T_MAX - j)
7064 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007065 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007066 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007067 }
7068 }
7069
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007070 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +00007071 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00007072
Guido van Rossumd57fd912000-03-10 22:53:23 +00007073 /* Second pass: create output string and fill it */
7074 u = _PyUnicode_New(i + j);
7075 if (!u)
7076 return NULL;
7077
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007078 j = 0; /* same as in first pass */
7079 q = u->str; /* next output char */
7080 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007081
7082 for (p = self->str; p < e; p++)
7083 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007084 if (tabsize > 0) {
7085 i = tabsize - (j % tabsize);
7086 j += i;
7087 while (i--) {
7088 if (q >= qe)
7089 goto overflow2;
7090 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007091 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007092 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007093 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007094 else {
7095 if (q >= qe)
7096 goto overflow2;
7097 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007098 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007099 if (*p == '\n' || *p == '\r')
7100 j = 0;
7101 }
7102
7103 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007104
7105 overflow2:
7106 Py_DECREF(u);
7107 overflow1:
7108 PyErr_SetString(PyExc_OverflowError, "new string is too long");
7109 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007110}
7111
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007112PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007113 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007114\n\
7115Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007116such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007117arguments start and end are interpreted as in slice notation.\n\
7118\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007119Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007120
7121static PyObject *
7122unicode_find(PyUnicodeObject *self, PyObject *args)
7123{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007124 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007125 Py_ssize_t start;
7126 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007127 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007128
Christian Heimes9cd17752007-11-18 19:35:23 +00007129 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007130 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007131
Thomas Wouters477c8d52006-05-27 19:21:47 +00007132 result = stringlib_find_slice(
7133 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7134 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7135 start, end
7136 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007137
7138 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007139
Christian Heimes217cfd12007-12-02 14:31:20 +00007140 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007141}
7142
7143static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007144unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007145{
7146 if (index < 0 || index >= self->length) {
7147 PyErr_SetString(PyExc_IndexError, "string index out of range");
7148 return NULL;
7149 }
7150
7151 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
7152}
7153
Guido van Rossumc2504932007-09-18 19:42:40 +00007154/* Believe it or not, this produces the same value for ASCII strings
7155 as string_hash(). */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007156static long
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00007157unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007158{
Guido van Rossumc2504932007-09-18 19:42:40 +00007159 Py_ssize_t len;
7160 Py_UNICODE *p;
7161 long x;
7162
7163 if (self->hash != -1)
7164 return self->hash;
Christian Heimes90aa7642007-12-19 02:45:37 +00007165 len = Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007166 p = self->str;
7167 x = *p << 7;
7168 while (--len >= 0)
7169 x = (1000003*x) ^ *p++;
Christian Heimes90aa7642007-12-19 02:45:37 +00007170 x ^= Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007171 if (x == -1)
7172 x = -2;
7173 self->hash = x;
7174 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007175}
7176
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007177PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007178 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007179\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007180Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007181
7182static PyObject *
7183unicode_index(PyUnicodeObject *self, PyObject *args)
7184{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007185 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007186 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007187 Py_ssize_t start;
7188 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007189
Christian Heimes9cd17752007-11-18 19:35:23 +00007190 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007191 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007192
Thomas Wouters477c8d52006-05-27 19:21:47 +00007193 result = stringlib_find_slice(
7194 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7195 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7196 start, end
7197 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007198
7199 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007200
Guido van Rossumd57fd912000-03-10 22:53:23 +00007201 if (result < 0) {
7202 PyErr_SetString(PyExc_ValueError, "substring not found");
7203 return NULL;
7204 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007205
Christian Heimes217cfd12007-12-02 14:31:20 +00007206 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007207}
7208
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007209PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007210 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007211\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007212Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007213at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007214
7215static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007216unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007217{
7218 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7219 register const Py_UNICODE *e;
7220 int cased;
7221
Guido van Rossumd57fd912000-03-10 22:53:23 +00007222 /* Shortcut for single character strings */
7223 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007224 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007225
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007226 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007227 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007228 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007229
Guido van Rossumd57fd912000-03-10 22:53:23 +00007230 e = p + PyUnicode_GET_SIZE(self);
7231 cased = 0;
7232 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007233 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007234
Benjamin Peterson29060642009-01-31 22:14:21 +00007235 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
7236 return PyBool_FromLong(0);
7237 else if (!cased && Py_UNICODE_ISLOWER(ch))
7238 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007239 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007240 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007241}
7242
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007243PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007244 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007245\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007246Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007247at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007248
7249static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007250unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007251{
7252 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7253 register const Py_UNICODE *e;
7254 int cased;
7255
Guido van Rossumd57fd912000-03-10 22:53:23 +00007256 /* Shortcut for single character strings */
7257 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007258 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007259
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007260 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007261 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007262 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007263
Guido van Rossumd57fd912000-03-10 22:53:23 +00007264 e = p + PyUnicode_GET_SIZE(self);
7265 cased = 0;
7266 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007267 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007268
Benjamin Peterson29060642009-01-31 22:14:21 +00007269 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
7270 return PyBool_FromLong(0);
7271 else if (!cased && Py_UNICODE_ISUPPER(ch))
7272 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007273 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007274 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007275}
7276
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007277PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007278 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007279\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007280Return True if S is a titlecased string and there is at least one\n\
7281character in S, i.e. upper- and titlecase characters may only\n\
7282follow uncased characters and lowercase characters only cased ones.\n\
7283Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007284
7285static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007286unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007287{
7288 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7289 register const Py_UNICODE *e;
7290 int cased, previous_is_cased;
7291
Guido van Rossumd57fd912000-03-10 22:53:23 +00007292 /* Shortcut for single character strings */
7293 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007294 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
7295 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007296
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007297 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007298 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007299 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007300
Guido van Rossumd57fd912000-03-10 22:53:23 +00007301 e = p + PyUnicode_GET_SIZE(self);
7302 cased = 0;
7303 previous_is_cased = 0;
7304 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007305 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007306
Benjamin Peterson29060642009-01-31 22:14:21 +00007307 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
7308 if (previous_is_cased)
7309 return PyBool_FromLong(0);
7310 previous_is_cased = 1;
7311 cased = 1;
7312 }
7313 else if (Py_UNICODE_ISLOWER(ch)) {
7314 if (!previous_is_cased)
7315 return PyBool_FromLong(0);
7316 previous_is_cased = 1;
7317 cased = 1;
7318 }
7319 else
7320 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007321 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007322 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007323}
7324
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007325PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007326 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007327\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007328Return True if all characters in S are whitespace\n\
7329and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007330
7331static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007332unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007333{
7334 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7335 register const Py_UNICODE *e;
7336
Guido van Rossumd57fd912000-03-10 22:53:23 +00007337 /* Shortcut for single character strings */
7338 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007339 Py_UNICODE_ISSPACE(*p))
7340 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007341
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007342 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007343 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007344 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007345
Guido van Rossumd57fd912000-03-10 22:53:23 +00007346 e = p + PyUnicode_GET_SIZE(self);
7347 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007348 if (!Py_UNICODE_ISSPACE(*p))
7349 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007350 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007351 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007352}
7353
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007354PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007355 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007356\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007357Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007358and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007359
7360static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007361unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007362{
7363 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7364 register const Py_UNICODE *e;
7365
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007366 /* Shortcut for single character strings */
7367 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007368 Py_UNICODE_ISALPHA(*p))
7369 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007370
7371 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007372 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007373 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007374
7375 e = p + PyUnicode_GET_SIZE(self);
7376 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007377 if (!Py_UNICODE_ISALPHA(*p))
7378 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007379 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007380 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007381}
7382
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007383PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007384 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007385\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007386Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007387and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007388
7389static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007390unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007391{
7392 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7393 register const Py_UNICODE *e;
7394
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007395 /* Shortcut for single character strings */
7396 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007397 Py_UNICODE_ISALNUM(*p))
7398 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007399
7400 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007401 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007402 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007403
7404 e = p + PyUnicode_GET_SIZE(self);
7405 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007406 if (!Py_UNICODE_ISALNUM(*p))
7407 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007408 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007409 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007410}
7411
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007412PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007413 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007414\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007415Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007416False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007417
7418static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007419unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007420{
7421 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7422 register const Py_UNICODE *e;
7423
Guido van Rossumd57fd912000-03-10 22:53:23 +00007424 /* Shortcut for single character strings */
7425 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007426 Py_UNICODE_ISDECIMAL(*p))
7427 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007428
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007429 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007430 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007431 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007432
Guido van Rossumd57fd912000-03-10 22:53:23 +00007433 e = p + PyUnicode_GET_SIZE(self);
7434 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007435 if (!Py_UNICODE_ISDECIMAL(*p))
7436 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007437 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007438 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007439}
7440
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007441PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007442 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007443\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007444Return True if all characters in S are digits\n\
7445and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007446
7447static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007448unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007449{
7450 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7451 register const Py_UNICODE *e;
7452
Guido van Rossumd57fd912000-03-10 22:53:23 +00007453 /* Shortcut for single character strings */
7454 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007455 Py_UNICODE_ISDIGIT(*p))
7456 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007457
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007458 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007459 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007460 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007461
Guido van Rossumd57fd912000-03-10 22:53:23 +00007462 e = p + PyUnicode_GET_SIZE(self);
7463 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007464 if (!Py_UNICODE_ISDIGIT(*p))
7465 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007466 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007467 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007468}
7469
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007470PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007471 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007472\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007473Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007474False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007475
7476static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007477unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007478{
7479 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7480 register const Py_UNICODE *e;
7481
Guido van Rossumd57fd912000-03-10 22:53:23 +00007482 /* Shortcut for single character strings */
7483 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007484 Py_UNICODE_ISNUMERIC(*p))
7485 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007486
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007487 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007488 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007489 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007490
Guido van Rossumd57fd912000-03-10 22:53:23 +00007491 e = p + PyUnicode_GET_SIZE(self);
7492 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007493 if (!Py_UNICODE_ISNUMERIC(*p))
7494 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007495 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007496 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007497}
7498
Martin v. Löwis47383402007-08-15 07:32:56 +00007499int
7500PyUnicode_IsIdentifier(PyObject *self)
7501{
7502 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
7503 register const Py_UNICODE *e;
7504
7505 /* Special case for empty strings */
7506 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007507 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007508
7509 /* PEP 3131 says that the first character must be in
7510 XID_Start and subsequent characters in XID_Continue,
7511 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +00007512 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +00007513 letters, digits, underscore). However, given the current
7514 definition of XID_Start and XID_Continue, it is sufficient
7515 to check just for these, except that _ must be allowed
7516 as starting an identifier. */
7517 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
7518 return 0;
7519
7520 e = p + PyUnicode_GET_SIZE(self);
7521 for (p++; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007522 if (!_PyUnicode_IsXidContinue(*p))
7523 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007524 }
7525 return 1;
7526}
7527
7528PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007529 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +00007530\n\
7531Return True if S is a valid identifier according\n\
7532to the language definition.");
7533
7534static PyObject*
7535unicode_isidentifier(PyObject *self)
7536{
7537 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
7538}
7539
Georg Brandl559e5d72008-06-11 18:37:52 +00007540PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007541 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +00007542\n\
7543Return True if all characters in S are considered\n\
7544printable in repr() or S is empty, False otherwise.");
7545
7546static PyObject*
7547unicode_isprintable(PyObject *self)
7548{
7549 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7550 register const Py_UNICODE *e;
7551
7552 /* Shortcut for single character strings */
7553 if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) {
7554 Py_RETURN_TRUE;
7555 }
7556
7557 e = p + PyUnicode_GET_SIZE(self);
7558 for (; p < e; p++) {
7559 if (!Py_UNICODE_ISPRINTABLE(*p)) {
7560 Py_RETURN_FALSE;
7561 }
7562 }
7563 Py_RETURN_TRUE;
7564}
7565
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007566PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +00007567 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007568\n\
7569Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +00007570iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007571
7572static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007573unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007574{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007575 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007576}
7577
Martin v. Löwis18e16552006-02-15 17:27:45 +00007578static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007579unicode_length(PyUnicodeObject *self)
7580{
7581 return self->length;
7582}
7583
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007584PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007585 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007586\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00007587Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007588done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007589
7590static PyObject *
7591unicode_ljust(PyUnicodeObject *self, PyObject *args)
7592{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007593 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007594 Py_UNICODE fillchar = ' ';
7595
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007596 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007597 return NULL;
7598
Tim Peters7a29bd52001-09-12 03:03:31 +00007599 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007600 Py_INCREF(self);
7601 return (PyObject*) self;
7602 }
7603
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007604 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007605}
7606
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007607PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007608 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007609\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007610Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007611
7612static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007613unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007614{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007615 return fixup(self, fixlower);
7616}
7617
/* Which end(s) of the string a strip operation works on.  The values
   double as indices into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple() format strings, indexed by the strip type above */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for strip type i: the format string with its leading
   three characters ("|O:") skipped. */
#define STRIPNAME(i) (stripformat[(i)]+3)
7626
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007627/* externally visible for str.strip(unicode) */
7628PyObject *
7629_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7630{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007631 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7632 Py_ssize_t len = PyUnicode_GET_SIZE(self);
7633 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
7634 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7635 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007636
Benjamin Peterson29060642009-01-31 22:14:21 +00007637 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007638
Benjamin Peterson14339b62009-01-31 16:36:08 +00007639 i = 0;
7640 if (striptype != RIGHTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007641 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7642 i++;
7643 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007644 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007645
Benjamin Peterson14339b62009-01-31 16:36:08 +00007646 j = len;
7647 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007648 do {
7649 j--;
7650 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7651 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007652 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007653
Benjamin Peterson14339b62009-01-31 16:36:08 +00007654 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007655 Py_INCREF(self);
7656 return (PyObject*)self;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007657 }
7658 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007659 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007660}
7661
Guido van Rossumd57fd912000-03-10 22:53:23 +00007662
7663static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007664do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007665{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007666 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7667 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007668
Benjamin Peterson14339b62009-01-31 16:36:08 +00007669 i = 0;
7670 if (striptype != RIGHTSTRIP) {
7671 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7672 i++;
7673 }
7674 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007675
Benjamin Peterson14339b62009-01-31 16:36:08 +00007676 j = len;
7677 if (striptype != LEFTSTRIP) {
7678 do {
7679 j--;
7680 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7681 j++;
7682 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007683
Benjamin Peterson14339b62009-01-31 16:36:08 +00007684 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7685 Py_INCREF(self);
7686 return (PyObject*)self;
7687 }
7688 else
7689 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007690}
7691
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007692
7693static PyObject *
7694do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7695{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007696 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007697
Benjamin Peterson14339b62009-01-31 16:36:08 +00007698 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7699 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007700
Benjamin Peterson14339b62009-01-31 16:36:08 +00007701 if (sep != NULL && sep != Py_None) {
7702 if (PyUnicode_Check(sep))
7703 return _PyUnicode_XStrip(self, striptype, sep);
7704 else {
7705 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00007706 "%s arg must be None or str",
7707 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +00007708 return NULL;
7709 }
7710 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007711
Benjamin Peterson14339b62009-01-31 16:36:08 +00007712 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007713}
7714
7715
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007716PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007717 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007718\n\
7719Return a copy of the string S with leading and trailing\n\
7720whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007721If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007722
7723static PyObject *
7724unicode_strip(PyUnicodeObject *self, PyObject *args)
7725{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007726 if (PyTuple_GET_SIZE(args) == 0)
7727 return do_strip(self, BOTHSTRIP); /* Common case */
7728 else
7729 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007730}
7731
7732
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007733PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007734 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007735\n\
7736Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007737If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007738
7739static PyObject *
7740unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7741{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007742 if (PyTuple_GET_SIZE(args) == 0)
7743 return do_strip(self, LEFTSTRIP); /* Common case */
7744 else
7745 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007746}
7747
7748
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007749PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007750 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007751\n\
7752Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007753If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007754
7755static PyObject *
7756unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7757{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007758 if (PyTuple_GET_SIZE(args) == 0)
7759 return do_strip(self, RIGHTSTRIP); /* Common case */
7760 else
7761 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007762}
7763
7764
Guido van Rossumd57fd912000-03-10 22:53:23 +00007765static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007766unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007767{
7768 PyUnicodeObject *u;
7769 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007770 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007771 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007772
Georg Brandl222de0f2009-04-12 12:01:50 +00007773 if (len < 1) {
7774 Py_INCREF(unicode_empty);
7775 return (PyObject *)unicode_empty;
7776 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007777
Tim Peters7a29bd52001-09-12 03:03:31 +00007778 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007779 /* no repeat, return original string */
7780 Py_INCREF(str);
7781 return (PyObject*) str;
7782 }
Tim Peters8f422462000-09-09 06:13:41 +00007783
7784 /* ensure # of chars needed doesn't overflow int and # of bytes
7785 * needed doesn't overflow size_t
7786 */
7787 nchars = len * str->length;
Georg Brandl222de0f2009-04-12 12:01:50 +00007788 if (nchars / len != str->length) {
Tim Peters8f422462000-09-09 06:13:41 +00007789 PyErr_SetString(PyExc_OverflowError,
7790 "repeated string is too long");
7791 return NULL;
7792 }
7793 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7794 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7795 PyErr_SetString(PyExc_OverflowError,
7796 "repeated string is too long");
7797 return NULL;
7798 }
7799 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007800 if (!u)
7801 return NULL;
7802
7803 p = u->str;
7804
Georg Brandl222de0f2009-04-12 12:01:50 +00007805 if (str->length == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007806 Py_UNICODE_FILL(p, str->str[0], len);
7807 } else {
Georg Brandl222de0f2009-04-12 12:01:50 +00007808 Py_ssize_t done = str->length; /* number of characters copied this far */
7809 Py_UNICODE_COPY(p, str->str, str->length);
Benjamin Peterson29060642009-01-31 22:14:21 +00007810 while (done < nchars) {
Christian Heimescc47b052008-03-25 14:56:36 +00007811 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007812 Py_UNICODE_COPY(p+done, p, n);
7813 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00007814 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007815 }
7816
7817 return (PyObject*) u;
7818}
7819
7820PyObject *PyUnicode_Replace(PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007821 PyObject *subobj,
7822 PyObject *replobj,
7823 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007824{
7825 PyObject *self;
7826 PyObject *str1;
7827 PyObject *str2;
7828 PyObject *result;
7829
7830 self = PyUnicode_FromObject(obj);
7831 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007832 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007833 str1 = PyUnicode_FromObject(subobj);
7834 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007835 Py_DECREF(self);
7836 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007837 }
7838 str2 = PyUnicode_FromObject(replobj);
7839 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007840 Py_DECREF(self);
7841 Py_DECREF(str1);
7842 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007843 }
Tim Petersced69f82003-09-16 20:30:58 +00007844 result = replace((PyUnicodeObject *)self,
Benjamin Peterson29060642009-01-31 22:14:21 +00007845 (PyUnicodeObject *)str1,
7846 (PyUnicodeObject *)str2,
7847 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007848 Py_DECREF(self);
7849 Py_DECREF(str1);
7850 Py_DECREF(str2);
7851 return result;
7852}
7853
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007854PyDoc_STRVAR(replace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007855 "S.replace (old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007856\n\
7857Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +00007858old replaced by new. If the optional argument count is\n\
7859given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007860
7861static PyObject*
7862unicode_replace(PyUnicodeObject *self, PyObject *args)
7863{
7864 PyUnicodeObject *str1;
7865 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007866 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007867 PyObject *result;
7868
Martin v. Löwis18e16552006-02-15 17:27:45 +00007869 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007870 return NULL;
7871 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7872 if (str1 == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007873 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007874 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007875 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007876 Py_DECREF(str1);
7877 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007878 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007879
7880 result = replace(self, str1, str2, maxcount);
7881
7882 Py_DECREF(str1);
7883 Py_DECREF(str2);
7884 return result;
7885}
7886
7887static
7888PyObject *unicode_repr(PyObject *unicode)
7889{
Walter Dörwald79e913e2007-05-12 11:08:06 +00007890 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007891 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007892 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
7893 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
7894
7895 /* XXX(nnorwitz): rather than over-allocating, it would be
7896 better to choose a different scheme. Perhaps scan the
7897 first N-chars of the string and allocate based on that size.
7898 */
7899 /* Initial allocation is based on the longest-possible unichr
7900 escape.
7901
7902 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
7903 unichr, so in this case it's the longest unichr escape. In
7904 narrow (UTF-16) builds this is five chars per source unichr
7905 since there are two unichrs in the surrogate pair, so in narrow
7906 (UTF-16) builds it's not the longest unichr escape.
7907
7908 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
7909 so in the narrow (UTF-16) build case it's the longest unichr
7910 escape.
7911 */
7912
Walter Dörwald1ab83302007-05-18 17:15:44 +00007913 repr = PyUnicode_FromUnicode(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00007914 2 /* quotes */
Walter Dörwald79e913e2007-05-12 11:08:06 +00007915#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00007916 + 10*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00007917#else
Benjamin Peterson29060642009-01-31 22:14:21 +00007918 + 6*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00007919#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00007920 + 1);
Walter Dörwald79e913e2007-05-12 11:08:06 +00007921 if (repr == NULL)
7922 return NULL;
7923
Walter Dörwald1ab83302007-05-18 17:15:44 +00007924 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00007925
7926 /* Add quote */
7927 *p++ = (findchar(s, size, '\'') &&
7928 !findchar(s, size, '"')) ? '"' : '\'';
7929 while (size-- > 0) {
7930 Py_UNICODE ch = *s++;
7931
7932 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007933 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007934 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007935 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007936 continue;
7937 }
7938
Benjamin Peterson29060642009-01-31 22:14:21 +00007939 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +00007940 if (ch == '\t') {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007941 *p++ = '\\';
7942 *p++ = 't';
7943 }
7944 else if (ch == '\n') {
7945 *p++ = '\\';
7946 *p++ = 'n';
7947 }
7948 else if (ch == '\r') {
7949 *p++ = '\\';
7950 *p++ = 'r';
7951 }
7952
7953 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +00007954 else if (ch < ' ' || ch == 0x7F) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007955 *p++ = '\\';
7956 *p++ = 'x';
7957 *p++ = hexdigits[(ch >> 4) & 0x000F];
7958 *p++ = hexdigits[ch & 0x000F];
7959 }
7960
Georg Brandl559e5d72008-06-11 18:37:52 +00007961 /* Copy ASCII characters as-is */
7962 else if (ch < 0x7F) {
7963 *p++ = ch;
7964 }
7965
Benjamin Peterson29060642009-01-31 22:14:21 +00007966 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +00007967 else {
7968 Py_UCS4 ucs = ch;
7969
7970#ifndef Py_UNICODE_WIDE
7971 Py_UNICODE ch2 = 0;
7972 /* Get code point from surrogate pair */
7973 if (size > 0) {
7974 ch2 = *s;
7975 if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
Benjamin Peterson29060642009-01-31 22:14:21 +00007976 && ch2 <= 0xDFFF) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007977 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
Benjamin Peterson29060642009-01-31 22:14:21 +00007978 + 0x00010000;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007979 s++;
Georg Brandl559e5d72008-06-11 18:37:52 +00007980 size--;
7981 }
7982 }
7983#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00007984 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +00007985 (categories Z* and C* except ASCII space)
7986 */
7987 if (!Py_UNICODE_ISPRINTABLE(ucs)) {
7988 /* Map 8-bit characters to '\xhh' */
7989 if (ucs <= 0xff) {
7990 *p++ = '\\';
7991 *p++ = 'x';
7992 *p++ = hexdigits[(ch >> 4) & 0x000F];
7993 *p++ = hexdigits[ch & 0x000F];
7994 }
7995 /* Map 21-bit characters to '\U00xxxxxx' */
7996 else if (ucs >= 0x10000) {
7997 *p++ = '\\';
7998 *p++ = 'U';
7999 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
8000 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
8001 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
8002 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
8003 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
8004 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
8005 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
8006 *p++ = hexdigits[ucs & 0x0000000F];
8007 }
8008 /* Map 16-bit characters to '\uxxxx' */
8009 else {
8010 *p++ = '\\';
8011 *p++ = 'u';
8012 *p++ = hexdigits[(ucs >> 12) & 0x000F];
8013 *p++ = hexdigits[(ucs >> 8) & 0x000F];
8014 *p++ = hexdigits[(ucs >> 4) & 0x000F];
8015 *p++ = hexdigits[ucs & 0x000F];
8016 }
8017 }
8018 /* Copy characters as-is */
8019 else {
8020 *p++ = ch;
8021#ifndef Py_UNICODE_WIDE
8022 if (ucs >= 0x10000)
8023 *p++ = ch2;
8024#endif
8025 }
8026 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00008027 }
8028 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008029 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00008030
8031 *p = '\0';
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00008032 PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00008033 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008034}
8035
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008036PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008037 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008038\n\
8039Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00008040such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008041arguments start and end are interpreted as in slice notation.\n\
8042\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008043Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008044
8045static PyObject *
8046unicode_rfind(PyUnicodeObject *self, PyObject *args)
8047{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008048 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008049 Py_ssize_t start;
8050 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008051 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008052
Christian Heimes9cd17752007-11-18 19:35:23 +00008053 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008054 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008055
Thomas Wouters477c8d52006-05-27 19:21:47 +00008056 result = stringlib_rfind_slice(
8057 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8058 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8059 start, end
8060 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008061
8062 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008063
Christian Heimes217cfd12007-12-02 14:31:20 +00008064 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008065}
8066
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008067PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008068 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008069\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008070Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008071
8072static PyObject *
8073unicode_rindex(PyUnicodeObject *self, PyObject *args)
8074{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008075 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008076 Py_ssize_t start;
8077 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008078 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008079
Christian Heimes9cd17752007-11-18 19:35:23 +00008080 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008081 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008082
Thomas Wouters477c8d52006-05-27 19:21:47 +00008083 result = stringlib_rfind_slice(
8084 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8085 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8086 start, end
8087 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008088
8089 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008090
Guido van Rossumd57fd912000-03-10 22:53:23 +00008091 if (result < 0) {
8092 PyErr_SetString(PyExc_ValueError, "substring not found");
8093 return NULL;
8094 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008095 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008096}
8097
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008098PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008099 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008100\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008101Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008102done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008103
8104static PyObject *
8105unicode_rjust(PyUnicodeObject *self, PyObject *args)
8106{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008107 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008108 Py_UNICODE fillchar = ' ';
8109
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008110 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008111 return NULL;
8112
Tim Peters7a29bd52001-09-12 03:03:31 +00008113 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008114 Py_INCREF(self);
8115 return (PyObject*) self;
8116 }
8117
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008118 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008119}
8120
Guido van Rossumd57fd912000-03-10 22:53:23 +00008121PyObject *PyUnicode_Split(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008122 PyObject *sep,
8123 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008124{
8125 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008126
Guido van Rossumd57fd912000-03-10 22:53:23 +00008127 s = PyUnicode_FromObject(s);
8128 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008129 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008130 if (sep != NULL) {
8131 sep = PyUnicode_FromObject(sep);
8132 if (sep == NULL) {
8133 Py_DECREF(s);
8134 return NULL;
8135 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008136 }
8137
8138 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8139
8140 Py_DECREF(s);
8141 Py_XDECREF(sep);
8142 return result;
8143}
8144
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008145PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008146 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008147\n\
8148Return a list of the words in S, using sep as the\n\
8149delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +00008150splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +00008151whitespace string is a separator and empty strings are\n\
8152removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008153
8154static PyObject*
8155unicode_split(PyUnicodeObject *self, PyObject *args)
8156{
8157 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008158 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008159
Martin v. Löwis18e16552006-02-15 17:27:45 +00008160 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008161 return NULL;
8162
8163 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008164 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008165 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008166 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008167 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008168 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008169}
8170
Thomas Wouters477c8d52006-05-27 19:21:47 +00008171PyObject *
8172PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
8173{
8174 PyObject* str_obj;
8175 PyObject* sep_obj;
8176 PyObject* out;
8177
8178 str_obj = PyUnicode_FromObject(str_in);
8179 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008180 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008181 sep_obj = PyUnicode_FromObject(sep_in);
8182 if (!sep_obj) {
8183 Py_DECREF(str_obj);
8184 return NULL;
8185 }
8186
8187 out = stringlib_partition(
8188 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8189 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8190 );
8191
8192 Py_DECREF(sep_obj);
8193 Py_DECREF(str_obj);
8194
8195 return out;
8196}
8197
8198
8199PyObject *
8200PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
8201{
8202 PyObject* str_obj;
8203 PyObject* sep_obj;
8204 PyObject* out;
8205
8206 str_obj = PyUnicode_FromObject(str_in);
8207 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008208 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008209 sep_obj = PyUnicode_FromObject(sep_in);
8210 if (!sep_obj) {
8211 Py_DECREF(str_obj);
8212 return NULL;
8213 }
8214
8215 out = stringlib_rpartition(
8216 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8217 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8218 );
8219
8220 Py_DECREF(sep_obj);
8221 Py_DECREF(str_obj);
8222
8223 return out;
8224}
8225
8226PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008227 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008228\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008229Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008230the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008231found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008232
8233static PyObject*
8234unicode_partition(PyUnicodeObject *self, PyObject *separator)
8235{
8236 return PyUnicode_Partition((PyObject *)self, separator);
8237}
8238
8239PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +00008240 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008241\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008242Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008243the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008244separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008245
8246static PyObject*
8247unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
8248{
8249 return PyUnicode_RPartition((PyObject *)self, separator);
8250}
8251
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008252PyObject *PyUnicode_RSplit(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008253 PyObject *sep,
8254 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008255{
8256 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008257
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008258 s = PyUnicode_FromObject(s);
8259 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008260 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008261 if (sep != NULL) {
8262 sep = PyUnicode_FromObject(sep);
8263 if (sep == NULL) {
8264 Py_DECREF(s);
8265 return NULL;
8266 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008267 }
8268
8269 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8270
8271 Py_DECREF(s);
8272 Py_XDECREF(sep);
8273 return result;
8274}
8275
8276PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008277 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008278\n\
8279Return a list of the words in S, using sep as the\n\
8280delimiter string, starting at the end of the string and\n\
8281working to the front. If maxsplit is given, at most maxsplit\n\
8282splits are done. If sep is not specified, any whitespace string\n\
8283is a separator.");
8284
8285static PyObject*
8286unicode_rsplit(PyUnicodeObject *self, PyObject *args)
8287{
8288 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008289 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008290
Martin v. Löwis18e16552006-02-15 17:27:45 +00008291 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008292 return NULL;
8293
8294 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008295 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008296 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008297 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008298 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008299 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008300}
8301
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008302PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008303 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008304\n\
8305Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00008306Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008307is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008308
8309static PyObject*
8310unicode_splitlines(PyUnicodeObject *self, PyObject *args)
8311{
Guido van Rossum86662912000-04-11 15:38:46 +00008312 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008313
Guido van Rossum86662912000-04-11 15:38:46 +00008314 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008315 return NULL;
8316
Guido van Rossum86662912000-04-11 15:38:46 +00008317 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008318}
8319
8320static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00008321PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008322{
Walter Dörwald346737f2007-05-31 10:44:43 +00008323 if (PyUnicode_CheckExact(self)) {
8324 Py_INCREF(self);
8325 return self;
8326 } else
8327 /* Subtype -- return genuine unicode string with the same value. */
8328 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
8329 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008330}
8331
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008332PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008333 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008334\n\
8335Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008336and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008337
8338static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008339unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008340{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008341 return fixup(self, fixswapcase);
8342}
8343
Georg Brandlceee0772007-11-27 23:48:05 +00008344PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008345 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008346\n\
8347Return a translation table usable for str.translate().\n\
8348If there is only one argument, it must be a dictionary mapping Unicode\n\
8349ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008350Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008351If there are two arguments, they must be strings of equal length, and\n\
8352in the resulting dictionary, each character in x will be mapped to the\n\
8353character at the same position in y. If there is a third argument, it\n\
8354must be a string, whose characters will be mapped to None in the result.");
8355
8356static PyObject*
8357unicode_maketrans(PyUnicodeObject *null, PyObject *args)
8358{
8359 PyObject *x, *y = NULL, *z = NULL;
8360 PyObject *new = NULL, *key, *value;
8361 Py_ssize_t i = 0;
8362 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008363
Georg Brandlceee0772007-11-27 23:48:05 +00008364 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
8365 return NULL;
8366 new = PyDict_New();
8367 if (!new)
8368 return NULL;
8369 if (y != NULL) {
8370 /* x must be a string too, of equal length */
8371 Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
8372 if (!PyUnicode_Check(x)) {
8373 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
8374 "be a string if there is a second argument");
8375 goto err;
8376 }
8377 if (PyUnicode_GET_SIZE(x) != ylen) {
8378 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
8379 "arguments must have equal length");
8380 goto err;
8381 }
8382 /* create entries for translating chars in x to those in y */
8383 for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008384 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
8385 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008386 if (!key || !value)
8387 goto err;
8388 res = PyDict_SetItem(new, key, value);
8389 Py_DECREF(key);
8390 Py_DECREF(value);
8391 if (res < 0)
8392 goto err;
8393 }
8394 /* create entries for deleting chars in z */
8395 if (z != NULL) {
8396 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008397 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008398 if (!key)
8399 goto err;
8400 res = PyDict_SetItem(new, key, Py_None);
8401 Py_DECREF(key);
8402 if (res < 0)
8403 goto err;
8404 }
8405 }
8406 } else {
8407 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +00008408 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008409 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
8410 "to maketrans it must be a dict");
8411 goto err;
8412 }
8413 /* copy entries into the new dict, converting string keys to int keys */
8414 while (PyDict_Next(x, &i, &key, &value)) {
8415 if (PyUnicode_Check(key)) {
8416 /* convert string keys to integer keys */
8417 PyObject *newkey;
8418 if (PyUnicode_GET_SIZE(key) != 1) {
8419 PyErr_SetString(PyExc_ValueError, "string keys in translate "
8420 "table must be of length 1");
8421 goto err;
8422 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008423 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
Georg Brandlceee0772007-11-27 23:48:05 +00008424 if (!newkey)
8425 goto err;
8426 res = PyDict_SetItem(new, newkey, value);
8427 Py_DECREF(newkey);
8428 if (res < 0)
8429 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +00008430 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008431 /* just keep integer keys */
8432 if (PyDict_SetItem(new, key, value) < 0)
8433 goto err;
8434 } else {
8435 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8436 "be strings or integers");
8437 goto err;
8438 }
8439 }
8440 }
8441 return new;
8442 err:
8443 Py_DECREF(new);
8444 return NULL;
8445}
8446
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008447PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008448 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008449\n\
8450Return a copy of the string S, where all characters have been mapped\n\
8451through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008452Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00008453Unmapped characters are left untouched. Characters mapped to None\n\
8454are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008455
8456static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008457unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008458{
Georg Brandlceee0772007-11-27 23:48:05 +00008459 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008460}
8461
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008462PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008463 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008464\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008465Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008466
8467static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008468unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008469{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008470 return fixup(self, fixupper);
8471}
8472
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008473PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008474 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008475\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +00008476Pad a numeric string S with zeros on the left, to fill a field\n\
8477of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008478
8479static PyObject *
8480unicode_zfill(PyUnicodeObject *self, PyObject *args)
8481{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008482 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008483 PyUnicodeObject *u;
8484
Martin v. Löwis18e16552006-02-15 17:27:45 +00008485 Py_ssize_t width;
8486 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008487 return NULL;
8488
8489 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00008490 if (PyUnicode_CheckExact(self)) {
8491 Py_INCREF(self);
8492 return (PyObject*) self;
8493 }
8494 else
8495 return PyUnicode_FromUnicode(
8496 PyUnicode_AS_UNICODE(self),
8497 PyUnicode_GET_SIZE(self)
Benjamin Peterson29060642009-01-31 22:14:21 +00008498 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008499 }
8500
8501 fill = width - self->length;
8502
8503 u = pad(self, fill, 0, '0');
8504
Walter Dörwald068325e2002-04-15 13:36:47 +00008505 if (u == NULL)
8506 return NULL;
8507
Guido van Rossumd57fd912000-03-10 22:53:23 +00008508 if (u->str[fill] == '+' || u->str[fill] == '-') {
8509 /* move sign to beginning of string */
8510 u->str[0] = u->str[fill];
8511 u->str[fill] = '0';
8512 }
8513
8514 return (PyObject*) u;
8515}
Guido van Rossumd57fd912000-03-10 22:53:23 +00008516
#if 0
/* Compiled out.  Would return the current value of `numfree` as a Python
   int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    PyObject *count = PyLong_FromLong(numfree);

    return count;
}
#endif
8524
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008525PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008526 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008527\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008528Return True if S starts with the specified prefix, False otherwise.\n\
8529With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008530With optional end, stop comparing S at that position.\n\
8531prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008532
8533static PyObject *
8534unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008535 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008536{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008537 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008538 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008539 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008540 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008541 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008542
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008543 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008544 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8545 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008546 if (PyTuple_Check(subobj)) {
8547 Py_ssize_t i;
8548 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8549 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00008550 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008551 if (substring == NULL)
8552 return NULL;
8553 result = tailmatch(self, substring, start, end, -1);
8554 Py_DECREF(substring);
8555 if (result) {
8556 Py_RETURN_TRUE;
8557 }
8558 }
8559 /* nothing matched */
8560 Py_RETURN_FALSE;
8561 }
8562 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008563 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008564 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008565 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008566 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008567 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008568}
8569
8570
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008571PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008572 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008573\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008574Return True if S ends with the specified suffix, False otherwise.\n\
8575With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008576With optional end, stop comparing S at that position.\n\
8577suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008578
8579static PyObject *
8580unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008581 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008582{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008583 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008584 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008585 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008586 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008587 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008588
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008589 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008590 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8591 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008592 if (PyTuple_Check(subobj)) {
8593 Py_ssize_t i;
8594 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8595 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00008596 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008597 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008598 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008599 result = tailmatch(self, substring, start, end, +1);
8600 Py_DECREF(substring);
8601 if (result) {
8602 Py_RETURN_TRUE;
8603 }
8604 }
8605 Py_RETURN_FALSE;
8606 }
8607 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008608 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008609 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008610
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008611 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008612 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008613 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008614}
8615
Eric Smith8c663262007-08-25 02:26:07 +00008616#include "stringlib/string_format.h"
8617
8618PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008619 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008620\n\
8621");
8622
Eric Smith4a7d76d2008-05-30 18:10:19 +00008623static PyObject *
8624unicode__format__(PyObject* self, PyObject* args)
8625{
8626 PyObject *format_spec;
8627
8628 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
8629 return NULL;
8630
8631 return _PyUnicode_FormatAdvanced(self,
8632 PyUnicode_AS_UNICODE(format_spec),
8633 PyUnicode_GET_SIZE(format_spec));
8634}
8635
Eric Smith8c663262007-08-25 02:26:07 +00008636PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008637 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008638\n\
8639");
8640
8641static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008642unicode__sizeof__(PyUnicodeObject *v)
8643{
Robert Schuppeniesfbe94c52008-07-14 10:13:31 +00008644 return PyLong_FromSsize_t(sizeof(PyUnicodeObject) +
8645 sizeof(Py_UNICODE) * (v->length + 1));
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008646}
8647
/* Docstring for str.__sizeof__(); attached to the "__sizeof__" entry of
   unicode_methods. */
PyDoc_STRVAR(sizeof__doc__,
             "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008650
8651static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008652unicode_getnewargs(PyUnicodeObject *v)
8653{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008654 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008655}
8656
8657
/* Method table for the str type.

   Each entry is {name, C function, calling-convention flags, docstring}.
   Entries that list only three fields leave the docstring member
   zero-initialized.  The table ends with the {NULL, NULL} sentinel. */
static PyMethodDef unicode_methods[] = {

    /* Order is according to common usage: often used methods should
       appear first, since lookup is done sequentially. */

    {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
    {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
    {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
    {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
    {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
    {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
    /* The two _formatter_* helpers are registered without a docstring. */
    {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
    {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
    /* maketrans is the only entry flagged METH_STATIC. */
    {"maketrans", (PyCFunction) unicode_maketrans,
     METH_VARARGS | METH_STATIC, maketrans__doc__},
    {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
    /* Compiled out: capwords is not exposed. */
#if 0
    {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
#endif

#if 0
    /* This one is just used for debugging the implementation. */
    {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
#endif

    {"__getnewargs__",  (PyCFunction)unicode_getnewargs, METH_NOARGS},
    {NULL, NULL}
};
8722
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008723static PyObject *
8724unicode_mod(PyObject *v, PyObject *w)
8725{
Benjamin Peterson29060642009-01-31 22:14:21 +00008726 if (!PyUnicode_Check(v)) {
8727 Py_INCREF(Py_NotImplemented);
8728 return Py_NotImplemented;
8729 }
8730 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008731}
8732
/* Number-protocol slots for str.  Only nb_remainder is filled in (with
   unicode_mod); the three slots before it are explicitly 0 and every slot
   after it is left zero-initialized. */
static PyNumberMethods unicode_as_number = {
    0,                          /*nb_add*/
    0,                          /*nb_subtract*/
    0,                          /*nb_multiply*/
    unicode_mod,                /*nb_remainder*/
};
8739
/* Sequence-protocol slots for str: length, concatenation, repetition,
   item access and containment are provided; the slice and item-assignment
   slots are 0. */
static PySequenceMethods unicode_as_sequence = {
    (lenfunc) unicode_length,           /* sq_length */
    PyUnicode_Concat,                   /* sq_concat */
    (ssizeargfunc) unicode_repeat,      /* sq_repeat */
    (ssizeargfunc) unicode_getitem,     /* sq_item */
    0,                                  /* sq_slice */
    0,                                  /* sq_ass_item */
    0,                                  /* sq_ass_slice */
    PyUnicode_Contains,                 /* sq_contains */
};
8750
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008751static PyObject*
8752unicode_subscript(PyUnicodeObject* self, PyObject* item)
8753{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008754 if (PyIndex_Check(item)) {
8755 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008756 if (i == -1 && PyErr_Occurred())
8757 return NULL;
8758 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008759 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008760 return unicode_getitem(self, i);
8761 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008762 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008763 Py_UNICODE* source_buf;
8764 Py_UNICODE* result_buf;
8765 PyObject* result;
8766
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008767 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Benjamin Peterson29060642009-01-31 22:14:21 +00008768 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008769 return NULL;
8770 }
8771
8772 if (slicelength <= 0) {
8773 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00008774 } else if (start == 0 && step == 1 && slicelength == self->length &&
8775 PyUnicode_CheckExact(self)) {
8776 Py_INCREF(self);
8777 return (PyObject *)self;
8778 } else if (step == 1) {
8779 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008780 } else {
8781 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +00008782 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8783 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +00008784
Benjamin Peterson29060642009-01-31 22:14:21 +00008785 if (result_buf == NULL)
8786 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008787
8788 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8789 result_buf[i] = source_buf[cur];
8790 }
Tim Petersced69f82003-09-16 20:30:58 +00008791
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008792 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +00008793 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008794 return result;
8795 }
8796 } else {
8797 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8798 return NULL;
8799 }
8800}
8801
/* Mapping-protocol slots for str: length and subscription (index or
   slice, via unicode_subscript) are provided; subscript assignment is
   not. */
static PyMappingMethods unicode_as_mapping = {
    (lenfunc)unicode_length,            /* mp_length */
    (binaryfunc)unicode_subscript,      /* mp_subscript */
    (objobjargproc)0,                   /* mp_ass_subscript */
};
8807
Guido van Rossumd57fd912000-03-10 22:53:23 +00008808
Guido van Rossumd57fd912000-03-10 22:53:23 +00008809/* Helpers for PyUnicode_Format() */
8810
8811static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008812getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008813{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008814 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008815 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008816 (*p_argidx)++;
8817 if (arglen < 0)
8818 return args;
8819 else
8820 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008821 }
8822 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008823 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008824 return NULL;
8825}
8826
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008827/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008828
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008829static PyObject *
8830formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008831{
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008832 char *p;
8833 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008834 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008835
Guido van Rossumd57fd912000-03-10 22:53:23 +00008836 x = PyFloat_AsDouble(v);
8837 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008838 return NULL;
8839
Guido van Rossumd57fd912000-03-10 22:53:23 +00008840 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008841 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +00008842
Eric Smith0923d1d2009-04-16 20:16:10 +00008843 p = PyOS_double_to_string(x, type, prec,
8844 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008845 if (p == NULL)
8846 return NULL;
8847 result = PyUnicode_FromStringAndSize(p, strlen(p));
Eric Smith0923d1d2009-04-16 20:16:10 +00008848 PyMem_Free(p);
8849 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008850}
8851
Tim Peters38fd5b62000-09-21 05:43:11 +00008852static PyObject*
8853formatlong(PyObject *val, int flags, int prec, int type)
8854{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008855 char *buf;
8856 int len;
8857 PyObject *str; /* temporary string object. */
8858 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008859
Benjamin Peterson14339b62009-01-31 16:36:08 +00008860 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
8861 if (!str)
8862 return NULL;
8863 result = PyUnicode_FromStringAndSize(buf, len);
8864 Py_DECREF(str);
8865 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008866}
8867
Guido van Rossumd57fd912000-03-10 22:53:23 +00008868static int
8869formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008870 size_t buflen,
8871 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008872{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008873 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008874 if (PyUnicode_Check(v)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008875 if (PyUnicode_GET_SIZE(v) == 1) {
8876 buf[0] = PyUnicode_AS_UNICODE(v)[0];
8877 buf[1] = '\0';
8878 return 1;
8879 }
8880#ifndef Py_UNICODE_WIDE
8881 if (PyUnicode_GET_SIZE(v) == 2) {
8882 /* Decode a valid surrogate pair */
8883 int c0 = PyUnicode_AS_UNICODE(v)[0];
8884 int c1 = PyUnicode_AS_UNICODE(v)[1];
8885 if (0xD800 <= c0 && c0 <= 0xDBFF &&
8886 0xDC00 <= c1 && c1 <= 0xDFFF) {
8887 buf[0] = c0;
8888 buf[1] = c1;
8889 buf[2] = '\0';
8890 return 2;
8891 }
8892 }
8893#endif
8894 goto onError;
8895 }
8896 else {
8897 /* Integer input truncated to a character */
8898 long x;
8899 x = PyLong_AsLong(v);
8900 if (x == -1 && PyErr_Occurred())
8901 goto onError;
8902
8903 if (x < 0 || x > 0x10ffff) {
8904 PyErr_SetString(PyExc_OverflowError,
8905 "%c arg not in range(0x110000)");
8906 return -1;
8907 }
8908
8909#ifndef Py_UNICODE_WIDE
8910 if (x > 0xffff) {
8911 x -= 0x10000;
8912 buf[0] = (Py_UNICODE)(0xD800 | (x >> 10));
8913 buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF));
8914 return 2;
8915 }
8916#endif
8917 buf[0] = (Py_UNICODE) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008918 buf[1] = '\0';
8919 return 1;
8920 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008921
Benjamin Peterson29060642009-01-31 22:14:21 +00008922 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008923 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008924 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008925 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008926}
8927
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
   FORMATBUFLEN is the length of the buffer in which chars are formatted.
   PyUnicode_Format() uses it to size the scratch array that receives the
   output of formatchar() and the literal '%' of a "%%" specifier.
*/
#define FORMATBUFLEN (size_t)10
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008932
Guido van Rossumd57fd912000-03-10 22:53:23 +00008933PyObject *PyUnicode_Format(PyObject *format,
Benjamin Peterson29060642009-01-31 22:14:21 +00008934 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008935{
8936 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008937 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008938 int args_owned = 0;
8939 PyUnicodeObject *result = NULL;
8940 PyObject *dict = NULL;
8941 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008942
Guido van Rossumd57fd912000-03-10 22:53:23 +00008943 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008944 PyErr_BadInternalCall();
8945 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008946 }
8947 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008948 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008949 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008950 fmt = PyUnicode_AS_UNICODE(uformat);
8951 fmtcnt = PyUnicode_GET_SIZE(uformat);
8952
8953 reslen = rescnt = fmtcnt + 100;
8954 result = _PyUnicode_New(reslen);
8955 if (result == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008956 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008957 res = PyUnicode_AS_UNICODE(result);
8958
8959 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008960 arglen = PyTuple_Size(args);
8961 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008962 }
8963 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008964 arglen = -1;
8965 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008966 }
Christian Heimes90aa7642007-12-19 02:45:37 +00008967 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +00008968 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +00008969 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008970
8971 while (--fmtcnt >= 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008972 if (*fmt != '%') {
8973 if (--rescnt < 0) {
8974 rescnt = fmtcnt + 100;
8975 reslen += rescnt;
8976 if (_PyUnicode_Resize(&result, reslen) < 0)
8977 goto onError;
8978 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8979 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008980 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008981 *res++ = *fmt++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008982 }
8983 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008984 /* Got a format specifier */
8985 int flags = 0;
8986 Py_ssize_t width = -1;
8987 int prec = -1;
8988 Py_UNICODE c = '\0';
8989 Py_UNICODE fill;
8990 int isnumok;
8991 PyObject *v = NULL;
8992 PyObject *temp = NULL;
8993 Py_UNICODE *pbuf;
8994 Py_UNICODE sign;
8995 Py_ssize_t len;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008996 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008997
Benjamin Peterson29060642009-01-31 22:14:21 +00008998 fmt++;
8999 if (*fmt == '(') {
9000 Py_UNICODE *keystart;
9001 Py_ssize_t keylen;
9002 PyObject *key;
9003 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +00009004
Benjamin Peterson29060642009-01-31 22:14:21 +00009005 if (dict == NULL) {
9006 PyErr_SetString(PyExc_TypeError,
9007 "format requires a mapping");
9008 goto onError;
9009 }
9010 ++fmt;
9011 --fmtcnt;
9012 keystart = fmt;
9013 /* Skip over balanced parentheses */
9014 while (pcount > 0 && --fmtcnt >= 0) {
9015 if (*fmt == ')')
9016 --pcount;
9017 else if (*fmt == '(')
9018 ++pcount;
9019 fmt++;
9020 }
9021 keylen = fmt - keystart - 1;
9022 if (fmtcnt < 0 || pcount > 0) {
9023 PyErr_SetString(PyExc_ValueError,
9024 "incomplete format key");
9025 goto onError;
9026 }
9027#if 0
9028 /* keys are converted to strings using UTF-8 and
9029 then looked up since Python uses strings to hold
9030 variables names etc. in its namespaces and we
9031 wouldn't want to break common idioms. */
9032 key = PyUnicode_EncodeUTF8(keystart,
9033 keylen,
9034 NULL);
9035#else
9036 key = PyUnicode_FromUnicode(keystart, keylen);
9037#endif
9038 if (key == NULL)
9039 goto onError;
9040 if (args_owned) {
9041 Py_DECREF(args);
9042 args_owned = 0;
9043 }
9044 args = PyObject_GetItem(dict, key);
9045 Py_DECREF(key);
9046 if (args == NULL) {
9047 goto onError;
9048 }
9049 args_owned = 1;
9050 arglen = -1;
9051 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009052 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009053 while (--fmtcnt >= 0) {
9054 switch (c = *fmt++) {
9055 case '-': flags |= F_LJUST; continue;
9056 case '+': flags |= F_SIGN; continue;
9057 case ' ': flags |= F_BLANK; continue;
9058 case '#': flags |= F_ALT; continue;
9059 case '0': flags |= F_ZERO; continue;
9060 }
9061 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009062 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009063 if (c == '*') {
9064 v = getnextarg(args, arglen, &argidx);
9065 if (v == NULL)
9066 goto onError;
9067 if (!PyLong_Check(v)) {
9068 PyErr_SetString(PyExc_TypeError,
9069 "* wants int");
9070 goto onError;
9071 }
9072 width = PyLong_AsLong(v);
9073 if (width == -1 && PyErr_Occurred())
9074 goto onError;
9075 if (width < 0) {
9076 flags |= F_LJUST;
9077 width = -width;
9078 }
9079 if (--fmtcnt >= 0)
9080 c = *fmt++;
9081 }
9082 else if (c >= '0' && c <= '9') {
9083 width = c - '0';
9084 while (--fmtcnt >= 0) {
9085 c = *fmt++;
9086 if (c < '0' || c > '9')
9087 break;
9088 if ((width*10) / 10 != width) {
9089 PyErr_SetString(PyExc_ValueError,
9090 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009091 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00009092 }
9093 width = width*10 + (c - '0');
9094 }
9095 }
9096 if (c == '.') {
9097 prec = 0;
9098 if (--fmtcnt >= 0)
9099 c = *fmt++;
9100 if (c == '*') {
9101 v = getnextarg(args, arglen, &argidx);
9102 if (v == NULL)
9103 goto onError;
9104 if (!PyLong_Check(v)) {
9105 PyErr_SetString(PyExc_TypeError,
9106 "* wants int");
9107 goto onError;
9108 }
9109 prec = PyLong_AsLong(v);
9110 if (prec == -1 && PyErr_Occurred())
9111 goto onError;
9112 if (prec < 0)
9113 prec = 0;
9114 if (--fmtcnt >= 0)
9115 c = *fmt++;
9116 }
9117 else if (c >= '0' && c <= '9') {
9118 prec = c - '0';
9119 while (--fmtcnt >= 0) {
9120 c = Py_CHARMASK(*fmt++);
9121 if (c < '0' || c > '9')
9122 break;
9123 if ((prec*10) / 10 != prec) {
9124 PyErr_SetString(PyExc_ValueError,
9125 "prec too big");
9126 goto onError;
9127 }
9128 prec = prec*10 + (c - '0');
9129 }
9130 }
9131 } /* prec */
9132 if (fmtcnt >= 0) {
9133 if (c == 'h' || c == 'l' || c == 'L') {
9134 if (--fmtcnt >= 0)
9135 c = *fmt++;
9136 }
9137 }
9138 if (fmtcnt < 0) {
9139 PyErr_SetString(PyExc_ValueError,
9140 "incomplete format");
9141 goto onError;
9142 }
9143 if (c != '%') {
9144 v = getnextarg(args, arglen, &argidx);
9145 if (v == NULL)
9146 goto onError;
9147 }
9148 sign = 0;
9149 fill = ' ';
9150 switch (c) {
9151
9152 case '%':
9153 pbuf = formatbuf;
9154 /* presume that buffer length is at least 1 */
9155 pbuf[0] = '%';
9156 len = 1;
9157 break;
9158
9159 case 's':
9160 case 'r':
9161 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +00009162 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009163 temp = v;
9164 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009165 }
9166 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009167 if (c == 's')
9168 temp = PyObject_Str(v);
9169 else if (c == 'r')
9170 temp = PyObject_Repr(v);
9171 else
9172 temp = PyObject_ASCII(v);
9173 if (temp == NULL)
9174 goto onError;
9175 if (PyUnicode_Check(temp))
9176 /* nothing to do */;
9177 else {
9178 Py_DECREF(temp);
9179 PyErr_SetString(PyExc_TypeError,
9180 "%s argument has non-string str()");
9181 goto onError;
9182 }
9183 }
9184 pbuf = PyUnicode_AS_UNICODE(temp);
9185 len = PyUnicode_GET_SIZE(temp);
9186 if (prec >= 0 && len > prec)
9187 len = prec;
9188 break;
9189
9190 case 'i':
9191 case 'd':
9192 case 'u':
9193 case 'o':
9194 case 'x':
9195 case 'X':
9196 if (c == 'i')
9197 c = 'd';
9198 isnumok = 0;
9199 if (PyNumber_Check(v)) {
9200 PyObject *iobj=NULL;
9201
9202 if (PyLong_Check(v)) {
9203 iobj = v;
9204 Py_INCREF(iobj);
9205 }
9206 else {
9207 iobj = PyNumber_Long(v);
9208 }
9209 if (iobj!=NULL) {
9210 if (PyLong_Check(iobj)) {
9211 isnumok = 1;
9212 temp = formatlong(iobj, flags, prec, c);
9213 Py_DECREF(iobj);
9214 if (!temp)
9215 goto onError;
9216 pbuf = PyUnicode_AS_UNICODE(temp);
9217 len = PyUnicode_GET_SIZE(temp);
9218 sign = 1;
9219 }
9220 else {
9221 Py_DECREF(iobj);
9222 }
9223 }
9224 }
9225 if (!isnumok) {
9226 PyErr_Format(PyExc_TypeError,
9227 "%%%c format: a number is required, "
9228 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
9229 goto onError;
9230 }
9231 if (flags & F_ZERO)
9232 fill = '0';
9233 break;
9234
9235 case 'e':
9236 case 'E':
9237 case 'f':
9238 case 'F':
9239 case 'g':
9240 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009241 temp = formatfloat(v, flags, prec, c);
9242 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +00009243 goto onError;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009244 pbuf = PyUnicode_AS_UNICODE(temp);
9245 len = PyUnicode_GET_SIZE(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009246 sign = 1;
9247 if (flags & F_ZERO)
9248 fill = '0';
9249 break;
9250
9251 case 'c':
9252 pbuf = formatbuf;
9253 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
9254 if (len < 0)
9255 goto onError;
9256 break;
9257
9258 default:
9259 PyErr_Format(PyExc_ValueError,
9260 "unsupported format character '%c' (0x%x) "
9261 "at index %zd",
9262 (31<=c && c<=126) ? (char)c : '?',
9263 (int)c,
9264 (Py_ssize_t)(fmt - 1 -
9265 PyUnicode_AS_UNICODE(uformat)));
9266 goto onError;
9267 }
9268 if (sign) {
9269 if (*pbuf == '-' || *pbuf == '+') {
9270 sign = *pbuf++;
9271 len--;
9272 }
9273 else if (flags & F_SIGN)
9274 sign = '+';
9275 else if (flags & F_BLANK)
9276 sign = ' ';
9277 else
9278 sign = 0;
9279 }
9280 if (width < len)
9281 width = len;
9282 if (rescnt - (sign != 0) < width) {
9283 reslen -= rescnt;
9284 rescnt = width + fmtcnt + 100;
9285 reslen += rescnt;
9286 if (reslen < 0) {
9287 Py_XDECREF(temp);
9288 PyErr_NoMemory();
9289 goto onError;
9290 }
9291 if (_PyUnicode_Resize(&result, reslen) < 0) {
9292 Py_XDECREF(temp);
9293 goto onError;
9294 }
9295 res = PyUnicode_AS_UNICODE(result)
9296 + reslen - rescnt;
9297 }
9298 if (sign) {
9299 if (fill != ' ')
9300 *res++ = sign;
9301 rescnt--;
9302 if (width > len)
9303 width--;
9304 }
9305 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9306 assert(pbuf[0] == '0');
9307 assert(pbuf[1] == c);
9308 if (fill != ' ') {
9309 *res++ = *pbuf++;
9310 *res++ = *pbuf++;
9311 }
9312 rescnt -= 2;
9313 width -= 2;
9314 if (width < 0)
9315 width = 0;
9316 len -= 2;
9317 }
9318 if (width > len && !(flags & F_LJUST)) {
9319 do {
9320 --rescnt;
9321 *res++ = fill;
9322 } while (--width > len);
9323 }
9324 if (fill == ' ') {
9325 if (sign)
9326 *res++ = sign;
9327 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9328 assert(pbuf[0] == '0');
9329 assert(pbuf[1] == c);
9330 *res++ = *pbuf++;
9331 *res++ = *pbuf++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009332 }
9333 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009334 Py_UNICODE_COPY(res, pbuf, len);
9335 res += len;
9336 rescnt -= len;
9337 while (--width >= len) {
9338 --rescnt;
9339 *res++ = ' ';
9340 }
9341 if (dict && (argidx < arglen) && c != '%') {
9342 PyErr_SetString(PyExc_TypeError,
9343 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00009344 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009345 goto onError;
9346 }
9347 Py_XDECREF(temp);
9348 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009349 } /* until end */
9350 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009351 PyErr_SetString(PyExc_TypeError,
9352 "not all arguments converted during string formatting");
9353 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009354 }
9355
Thomas Woutersa96affe2006-03-12 00:29:36 +00009356 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009357 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009358 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009359 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009360 }
9361 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009362 return (PyObject *)result;
9363
Benjamin Peterson29060642009-01-31 22:14:21 +00009364 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009365 Py_XDECREF(result);
9366 Py_DECREF(uformat);
9367 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009368 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009369 }
9370 return NULL;
9371}
9372
Jeremy Hylton938ace62002-07-17 16:30:39 +00009373static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00009374unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9375
Tim Peters6d6c1a32001-08-02 04:15:00 +00009376static PyObject *
9377unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9378{
Benjamin Peterson29060642009-01-31 22:14:21 +00009379 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009380 static char *kwlist[] = {"object", "encoding", "errors", 0};
9381 char *encoding = NULL;
9382 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00009383
Benjamin Peterson14339b62009-01-31 16:36:08 +00009384 if (type != &PyUnicode_Type)
9385 return unicode_subtype_new(type, args, kwds);
9386 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +00009387 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009388 return NULL;
9389 if (x == NULL)
9390 return (PyObject *)_PyUnicode_New(0);
9391 if (encoding == NULL && errors == NULL)
9392 return PyObject_Str(x);
9393 else
Benjamin Peterson29060642009-01-31 22:14:21 +00009394 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00009395}
9396
Guido van Rossume023fe02001-08-30 03:12:59 +00009397static PyObject *
9398unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9399{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009400 PyUnicodeObject *tmp, *pnew;
9401 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00009402
Benjamin Peterson14339b62009-01-31 16:36:08 +00009403 assert(PyType_IsSubtype(type, &PyUnicode_Type));
9404 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9405 if (tmp == NULL)
9406 return NULL;
9407 assert(PyUnicode_Check(tmp));
9408 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
9409 if (pnew == NULL) {
9410 Py_DECREF(tmp);
9411 return NULL;
9412 }
9413 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
9414 if (pnew->str == NULL) {
9415 _Py_ForgetReference((PyObject *)pnew);
9416 PyObject_Del(pnew);
9417 Py_DECREF(tmp);
9418 return PyErr_NoMemory();
9419 }
9420 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9421 pnew->length = n;
9422 pnew->hash = tmp->hash;
9423 Py_DECREF(tmp);
9424 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00009425}
9426
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009427PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +00009428 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00009429\n\
Collin Winterd474ce82007-08-07 19:42:11 +00009430Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00009431encoding defaults to the current default string encoding.\n\
9432errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00009433
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009434static PyObject *unicode_iter(PyObject *seq);
9435
Guido van Rossumd57fd912000-03-10 22:53:23 +00009436PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00009437 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +00009438 "str", /* tp_name */
9439 sizeof(PyUnicodeObject), /* tp_size */
9440 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009441 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009442 (destructor)unicode_dealloc, /* tp_dealloc */
9443 0, /* tp_print */
9444 0, /* tp_getattr */
9445 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +00009446 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009447 unicode_repr, /* tp_repr */
9448 &unicode_as_number, /* tp_as_number */
9449 &unicode_as_sequence, /* tp_as_sequence */
9450 &unicode_as_mapping, /* tp_as_mapping */
9451 (hashfunc) unicode_hash, /* tp_hash*/
9452 0, /* tp_call*/
9453 (reprfunc) unicode_str, /* tp_str */
9454 PyObject_GenericGetAttr, /* tp_getattro */
9455 0, /* tp_setattro */
9456 0, /* tp_as_buffer */
9457 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +00009458 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009459 unicode_doc, /* tp_doc */
9460 0, /* tp_traverse */
9461 0, /* tp_clear */
9462 PyUnicode_RichCompare, /* tp_richcompare */
9463 0, /* tp_weaklistoffset */
9464 unicode_iter, /* tp_iter */
9465 0, /* tp_iternext */
9466 unicode_methods, /* tp_methods */
9467 0, /* tp_members */
9468 0, /* tp_getset */
9469 &PyBaseObject_Type, /* tp_base */
9470 0, /* tp_dict */
9471 0, /* tp_descr_get */
9472 0, /* tp_descr_set */
9473 0, /* tp_dictoffset */
9474 0, /* tp_init */
9475 0, /* tp_alloc */
9476 unicode_new, /* tp_new */
9477 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009478};
9479
9480/* Initialize the Unicode implementation */
9481
Thomas Wouters78890102000-07-22 19:25:51 +00009482void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009483{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009484 int i;
9485
Thomas Wouters477c8d52006-05-27 19:21:47 +00009486 /* XXX - move this array to unicodectype.c ? */
9487 Py_UNICODE linebreak[] = {
9488 0x000A, /* LINE FEED */
9489 0x000D, /* CARRIAGE RETURN */
9490 0x001C, /* FILE SEPARATOR */
9491 0x001D, /* GROUP SEPARATOR */
9492 0x001E, /* RECORD SEPARATOR */
9493 0x0085, /* NEXT LINE */
9494 0x2028, /* LINE SEPARATOR */
9495 0x2029, /* PARAGRAPH SEPARATOR */
9496 };
9497
Fred Drakee4315f52000-05-09 19:53:39 +00009498 /* Init the implementation */
Christian Heimes2202f872008-02-06 14:31:34 +00009499 free_list = NULL;
9500 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009501 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009502 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00009503 return;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009504
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009505 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00009506 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00009507 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009508 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00009509
9510 /* initialize the linebreak bloom filter */
9511 bloom_linebreak = make_bloom_mask(
9512 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9513 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009514
9515 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009516}
9517
9518/* Finalize the Unicode implementation */
9519
Christian Heimesa156e092008-02-16 07:38:31 +00009520int
9521PyUnicode_ClearFreeList(void)
9522{
9523 int freelist_size = numfree;
9524 PyUnicodeObject *u;
9525
9526 for (u = free_list; u != NULL;) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009527 PyUnicodeObject *v = u;
9528 u = *(PyUnicodeObject **)u;
9529 if (v->str)
9530 PyObject_DEL(v->str);
9531 Py_XDECREF(v->defenc);
9532 PyObject_Del(v);
9533 numfree--;
Christian Heimesa156e092008-02-16 07:38:31 +00009534 }
9535 free_list = NULL;
9536 assert(numfree == 0);
9537 return freelist_size;
9538}
9539
Guido van Rossumd57fd912000-03-10 22:53:23 +00009540void
Thomas Wouters78890102000-07-22 19:25:51 +00009541_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009542{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009543 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009544
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00009545 Py_XDECREF(unicode_empty);
9546 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009547
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009548 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009549 if (unicode_latin1[i]) {
9550 Py_DECREF(unicode_latin1[i]);
9551 unicode_latin1[i] = NULL;
9552 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009553 }
Christian Heimesa156e092008-02-16 07:38:31 +00009554 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00009555}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009556
Walter Dörwald16807132007-05-25 13:52:07 +00009557void
9558PyUnicode_InternInPlace(PyObject **p)
9559{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009560 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
9561 PyObject *t;
9562 if (s == NULL || !PyUnicode_Check(s))
9563 Py_FatalError(
9564 "PyUnicode_InternInPlace: unicode strings only please!");
9565 /* If it's a subclass, we don't really know what putting
9566 it in the interned dict might do. */
9567 if (!PyUnicode_CheckExact(s))
9568 return;
9569 if (PyUnicode_CHECK_INTERNED(s))
9570 return;
9571 if (interned == NULL) {
9572 interned = PyDict_New();
9573 if (interned == NULL) {
9574 PyErr_Clear(); /* Don't leave an exception */
9575 return;
9576 }
9577 }
9578 /* It might be that the GetItem call fails even
9579 though the key is present in the dictionary,
9580 namely when this happens during a stack overflow. */
9581 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +00009582 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009583 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +00009584
Benjamin Peterson29060642009-01-31 22:14:21 +00009585 if (t) {
9586 Py_INCREF(t);
9587 Py_DECREF(*p);
9588 *p = t;
9589 return;
9590 }
Walter Dörwald16807132007-05-25 13:52:07 +00009591
Benjamin Peterson14339b62009-01-31 16:36:08 +00009592 PyThreadState_GET()->recursion_critical = 1;
9593 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
9594 PyErr_Clear();
9595 PyThreadState_GET()->recursion_critical = 0;
9596 return;
9597 }
9598 PyThreadState_GET()->recursion_critical = 0;
9599 /* The two references in interned are not counted by refcnt.
9600 The deallocator will take care of this */
9601 Py_REFCNT(s) -= 2;
9602 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +00009603}
9604
9605void
9606PyUnicode_InternImmortal(PyObject **p)
9607{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009608 PyUnicode_InternInPlace(p);
9609 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
9610 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
9611 Py_INCREF(*p);
9612 }
Walter Dörwald16807132007-05-25 13:52:07 +00009613}
9614
9615PyObject *
9616PyUnicode_InternFromString(const char *cp)
9617{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009618 PyObject *s = PyUnicode_FromString(cp);
9619 if (s == NULL)
9620 return NULL;
9621 PyUnicode_InternInPlace(&s);
9622 return s;
Walter Dörwald16807132007-05-25 13:52:07 +00009623}
9624
9625void _Py_ReleaseInternedUnicodeStrings(void)
9626{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009627 PyObject *keys;
9628 PyUnicodeObject *s;
9629 Py_ssize_t i, n;
9630 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009631
Benjamin Peterson14339b62009-01-31 16:36:08 +00009632 if (interned == NULL || !PyDict_Check(interned))
9633 return;
9634 keys = PyDict_Keys(interned);
9635 if (keys == NULL || !PyList_Check(keys)) {
9636 PyErr_Clear();
9637 return;
9638 }
Walter Dörwald16807132007-05-25 13:52:07 +00009639
Benjamin Peterson14339b62009-01-31 16:36:08 +00009640 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
9641 detector, interned unicode strings are not forcibly deallocated;
9642 rather, we give them their stolen references back, and then clear
9643 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +00009644
Benjamin Peterson14339b62009-01-31 16:36:08 +00009645 n = PyList_GET_SIZE(keys);
9646 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +00009647 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009648 for (i = 0; i < n; i++) {
9649 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
9650 switch (s->state) {
9651 case SSTATE_NOT_INTERNED:
9652 /* XXX Shouldn't happen */
9653 break;
9654 case SSTATE_INTERNED_IMMORTAL:
9655 Py_REFCNT(s) += 1;
9656 immortal_size += s->length;
9657 break;
9658 case SSTATE_INTERNED_MORTAL:
9659 Py_REFCNT(s) += 2;
9660 mortal_size += s->length;
9661 break;
9662 default:
9663 Py_FatalError("Inconsistent interned string state.");
9664 }
9665 s->state = SSTATE_NOT_INTERNED;
9666 }
9667 fprintf(stderr, "total size of all interned strings: "
9668 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
9669 "mortal/immortal\n", mortal_size, immortal_size);
9670 Py_DECREF(keys);
9671 PyDict_Clear(interned);
9672 Py_DECREF(interned);
9673 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +00009674}
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009675
9676
9677/********************* Unicode Iterator **************************/
9678
9679typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009680 PyObject_HEAD
9681 Py_ssize_t it_index;
9682 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009683} unicodeiterobject;
9684
9685static void
9686unicodeiter_dealloc(unicodeiterobject *it)
9687{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009688 _PyObject_GC_UNTRACK(it);
9689 Py_XDECREF(it->it_seq);
9690 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009691}
9692
9693static int
9694unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
9695{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009696 Py_VISIT(it->it_seq);
9697 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009698}
9699
9700static PyObject *
9701unicodeiter_next(unicodeiterobject *it)
9702{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009703 PyUnicodeObject *seq;
9704 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009705
Benjamin Peterson14339b62009-01-31 16:36:08 +00009706 assert(it != NULL);
9707 seq = it->it_seq;
9708 if (seq == NULL)
9709 return NULL;
9710 assert(PyUnicode_Check(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009711
Benjamin Peterson14339b62009-01-31 16:36:08 +00009712 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
9713 item = PyUnicode_FromUnicode(
Benjamin Peterson29060642009-01-31 22:14:21 +00009714 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009715 if (item != NULL)
9716 ++it->it_index;
9717 return item;
9718 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009719
Benjamin Peterson14339b62009-01-31 16:36:08 +00009720 Py_DECREF(seq);
9721 it->it_seq = NULL;
9722 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009723}
9724
9725static PyObject *
9726unicodeiter_len(unicodeiterobject *it)
9727{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009728 Py_ssize_t len = 0;
9729 if (it->it_seq)
9730 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
9731 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009732}
9733
9734PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
9735
9736static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009737 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00009738 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +00009739 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009740};
9741
9742PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009743 PyVarObject_HEAD_INIT(&PyType_Type, 0)
9744 "str_iterator", /* tp_name */
9745 sizeof(unicodeiterobject), /* tp_basicsize */
9746 0, /* tp_itemsize */
9747 /* methods */
9748 (destructor)unicodeiter_dealloc, /* tp_dealloc */
9749 0, /* tp_print */
9750 0, /* tp_getattr */
9751 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +00009752 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009753 0, /* tp_repr */
9754 0, /* tp_as_number */
9755 0, /* tp_as_sequence */
9756 0, /* tp_as_mapping */
9757 0, /* tp_hash */
9758 0, /* tp_call */
9759 0, /* tp_str */
9760 PyObject_GenericGetAttr, /* tp_getattro */
9761 0, /* tp_setattro */
9762 0, /* tp_as_buffer */
9763 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
9764 0, /* tp_doc */
9765 (traverseproc)unicodeiter_traverse, /* tp_traverse */
9766 0, /* tp_clear */
9767 0, /* tp_richcompare */
9768 0, /* tp_weaklistoffset */
9769 PyObject_SelfIter, /* tp_iter */
9770 (iternextfunc)unicodeiter_next, /* tp_iternext */
9771 unicodeiter_methods, /* tp_methods */
9772 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009773};
9774
9775static PyObject *
9776unicode_iter(PyObject *seq)
9777{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009778 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009779
Benjamin Peterson14339b62009-01-31 16:36:08 +00009780 if (!PyUnicode_Check(seq)) {
9781 PyErr_BadInternalCall();
9782 return NULL;
9783 }
9784 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
9785 if (it == NULL)
9786 return NULL;
9787 it->it_index = 0;
9788 Py_INCREF(seq);
9789 it->it_seq = (PyUnicodeObject *)seq;
9790 _PyObject_GC_TRACK(it);
9791 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009792}
9793
Martin v. Löwis5b222132007-06-10 09:51:05 +00009794size_t
9795Py_UNICODE_strlen(const Py_UNICODE *u)
9796{
9797 int res = 0;
9798 while(*u++)
9799 res++;
9800 return res;
9801}
9802
9803Py_UNICODE*
9804Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
9805{
9806 Py_UNICODE *u = s1;
9807 while ((*u++ = *s2++));
9808 return s1;
9809}
9810
9811Py_UNICODE*
9812Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
9813{
9814 Py_UNICODE *u = s1;
9815 while ((*u++ = *s2++))
9816 if (n-- == 0)
9817 break;
9818 return s1;
9819}
9820
9821int
9822Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
9823{
9824 while (*s1 && *s2 && *s1 == *s2)
9825 s1++, s2++;
9826 if (*s1 && *s2)
9827 return (*s1 < *s2) ? -1 : +1;
9828 if (*s1)
9829 return 1;
9830 if (*s2)
9831 return -1;
9832 return 0;
9833}
9834
9835Py_UNICODE*
9836Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
9837{
9838 const Py_UNICODE *p;
9839 for (p = s; *p; p++)
9840 if (*p == c)
9841 return (Py_UNICODE*)p;
9842 return NULL;
9843}
9844
9845
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009846#ifdef __cplusplus
9847}
9848#endif