blob: 594623c0bc5551cc995483c1af6dfeef3db2554d [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000044
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000045#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000046#include <windows.h>
47#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000048
Guido van Rossumd57fd912000-03-10 22:53:23 +000049/* Limit for the Unicode object free list */
50
Christian Heimes2202f872008-02-06 14:31:34 +000051#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000052
53/* Limit for the Unicode object free list stay alive optimization.
54
55 The implementation will keep allocated Unicode memory intact for
56 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000057 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000058
Christian Heimes2202f872008-02-06 14:31:34 +000059 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000060 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000061 malloc()-overhead) bytes of unused garbage.
62
63 Setting the limit to 0 effectively turns the feature off.
64
Guido van Rossumfd4b9572000-04-10 13:51:10 +000065 Note: This is an experimental feature ! If you get core dumps when
66 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000067
68*/
69
Guido van Rossumfd4b9572000-04-10 13:51:10 +000070#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000071
72/* Endianness switches; defaults to little endian */
73
74#ifdef WORDS_BIGENDIAN
75# define BYTEORDER_IS_BIG_ENDIAN
76#else
77# define BYTEORDER_IS_LITTLE_ENDIAN
78#endif
79
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000080/* --- Globals ------------------------------------------------------------
81
82 The globals are initialized by the _PyUnicode_Init() API and should
83 not be used before calling that API.
84
85*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000086
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000087
88#ifdef __cplusplus
89extern "C" {
90#endif
91
Victor Stinner910337b2011-10-03 03:20:16 +020092#ifdef Py_DEBUG
93# define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op)
94#else
95# define _PyUnicode_CHECK(op) PyUnicode_Check(op)
96#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020097
Victor Stinnere90fe6a2011-10-01 16:48:13 +020098#define _PyUnicode_UTF8(op) \
99 (((PyCompactUnicodeObject*)(op))->utf8)
100#define PyUnicode_UTF8(op) \
Victor Stinner910337b2011-10-03 03:20:16 +0200101 (assert(_PyUnicode_CHECK(op)), \
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200102 assert(PyUnicode_IS_READY(op)), \
103 PyUnicode_IS_COMPACT_ASCII(op) ? \
104 ((char*)((PyASCIIObject*)(op) + 1)) : \
105 _PyUnicode_UTF8(op))
Victor Stinnerbc8b81b2011-09-29 19:31:34 +0200106#define _PyUnicode_UTF8_LENGTH(op) \
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200107 (((PyCompactUnicodeObject*)(op))->utf8_length)
108#define PyUnicode_UTF8_LENGTH(op) \
Victor Stinner910337b2011-10-03 03:20:16 +0200109 (assert(_PyUnicode_CHECK(op)), \
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200110 assert(PyUnicode_IS_READY(op)), \
111 PyUnicode_IS_COMPACT_ASCII(op) ? \
112 ((PyASCIIObject*)(op))->length : \
113 _PyUnicode_UTF8_LENGTH(op))
Victor Stinnera5f91632011-10-04 01:07:11 +0200114#define _PyUnicode_WSTR(op) \
115 (((PyASCIIObject*)(op))->wstr)
116#define _PyUnicode_WSTR_LENGTH(op) \
117 (((PyCompactUnicodeObject*)(op))->wstr_length)
118#define _PyUnicode_LENGTH(op) \
119 (((PyASCIIObject *)(op))->length)
120#define _PyUnicode_STATE(op) \
121 (((PyASCIIObject *)(op))->state)
122#define _PyUnicode_HASH(op) \
123 (((PyASCIIObject *)(op))->hash)
Victor Stinner910337b2011-10-03 03:20:16 +0200124#define _PyUnicode_KIND(op) \
125 (assert(_PyUnicode_CHECK(op)), \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200126 ((PyASCIIObject *)(op))->state.kind)
Victor Stinner910337b2011-10-03 03:20:16 +0200127#define _PyUnicode_GET_LENGTH(op) \
128 (assert(_PyUnicode_CHECK(op)), \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200129 ((PyASCIIObject *)(op))->length)
Victor Stinnera5f91632011-10-04 01:07:11 +0200130#define _PyUnicode_DATA_ANY(op) \
131 (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200132
Victor Stinner910337b2011-10-03 03:20:16 +0200133#undef PyUnicode_READY
134#define PyUnicode_READY(op) \
135 (assert(_PyUnicode_CHECK(op)), \
136 (PyUnicode_IS_READY(op) ? \
Victor Stinnera5f91632011-10-04 01:07:11 +0200137 0 : \
138 _PyUnicode_Ready((PyObject *)(op))))
Victor Stinner910337b2011-10-03 03:20:16 +0200139
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200140#define _PyUnicode_READY_REPLACE(p_obj) \
141 (assert(_PyUnicode_CHECK(*p_obj)), \
142 (PyUnicode_IS_READY(*p_obj) ? \
143 0 : _PyUnicode_ReadyReplace((PyObject **)(p_obj))))
144
Victor Stinnerc379ead2011-10-03 12:52:27 +0200145#define _PyUnicode_SHARE_UTF8(op) \
146 (assert(_PyUnicode_CHECK(op)), \
147 assert(!PyUnicode_IS_COMPACT_ASCII(op)), \
148 (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wstr pointer of op is the very same pointer as its
   character data.  Every use of the argument goes through the macro
   parameter, so the macro works regardless of how the caller names
   its variable. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
152
Victor Stinner829c0ad2011-10-03 01:08:02 +0200153/* true if the Unicode object has an allocated UTF-8 memory block
154 (not shared with other data) */
Victor Stinner910337b2011-10-03 03:20:16 +0200155#define _PyUnicode_HAS_UTF8_MEMORY(op) \
156 (assert(_PyUnicode_CHECK(op)), \
157 (!PyUnicode_IS_COMPACT_ASCII(op) \
158 && _PyUnicode_UTF8(op) \
Victor Stinner829c0ad2011-10-03 01:08:02 +0200159 && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))
160
Victor Stinner03490912011-10-03 23:45:12 +0200161/* true if the Unicode object has an allocated wstr memory block
162 (not shared with other data) */
163#define _PyUnicode_HAS_WSTR_MEMORY(op) \
164 (assert(_PyUnicode_CHECK(op)), \
165 (_PyUnicode_WSTR(op) && \
166 (!PyUnicode_IS_READY(op) || \
167 _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
168
Victor Stinner910337b2011-10-03 03:20:16 +0200169/* Generic helper macro to convert characters of different types.
170 from_type and to_type have to be valid type names, begin and end
171 are pointers to the source characters which should be of type
172 "from_type *". to is a pointer of type "to_type *" and points to the
173 buffer where the result characters are written to. */
174#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
175 do { \
176 const from_type *iter_; to_type *to_; \
177 for (iter_ = (begin), to_ = (to_type *)(to); \
178 iter_ < (end); \
179 ++iter_, ++to_) { \
180 *to_ = (to_type)*iter_; \
181 } \
182 } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200183
Victor Stinnerb15d4d82011-09-28 23:59:20 +0200184/* The Unicode string has been modified: reset the hash */
185#define _PyUnicode_DIRTY(op) do { _PyUnicode_HASH(op) = -1; } while (0)
186
Walter Dörwald16807132007-05-25 13:52:07 +0000187/* This dictionary holds all interned unicode strings. Note that references
188 to strings in this dictionary are *not* counted in the string's ob_refcnt.
189 When the interned string reaches a refcnt of 0 the string deallocation
190 function will delete the reference from this dictionary.
191
   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000194*/
195static PyObject *interned;
196
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000197/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200198static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000199
200/* Single character Unicode strings in the Latin-1 range are being
201 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200202static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000203
Christian Heimes190d79e2008-01-30 11:58:22 +0000204/* Fast detection of the most frequent whitespace characters */
205const unsigned char _Py_ascii_whitespace[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +0000206 0, 0, 0, 0, 0, 0, 0, 0,
Florent Xicluna806d8cf2010-03-30 19:34:18 +0000207/* case 0x0009: * CHARACTER TABULATION */
Christian Heimes1a8501c2008-10-02 19:56:01 +0000208/* case 0x000A: * LINE FEED */
Florent Xicluna806d8cf2010-03-30 19:34:18 +0000209/* case 0x000B: * LINE TABULATION */
Christian Heimes1a8501c2008-10-02 19:56:01 +0000210/* case 0x000C: * FORM FEED */
211/* case 0x000D: * CARRIAGE RETURN */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000212 0, 1, 1, 1, 1, 1, 0, 0,
213 0, 0, 0, 0, 0, 0, 0, 0,
Christian Heimes1a8501c2008-10-02 19:56:01 +0000214/* case 0x001C: * FILE SEPARATOR */
215/* case 0x001D: * GROUP SEPARATOR */
216/* case 0x001E: * RECORD SEPARATOR */
217/* case 0x001F: * UNIT SEPARATOR */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000218 0, 0, 0, 0, 1, 1, 1, 1,
Christian Heimes1a8501c2008-10-02 19:56:01 +0000219/* case 0x0020: * SPACE */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000220 1, 0, 0, 0, 0, 0, 0, 0,
221 0, 0, 0, 0, 0, 0, 0, 0,
222 0, 0, 0, 0, 0, 0, 0, 0,
223 0, 0, 0, 0, 0, 0, 0, 0,
Christian Heimes190d79e2008-01-30 11:58:22 +0000224
Benjamin Peterson14339b62009-01-31 16:36:08 +0000225 0, 0, 0, 0, 0, 0, 0, 0,
226 0, 0, 0, 0, 0, 0, 0, 0,
227 0, 0, 0, 0, 0, 0, 0, 0,
228 0, 0, 0, 0, 0, 0, 0, 0,
229 0, 0, 0, 0, 0, 0, 0, 0,
230 0, 0, 0, 0, 0, 0, 0, 0,
231 0, 0, 0, 0, 0, 0, 0, 0,
232 0, 0, 0, 0, 0, 0, 0, 0
Christian Heimes190d79e2008-01-30 11:58:22 +0000233};
234
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200235/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200236static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200237static PyObject* get_latin1_char(unsigned char ch);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200238
Alexander Belopolsky40018472011-02-26 01:02:56 +0000239static PyObject *
240unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000241 PyObject **errorHandler,const char *encoding, const char *reason,
242 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
243 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
244
Alexander Belopolsky40018472011-02-26 01:02:56 +0000245static void
246raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300247 const char *encoding,
248 const Py_UNICODE *unicode, Py_ssize_t size,
249 Py_ssize_t startpos, Py_ssize_t endpos,
250 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000251
Christian Heimes190d79e2008-01-30 11:58:22 +0000252/* Same for linebreaks */
253static unsigned char ascii_linebreak[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +0000254 0, 0, 0, 0, 0, 0, 0, 0,
Christian Heimes1a8501c2008-10-02 19:56:01 +0000255/* 0x000A, * LINE FEED */
Florent Xicluna806d8cf2010-03-30 19:34:18 +0000256/* 0x000B, * LINE TABULATION */
257/* 0x000C, * FORM FEED */
Christian Heimes1a8501c2008-10-02 19:56:01 +0000258/* 0x000D, * CARRIAGE RETURN */
Florent Xicluna806d8cf2010-03-30 19:34:18 +0000259 0, 0, 1, 1, 1, 1, 0, 0,
Benjamin Peterson14339b62009-01-31 16:36:08 +0000260 0, 0, 0, 0, 0, 0, 0, 0,
Christian Heimes1a8501c2008-10-02 19:56:01 +0000261/* 0x001C, * FILE SEPARATOR */
262/* 0x001D, * GROUP SEPARATOR */
263/* 0x001E, * RECORD SEPARATOR */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000264 0, 0, 0, 0, 1, 1, 1, 0,
265 0, 0, 0, 0, 0, 0, 0, 0,
266 0, 0, 0, 0, 0, 0, 0, 0,
267 0, 0, 0, 0, 0, 0, 0, 0,
268 0, 0, 0, 0, 0, 0, 0, 0,
Christian Heimes190d79e2008-01-30 11:58:22 +0000269
Benjamin Peterson14339b62009-01-31 16:36:08 +0000270 0, 0, 0, 0, 0, 0, 0, 0,
271 0, 0, 0, 0, 0, 0, 0, 0,
272 0, 0, 0, 0, 0, 0, 0, 0,
273 0, 0, 0, 0, 0, 0, 0, 0,
274 0, 0, 0, 0, 0, 0, 0, 0,
275 0, 0, 0, 0, 0, 0, 0, 0,
276 0, 0, 0, 0, 0, 0, 0, 0,
277 0, 0, 0, 0, 0, 0, 0, 0
Christian Heimes190d79e2008-01-30 11:58:22 +0000278};
279
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300280/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
281 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000282Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000283PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000284{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000285#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000286 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000287#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000288 /* This is actually an illegal character, so it should
289 not be passed to unichr. */
290 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000291#endif
292}
293
#ifdef Py_DEBUG
/* Debug-build invariant checker for a Unicode object.

   Reads the state bits (kind, compact, ascii, ready) of op and asserts
   that the data, wstr and utf8 pointers and their recorded lengths are
   coherent with that state.  A violated invariant trips an assert();
   otherwise the function returns 1. */
static int
_PyUnicode_CheckConsistency(void *op)
{
    PyASCIIObject *ascii = (PyASCIIObject *)op;
    PyCompactUnicodeObject *compact;
    unsigned int kind;
    void *data;

    assert(PyUnicode_Check(op));
    kind = ascii->state.kind;

    /* Compact ASCII objects: only the state bits are verified. */
    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
        return 1;
    }

    compact = (PyCompactUnicodeObject *)op;

    if (ascii->state.compact == 1) {
        /* Compact, non-ASCII: the characters start right after the
           compact header. */
        data = compact + 1;
        assert(kind == PyUnicode_1BYTE_KIND
               || kind == PyUnicode_2BYTE_KIND
               || kind == PyUnicode_4BYTE_KIND);
        assert(ascii->state.ascii == 0);
        assert(ascii->state.ready == 1);
        assert(compact->utf8 != data);
    }
    else {
        /* Non-compact: the characters are reached through data.any. */
        data = ((PyUnicodeObject *)op)->data.any;

        if (kind == PyUnicode_WCHAR_KIND) {
            /* Not ready yet: only the wstr buffer exists. */
            assert(ascii->state.compact == 0);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 0);
            assert(ascii->wstr != NULL);
            assert(data == NULL);
            assert(compact->utf8 == NULL);
            assert(ascii->state.interned == SSTATE_NOT_INTERNED);
        }
        else {
            /* Ready, non-compact string. */
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.compact == 0);
            assert(ascii->state.ready == 1);
            assert(data != NULL);
            if (ascii->state.ascii) {
                /* ASCII data doubles as the UTF-8 representation. */
                assert(compact->utf8 == data);
                assert(compact->utf8_length == ascii->length);
            }
            else {
                assert(compact->utf8 != data);
            }
        }
    }

    if (kind != PyUnicode_WCHAR_KIND) {
        /* The kind whose character width equals sizeof(wchar_t). */
#if SIZEOF_WCHAR_T == 2
        const unsigned int wchar_kind = PyUnicode_2BYTE_KIND;
#else
        const unsigned int wchar_kind = PyUnicode_4BYTE_KIND;
#endif
        if (kind == wchar_kind) {
            assert(ascii->wstr == data);
            assert(compact->wstr_length == ascii->length);
        }
        else {
            assert(ascii->wstr != data);
        }
    }

    /* A missing cache must have a recorded length of zero. */
    if (compact->utf8 == NULL) {
        assert(compact->utf8_length == 0);
    }
    if (ascii->wstr == NULL) {
        assert(compact->wstr_length == 0);
    }
    return 1;
}
#else
/* Release builds: no checking is done, always returns 1. */
static int
_PyUnicode_CheckConsistency(void *op)
{
    return 1;
}
#endif
379
Thomas Wouters477c8d52006-05-27 19:21:47 +0000380/* --- Bloom Filters ----------------------------------------------------- */
381
/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the low-order
   bits of each Unicode character (as many as are needed to index
   BLOOM_WIDTH bits) as the bit index. */
385
386/* the linebreak mask is set up by Unicode_Init below */
387
Antoine Pitrouf068f942010-01-13 14:19:12 +0000388#if LONG_BIT >= 128
389#define BLOOM_WIDTH 128
390#elif LONG_BIT >= 64
391#define BLOOM_WIDTH 64
392#elif LONG_BIT >= 32
393#define BLOOM_WIDTH 32
394#else
395#error "LONG_BIT is smaller than 32"
396#endif
397
Thomas Wouters477c8d52006-05-27 19:21:47 +0000398#define BLOOM_MASK unsigned long
399
400static BLOOM_MASK bloom_linebreak;
401
Antoine Pitrouf068f942010-01-13 14:19:12 +0000402#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
403#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000404
Benjamin Peterson29060642009-01-31 22:14:21 +0000405#define BLOOM_LINEBREAK(ch) \
406 ((ch) < 128U ? ascii_linebreak[(ch)] : \
407 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000408
Alexander Belopolsky40018472011-02-26 01:02:56 +0000409Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200410make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000411{
412 /* calculate simple bloom-style bitmask for a given unicode string */
413
Antoine Pitrouf068f942010-01-13 14:19:12 +0000414 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000415 Py_ssize_t i;
416
417 mask = 0;
418 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200419 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000420
421 return mask;
422}
423
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200424#define BLOOM_MEMBER(mask, chr, str) \
425 (BLOOM(mask, chr) \
426 && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000427
Guido van Rossumd57fd912000-03-10 22:53:23 +0000428/* --- Unicode Object ----------------------------------------------------- */
429
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200430static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200431fixup(PyUnicodeObject *self, Py_UCS4 (*fixfct)(PyUnicodeObject *s));
432
433Py_LOCAL_INLINE(char *) findchar(void *s, int kind,
434 Py_ssize_t size, Py_UCS4 ch,
435 int direction)
436{
437 /* like wcschr, but doesn't stop at NULL characters */
438 Py_ssize_t i;
439 if (direction == 1) {
440 for(i = 0; i < size; i++)
441 if (PyUnicode_READ(kind, s, i) == ch)
442 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
443 }
444 else {
445 for(i = size-1; i >= 0; i--)
446 if (PyUnicode_READ(kind, s, i) == ch)
447 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
448 }
449 return NULL;
450}
451
Victor Stinnerfe226c02011-10-03 03:52:20 +0200452static PyObject*
453resize_compact(PyObject *unicode, Py_ssize_t length)
454{
455 Py_ssize_t char_size;
456 Py_ssize_t struct_size;
457 Py_ssize_t new_size;
458 int share_wstr;
459
460 assert(PyUnicode_IS_READY(unicode));
461 char_size = PyUnicode_CHARACTER_SIZE(unicode);
462 if (PyUnicode_IS_COMPACT_ASCII(unicode))
463 struct_size = sizeof(PyASCIIObject);
464 else
465 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200466 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200467
468 _Py_DEC_REFTOTAL;
469 _Py_ForgetReference(unicode);
470
471 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
472 PyErr_NoMemory();
473 return NULL;
474 }
475 new_size = (struct_size + (length + 1) * char_size);
476
477 unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
478 if (unicode == NULL) {
479 PyObject_Del(unicode);
480 PyErr_NoMemory();
481 return NULL;
482 }
483 _Py_NewReference(unicode);
484 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200485 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200486 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200487 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
488 _PyUnicode_WSTR_LENGTH(unicode) = length;
489 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200490 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
491 length, 0);
492 return unicode;
493}
494
Alexander Belopolsky40018472011-02-26 01:02:56 +0000495static int
Victor Stinner95663112011-10-04 01:03:50 +0200496resize_inplace(PyUnicodeObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000497{
Victor Stinner95663112011-10-04 01:03:50 +0200498 wchar_t *wstr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200499 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200500 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000501
Victor Stinner95663112011-10-04 01:03:50 +0200502 _PyUnicode_DIRTY(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200503
504 if (PyUnicode_IS_READY(unicode)) {
505 Py_ssize_t char_size;
506 Py_ssize_t new_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200507 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200508 void *data;
509
510 data = _PyUnicode_DATA_ANY(unicode);
511 assert(data != NULL);
512 char_size = PyUnicode_CHARACTER_SIZE(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200513 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
514 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinner95663112011-10-04 01:03:50 +0200515 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
516 {
517 PyObject_DEL(_PyUnicode_UTF8(unicode));
518 _PyUnicode_UTF8(unicode) = NULL;
519 _PyUnicode_UTF8_LENGTH(unicode) = 0;
520 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200521
522 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
523 PyErr_NoMemory();
524 return -1;
525 }
526 new_size = (length + 1) * char_size;
527
528 data = (PyObject *)PyObject_REALLOC(data, new_size);
529 if (data == NULL) {
530 PyErr_NoMemory();
531 return -1;
532 }
533 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200534 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200535 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200536 _PyUnicode_WSTR_LENGTH(unicode) = length;
537 }
538 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200539 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200540 _PyUnicode_UTF8_LENGTH(unicode) = length;
541 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200542 _PyUnicode_LENGTH(unicode) = length;
543 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinner95663112011-10-04 01:03:50 +0200544 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400545 _PyUnicode_CheckConsistency(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200546 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200547 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200548 }
Victor Stinner95663112011-10-04 01:03:50 +0200549 assert(_PyUnicode_WSTR(unicode) != NULL);
550
551 /* check for integer overflow */
552 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
553 PyErr_NoMemory();
554 return -1;
555 }
556 wstr = _PyUnicode_WSTR(unicode);
557 wstr = PyObject_REALLOC(wstr, sizeof(wchar_t) * (length + 1));
558 if (!wstr) {
559 PyErr_NoMemory();
560 return -1;
561 }
562 _PyUnicode_WSTR(unicode) = wstr;
563 _PyUnicode_WSTR(unicode)[length] = 0;
564 _PyUnicode_WSTR_LENGTH(unicode) = length;
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400565 _PyUnicode_CheckConsistency(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000566 return 0;
567}
568
Victor Stinnerfe226c02011-10-03 03:52:20 +0200569static PyObject*
570resize_copy(PyObject *unicode, Py_ssize_t length)
571{
572 Py_ssize_t copy_length;
573 if (PyUnicode_IS_COMPACT(unicode)) {
574 PyObject *copy;
575 assert(PyUnicode_IS_READY(unicode));
576
577 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
578 if (copy == NULL)
579 return NULL;
580
581 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
582 if (PyUnicode_CopyCharacters(copy, 0,
583 unicode, 0,
584 copy_length) < 0)
585 {
586 Py_DECREF(copy);
587 return NULL;
588 }
589 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200590 }
591 else {
Victor Stinner2fd82272011-10-03 04:06:05 +0200592 PyUnicodeObject *w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200593 assert(_PyUnicode_WSTR(unicode) != NULL);
594 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinner2fd82272011-10-03 04:06:05 +0200595 w = _PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200596 if (w == NULL)
597 return NULL;
598 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
599 copy_length = Py_MIN(copy_length, length);
600 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
601 copy_length);
602 return (PyObject*)w;
603 }
604}
605
/* We allocate one more character to make sure the string is
   U+0000 terminated; some code (e.g. new_identifier)
   relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000609
610 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000611 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000612
613*/
614
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200615#ifdef Py_DEBUG
616int unicode_old_new_calls = 0;
617#endif
618
Alexander Belopolsky40018472011-02-26 01:02:56 +0000619static PyUnicodeObject *
620_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000621{
622 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200623 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000624
Thomas Wouters477c8d52006-05-27 19:21:47 +0000625 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000626 if (length == 0 && unicode_empty != NULL) {
627 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200628 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000629 }
630
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000631 /* Ensure we won't overflow the size. */
632 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
633 return (PyUnicodeObject *)PyErr_NoMemory();
634 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200635 if (length < 0) {
636 PyErr_SetString(PyExc_SystemError,
637 "Negative size passed to _PyUnicode_New");
638 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000639 }
640
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200641#ifdef Py_DEBUG
642 ++unicode_old_new_calls;
643#endif
644
645 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
646 if (unicode == NULL)
647 return NULL;
648 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
649 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
650 if (!_PyUnicode_WSTR(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000651 PyErr_NoMemory();
652 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000653 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200654
Jeremy Hyltond8082792003-09-16 19:41:39 +0000655 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000656 * the caller fails before initializing str -- unicode_resize()
657 * reads str[0], and the Keep-Alive optimization can keep memory
658 * allocated for str alive across a call to unicode_dealloc(unicode).
659 * We don't want unicode_resize to read uninitialized memory in
660 * that case.
661 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200662 _PyUnicode_WSTR(unicode)[0] = 0;
663 _PyUnicode_WSTR(unicode)[length] = 0;
664 _PyUnicode_WSTR_LENGTH(unicode) = length;
665 _PyUnicode_HASH(unicode) = -1;
666 _PyUnicode_STATE(unicode).interned = 0;
667 _PyUnicode_STATE(unicode).kind = 0;
668 _PyUnicode_STATE(unicode).compact = 0;
669 _PyUnicode_STATE(unicode).ready = 0;
670 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200671 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200672 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200673 _PyUnicode_UTF8(unicode) = NULL;
674 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000675 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000676
Benjamin Peterson29060642009-01-31 22:14:21 +0000677 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000678 /* XXX UNREF/NEWREF interface should be more symmetrical */
679 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000680 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000681 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000682 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000683}
684
Victor Stinnerf42dc442011-10-02 23:33:16 +0200685static const char*
686unicode_kind_name(PyObject *unicode)
687{
Victor Stinner42dfd712011-10-03 14:41:45 +0200688 /* don't check consistency: unicode_kind_name() is called from
689 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200690 if (!PyUnicode_IS_COMPACT(unicode))
691 {
692 if (!PyUnicode_IS_READY(unicode))
693 return "wstr";
694 switch(PyUnicode_KIND(unicode))
695 {
696 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200697 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200698 return "legacy ascii";
699 else
700 return "legacy latin1";
701 case PyUnicode_2BYTE_KIND:
702 return "legacy UCS2";
703 case PyUnicode_4BYTE_KIND:
704 return "legacy UCS4";
705 default:
706 return "<legacy invalid kind>";
707 }
708 }
709 assert(PyUnicode_IS_READY(unicode));
710 switch(PyUnicode_KIND(unicode))
711 {
712 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200713 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200714 return "ascii";
715 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200716 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200717 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200718 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200719 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200720 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200721 default:
722 return "<invalid compact kind>";
723 }
724}
725
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200726#ifdef Py_DEBUG
/* Debug-build counter: incremented by PyUnicode_New() each time a call gets
   past the shared empty-string shortcut. */
727int unicode_new_new_calls = 0;
728
729/* Functions wrapping macros for use in debugger */
/* Debugger helper: callable wrapper around the PyUnicode_UTF8() macro. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
733
/* Debugger helper: callable wrapper around the _PyUnicode_COMPACT_DATA()
   macro. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *compact_data = _PyUnicode_COMPACT_DATA(unicode);
    return compact_data;
}
737void *_PyUnicode_data(void *unicode){
738 printf("obj %p\n", unicode);
739 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
740 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
741 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
742 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
743 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
744 return PyUnicode_DATA(unicode);
745}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200746
747void
748_PyUnicode_Dump(PyObject *op)
749{
750 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200751 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
752 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
753 void *data;
754 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
755 if (ascii->state.compact)
756 data = (compact + 1);
757 else
758 data = unicode->data.any;
759 if (ascii->wstr == data)
760 printf("shared ");
761 printf("wstr=%p", ascii->wstr);
Victor Stinnera3b334d2011-10-03 13:53:37 +0200762 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +0200763 printf(" (%zu), ", compact->wstr_length);
764 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
765 printf("shared ");
766 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200767 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200768 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200769}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200770#endif
771
772PyObject *
773PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
774{
775 PyObject *obj;
776 PyCompactUnicodeObject *unicode;
777 void *data;
778 int kind_state;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200779 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200780 Py_ssize_t char_size;
781 Py_ssize_t struct_size;
782
783 /* Optimization for empty strings */
784 if (size == 0 && unicode_empty != NULL) {
785 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200786 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200787 }
788
789#ifdef Py_DEBUG
790 ++unicode_new_new_calls;
791#endif
792
Victor Stinner9e9d6892011-10-04 01:02:02 +0200793 is_ascii = 0;
794 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200795 struct_size = sizeof(PyCompactUnicodeObject);
796 if (maxchar < 128) {
797 kind_state = PyUnicode_1BYTE_KIND;
798 char_size = 1;
799 is_ascii = 1;
800 struct_size = sizeof(PyASCIIObject);
801 }
802 else if (maxchar < 256) {
803 kind_state = PyUnicode_1BYTE_KIND;
804 char_size = 1;
805 }
806 else if (maxchar < 65536) {
807 kind_state = PyUnicode_2BYTE_KIND;
808 char_size = 2;
809 if (sizeof(wchar_t) == 2)
810 is_sharing = 1;
811 }
812 else {
813 kind_state = PyUnicode_4BYTE_KIND;
814 char_size = 4;
815 if (sizeof(wchar_t) == 4)
816 is_sharing = 1;
817 }
818
819 /* Ensure we won't overflow the size. */
820 if (size < 0) {
821 PyErr_SetString(PyExc_SystemError,
822 "Negative size passed to PyUnicode_New");
823 return NULL;
824 }
825 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
826 return PyErr_NoMemory();
827
828 /* Duplicated allocation code from _PyObject_New() instead of a call to
829 * PyObject_New() so we are able to allocate space for the object and
830 * it's data buffer.
831 */
832 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
833 if (obj == NULL)
834 return PyErr_NoMemory();
835 obj = PyObject_INIT(obj, &PyUnicode_Type);
836 if (obj == NULL)
837 return NULL;
838
839 unicode = (PyCompactUnicodeObject *)obj;
840 if (is_ascii)
841 data = ((PyASCIIObject*)obj) + 1;
842 else
843 data = unicode + 1;
844 _PyUnicode_LENGTH(unicode) = size;
845 _PyUnicode_HASH(unicode) = -1;
846 _PyUnicode_STATE(unicode).interned = 0;
847 _PyUnicode_STATE(unicode).kind = kind_state;
848 _PyUnicode_STATE(unicode).compact = 1;
849 _PyUnicode_STATE(unicode).ready = 1;
850 _PyUnicode_STATE(unicode).ascii = is_ascii;
851 if (is_ascii) {
852 ((char*)data)[size] = 0;
853 _PyUnicode_WSTR(unicode) = NULL;
854 }
855 else if (kind_state == PyUnicode_1BYTE_KIND) {
856 ((char*)data)[size] = 0;
857 _PyUnicode_WSTR(unicode) = NULL;
858 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200859 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200860 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200861 }
862 else {
863 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200864 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200865 if (kind_state == PyUnicode_2BYTE_KIND)
866 ((Py_UCS2*)data)[size] = 0;
867 else /* kind_state == PyUnicode_4BYTE_KIND */
868 ((Py_UCS4*)data)[size] = 0;
869 if (is_sharing) {
870 _PyUnicode_WSTR_LENGTH(unicode) = size;
871 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
872 }
873 else {
874 _PyUnicode_WSTR_LENGTH(unicode) = 0;
875 _PyUnicode_WSTR(unicode) = NULL;
876 }
877 }
878 return obj;
879}
880
#if SIZEOF_WCHAR_T == 2
/* Helper function to convert a 16-bit wchar_t representation to UCS4: a
   high surrogate immediately followed by a low surrogate is decoded into a
   single code point, every other unit (lone surrogates included) is copied
   unchanged. The other conversions are implemented as macros for efficiency.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyUnicodeObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        int is_pair;

        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        is_pair = (src[0] >= 0xD800 && src[0] <= 0xDBFF
                   && (src + 1) < end
                   && src[1] >= 0xDC00 && src[1] <= 0xDFFF);
        if (!is_pair) {
            *dst++ = *src++;
            continue;
        }
        *dst++ = (((src[0] & 0x3FF)<<10) | (src[1] & 0x3FF)) + 0x10000;
        src += 2;
    }
    /* The output must have been filled exactly. */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
919
Victor Stinnercd9950f2011-10-02 00:34:53 +0200920static int
921_PyUnicode_Dirty(PyObject *unicode)
922{
Victor Stinner910337b2011-10-03 03:20:16 +0200923 assert(_PyUnicode_CHECK(unicode));
Victor Stinnercd9950f2011-10-02 00:34:53 +0200924 if (Py_REFCNT(unicode) != 1) {
Victor Stinner01698042011-10-04 00:04:26 +0200925 PyErr_SetString(PyExc_SystemError,
Victor Stinnercd9950f2011-10-02 00:34:53 +0200926 "Cannot modify a string having more than 1 reference");
927 return -1;
928 }
929 _PyUnicode_DIRTY(unicode);
930 return 0;
931}
932
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200933Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200934PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
935 PyObject *from, Py_ssize_t from_start,
936 Py_ssize_t how_many)
937{
Victor Stinnera0702ab2011-09-29 14:14:38 +0200938 unsigned int from_kind, to_kind;
939 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200940
Victor Stinnerb1536152011-09-30 02:26:10 +0200941 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
942 PyErr_BadInternalCall();
943 return -1;
944 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200945
946 if (PyUnicode_READY(from))
947 return -1;
948 if (PyUnicode_READY(to))
949 return -1;
950
Victor Stinnerff9e50f2011-09-28 22:17:19 +0200951 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200952 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
Victor Stinner01698042011-10-04 00:04:26 +0200953 PyErr_Format(PyExc_SystemError,
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200954 "Cannot write %zi characters at %zi "
955 "in a string of %zi characters",
956 how_many, to_start, PyUnicode_GET_LENGTH(to));
957 return -1;
958 }
Victor Stinnerf5ca1a22011-09-28 23:54:59 +0200959 if (how_many == 0)
960 return 0;
961
Victor Stinnercd9950f2011-10-02 00:34:53 +0200962 if (_PyUnicode_Dirty(to))
Victor Stinnerf5ca1a22011-09-28 23:54:59 +0200963 return -1;
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200964
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200965 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200966 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200967 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200968 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200969
Victor Stinnerf42dc442011-10-02 23:33:16 +0200970 if (from_kind == to_kind
971 /* deny latin1 => ascii */
972 && PyUnicode_MAX_CHAR_VALUE(to) >= PyUnicode_MAX_CHAR_VALUE(from))
973 {
Victor Stinnera0702ab2011-09-29 14:14:38 +0200974 Py_MEMCPY((char*)to_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200975 + PyUnicode_KIND_SIZE(to_kind, to_start),
Victor Stinnera0702ab2011-09-29 14:14:38 +0200976 (char*)from_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200977 + PyUnicode_KIND_SIZE(from_kind, from_start),
978 PyUnicode_KIND_SIZE(to_kind, how_many));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200979 }
Victor Stinnera0702ab2011-09-29 14:14:38 +0200980 else if (from_kind == PyUnicode_1BYTE_KIND
981 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200982 {
983 _PyUnicode_CONVERT_BYTES(
984 Py_UCS1, Py_UCS2,
985 PyUnicode_1BYTE_DATA(from) + from_start,
986 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
987 PyUnicode_2BYTE_DATA(to) + to_start
988 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200989 }
Victor Stinner157f83f2011-09-28 21:41:31 +0200990 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200991 && to_kind == PyUnicode_4BYTE_KIND)
992 {
993 _PyUnicode_CONVERT_BYTES(
994 Py_UCS1, Py_UCS4,
995 PyUnicode_1BYTE_DATA(from) + from_start,
996 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
997 PyUnicode_4BYTE_DATA(to) + to_start
998 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200999 }
1000 else if (from_kind == PyUnicode_2BYTE_KIND
1001 && to_kind == PyUnicode_4BYTE_KIND)
1002 {
1003 _PyUnicode_CONVERT_BYTES(
1004 Py_UCS2, Py_UCS4,
1005 PyUnicode_2BYTE_DATA(from) + from_start,
1006 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1007 PyUnicode_4BYTE_DATA(to) + to_start
1008 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001009 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001010 else {
1011 int invalid_kinds;
Victor Stinnerf42dc442011-10-02 23:33:16 +02001012
1013 /* check if max_char(from substring) <= max_char(to) */
1014 if (from_kind > to_kind
1015 /* latin1 => ascii */
Victor Stinnera3b334d2011-10-03 13:53:37 +02001016 || (PyUnicode_IS_ASCII(to)
Victor Stinnerf42dc442011-10-02 23:33:16 +02001017 && to_kind == PyUnicode_1BYTE_KIND
Victor Stinnera3b334d2011-10-03 13:53:37 +02001018 && !PyUnicode_IS_ASCII(from)))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001019 {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001020 /* slow path to check for character overflow */
1021 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1022 Py_UCS4 ch, maxchar;
1023 Py_ssize_t i;
1024
1025 maxchar = 0;
1026 invalid_kinds = 0;
1027 for (i=0; i < how_many; i++) {
1028 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1029 if (ch > maxchar) {
1030 maxchar = ch;
1031 if (maxchar > to_maxchar) {
1032 invalid_kinds = 1;
1033 break;
1034 }
1035 }
1036 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1037 }
1038 }
1039 else
1040 invalid_kinds = 1;
1041 if (invalid_kinds) {
Victor Stinner01698042011-10-04 00:04:26 +02001042 PyErr_Format(PyExc_SystemError,
Victor Stinnerf42dc442011-10-02 23:33:16 +02001043 "Cannot copy %s characters "
1044 "into a string of %s characters",
1045 unicode_kind_name(from),
1046 unicode_kind_name(to));
Victor Stinnera0702ab2011-09-29 14:14:38 +02001047 return -1;
1048 }
1049 }
1050 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001051}
1052
Victor Stinner17222162011-09-28 22:15:37 +02001053/* Find the maximum code point and count the number of surrogate pairs so a
1054 correct string length can be computed before converting a string to UCS4.
1055 This function counts single surrogates as a character and not as a pair.
1056
1057 Return 0 on success, or -1 on error. */
1058static int
1059find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1060 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001061{
1062 const wchar_t *iter;
1063
Victor Stinnerc53be962011-10-02 21:33:54 +02001064 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001065 if (num_surrogates == NULL || maxchar == NULL) {
1066 PyErr_SetString(PyExc_SystemError,
1067 "unexpected NULL arguments to "
1068 "PyUnicode_FindMaxCharAndNumSurrogatePairs");
1069 return -1;
1070 }
1071
1072 *num_surrogates = 0;
1073 *maxchar = 0;
1074
1075 for (iter = begin; iter < end; ) {
1076 if (*iter > *maxchar)
1077 *maxchar = *iter;
1078#if SIZEOF_WCHAR_T == 2
1079 if (*iter >= 0xD800 && *iter <= 0xDBFF
1080 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
1081 {
1082 Py_UCS4 surrogate_val;
1083 surrogate_val = (((iter[0] & 0x3FF)<<10)
1084 | (iter[1] & 0x3FF)) + 0x10000;
1085 ++(*num_surrogates);
1086 if (surrogate_val > *maxchar)
1087 *maxchar = surrogate_val;
1088 iter += 2;
1089 }
1090 else
1091 iter++;
1092#else
1093 iter++;
1094#endif
1095 }
1096 return 0;
1097}
1098
1099#ifdef Py_DEBUG
/* Debug-build counter: incremented on every call to unicode_ready(). */
1100int unicode_ready_calls = 0;
1101#endif
1102
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001103static int
1104unicode_ready(PyObject **p_obj, int replace)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001105{
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001106 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001107 wchar_t *end;
1108 Py_UCS4 maxchar = 0;
1109 Py_ssize_t num_surrogates;
1110#if SIZEOF_WCHAR_T == 2
1111 Py_ssize_t length_wo_surrogates;
1112#endif
1113
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001114 assert(p_obj != NULL);
1115 unicode = (PyUnicodeObject *)*p_obj;
1116
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001117 /* _PyUnicode_Ready() is only intented for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001118 strings were created using _PyObject_New() and where no canonical
1119 representation (the str field) has been set yet aka strings
1120 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001121 assert(_PyUnicode_CHECK(unicode));
1122 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001123 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001124 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001125 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001126 /* Actually, it should neither be interned nor be anything else: */
1127 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001128
1129#ifdef Py_DEBUG
1130 ++unicode_ready_calls;
1131#endif
1132
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001133#ifdef Py_DEBUG
1134 assert(!replace || Py_REFCNT(unicode) == 1);
1135#else
1136 if (replace && Py_REFCNT(unicode) != 1)
1137 replace = 0;
1138#endif
1139 if (replace) {
1140 Py_ssize_t len = _PyUnicode_WSTR_LENGTH(unicode);
1141 wchar_t *wstr = _PyUnicode_WSTR(unicode);
1142 /* Optimization for empty strings */
1143 if (len == 0) {
1144 Py_INCREF(unicode_empty);
1145 Py_DECREF(*p_obj);
1146 *p_obj = unicode_empty;
1147 return 0;
1148 }
1149 if (len == 1 && wstr[0] < 256) {
1150 PyObject *latin1_char = get_latin1_char((unsigned char)wstr[0]);
1151 if (latin1_char == NULL)
1152 return -1;
1153 Py_DECREF(*p_obj);
1154 *p_obj = latin1_char;
1155 return 0;
1156 }
1157 }
1158
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001159 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001160 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001161 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001162 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001163
1164 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001165 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1166 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001167 PyErr_NoMemory();
1168 return -1;
1169 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001170 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001171 _PyUnicode_WSTR(unicode), end,
1172 PyUnicode_1BYTE_DATA(unicode));
1173 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1174 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1175 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1176 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001177 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001178 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001179 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001180 }
1181 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001182 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001183 _PyUnicode_UTF8(unicode) = NULL;
1184 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001185 }
1186 PyObject_FREE(_PyUnicode_WSTR(unicode));
1187 _PyUnicode_WSTR(unicode) = NULL;
1188 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1189 }
1190 /* In this case we might have to convert down from 4-byte native
1191 wchar_t to 2-byte unicode. */
1192 else if (maxchar < 65536) {
1193 assert(num_surrogates == 0 &&
1194 "FindMaxCharAndNumSurrogatePairs() messed up");
1195
Victor Stinner506f5922011-09-28 22:34:18 +02001196#if SIZEOF_WCHAR_T == 2
1197 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001198 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001199 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1200 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1201 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001202 _PyUnicode_UTF8(unicode) = NULL;
1203 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001204#else
1205 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001206 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001207 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001208 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001209 PyErr_NoMemory();
1210 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001211 }
Victor Stinner506f5922011-09-28 22:34:18 +02001212 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1213 _PyUnicode_WSTR(unicode), end,
1214 PyUnicode_2BYTE_DATA(unicode));
1215 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1216 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1217 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001218 _PyUnicode_UTF8(unicode) = NULL;
1219 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001220 PyObject_FREE(_PyUnicode_WSTR(unicode));
1221 _PyUnicode_WSTR(unicode) = NULL;
1222 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1223#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001224 }
1225 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1226 else {
1227#if SIZEOF_WCHAR_T == 2
1228 /* in case the native representation is 2-bytes, we need to allocate a
1229 new normalized 4-byte version. */
1230 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001231 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1232 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001233 PyErr_NoMemory();
1234 return -1;
1235 }
1236 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1237 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001238 _PyUnicode_UTF8(unicode) = NULL;
1239 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001240 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1241 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001242 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001243 PyObject_FREE(_PyUnicode_WSTR(unicode));
1244 _PyUnicode_WSTR(unicode) = NULL;
1245 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1246#else
1247 assert(num_surrogates == 0);
1248
Victor Stinnerc3c74152011-10-02 20:39:55 +02001249 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001250 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001251 _PyUnicode_UTF8(unicode) = NULL;
1252 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001253 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1254#endif
1255 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1256 }
1257 _PyUnicode_STATE(unicode).ready = 1;
1258 return 0;
1259}
1260
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001261int
1262_PyUnicode_ReadyReplace(PyObject **op)
1263{
1264 return unicode_ready(op, 1);
1265}
1266
1267int
1268_PyUnicode_Ready(PyObject *op)
1269{
1270 return unicode_ready(&op, 0);
1271}
1272
Alexander Belopolsky40018472011-02-26 01:02:56 +00001273static void
1274unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001275{
Walter Dörwald16807132007-05-25 13:52:07 +00001276 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001277 case SSTATE_NOT_INTERNED:
1278 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001279
Benjamin Peterson29060642009-01-31 22:14:21 +00001280 case SSTATE_INTERNED_MORTAL:
1281 /* revive dead object temporarily for DelItem */
1282 Py_REFCNT(unicode) = 3;
1283 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
1284 Py_FatalError(
1285 "deletion of interned string failed");
1286 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001287
Benjamin Peterson29060642009-01-31 22:14:21 +00001288 case SSTATE_INTERNED_IMMORTAL:
1289 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001290
Benjamin Peterson29060642009-01-31 22:14:21 +00001291 default:
1292 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001293 }
1294
Victor Stinner03490912011-10-03 23:45:12 +02001295 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001296 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001297 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001298 PyObject_DEL(_PyUnicode_UTF8(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001299
1300 if (PyUnicode_IS_COMPACT(unicode)) {
1301 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001302 }
1303 else {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001304 if (_PyUnicode_DATA_ANY(unicode))
1305 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Benjamin Peterson29060642009-01-31 22:14:21 +00001306 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001307 }
1308}
1309
Alexander Belopolsky40018472011-02-26 01:02:56 +00001310static int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001311unicode_resizable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001312{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001313 if (Py_REFCNT(unicode) != 1)
1314 return 0;
1315 if (PyUnicode_CHECK_INTERNED(unicode))
1316 return 0;
Benjamin Peterson7f3140e2011-10-03 19:37:29 -04001317 assert(unicode != unicode_empty);
Victor Stinner77bb47b2011-10-03 20:06:05 +02001318#ifdef Py_DEBUG
1319 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND
1320 && PyUnicode_GET_LENGTH(unicode) == 1)
1321 {
1322 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001323 if (ch < 256 && unicode_latin1[ch] == unicode)
1324 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001325 }
Victor Stinner77bb47b2011-10-03 20:06:05 +02001326#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001327 return 1;
1328}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001329
Victor Stinnerfe226c02011-10-03 03:52:20 +02001330static int
1331unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1332{
1333 PyObject *unicode;
1334 Py_ssize_t old_length;
1335
1336 assert(p_unicode != NULL);
1337 unicode = *p_unicode;
1338
1339 assert(unicode != NULL);
1340 assert(PyUnicode_Check(unicode));
1341 assert(0 <= length);
1342
Victor Stinner910337b2011-10-03 03:20:16 +02001343 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001344 old_length = PyUnicode_WSTR_LENGTH(unicode);
1345 else
1346 old_length = PyUnicode_GET_LENGTH(unicode);
1347 if (old_length == length)
1348 return 0;
1349
Victor Stinnerfe226c02011-10-03 03:52:20 +02001350 if (!unicode_resizable(unicode)) {
1351 PyObject *copy = resize_copy(unicode, length);
1352 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001353 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001354 Py_DECREF(*p_unicode);
1355 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001356 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001357 }
1358
Victor Stinnerfe226c02011-10-03 03:52:20 +02001359 if (PyUnicode_IS_COMPACT(unicode)) {
1360 *p_unicode = resize_compact(unicode, length);
1361 if (*p_unicode == NULL)
1362 return -1;
Benjamin Petersonccc51c12011-10-03 19:34:12 -04001363 _PyUnicode_CheckConsistency(*p_unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001364 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001365 }
1366 return resize_inplace((PyUnicodeObject*)unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001367}
1368
Alexander Belopolsky40018472011-02-26 01:02:56 +00001369int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001370PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001371{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001372 PyObject *unicode;
1373 if (p_unicode == NULL) {
1374 PyErr_BadInternalCall();
1375 return -1;
1376 }
1377 unicode = *p_unicode;
1378 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0
1379 || _PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND)
1380 {
1381 PyErr_BadInternalCall();
1382 return -1;
1383 }
1384 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001385}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001386
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001387static PyObject*
1388get_latin1_char(unsigned char ch)
1389{
Victor Stinnera464fc12011-10-02 20:39:30 +02001390 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001391 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001392 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001393 if (!unicode)
1394 return NULL;
1395 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
1396 unicode_latin1[ch] = unicode;
1397 }
1398 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001399 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001400}
1401
Alexander Belopolsky40018472011-02-26 01:02:56 +00001402PyObject *
1403PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001404{
1405 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001406 Py_UCS4 maxchar = 0;
1407 Py_ssize_t num_surrogates;
1408
1409 if (u == NULL)
1410 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001411
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001412 /* If the Unicode data is known at construction time, we can apply
1413 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001414
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001415 /* Optimization for empty strings */
1416 if (size == 0 && unicode_empty != NULL) {
1417 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001418 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001419 }
Tim Petersced69f82003-09-16 20:30:58 +00001420
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001421 /* Single character Unicode objects in the Latin-1 range are
1422 shared when using this constructor */
1423 if (size == 1 && *u < 256)
1424 return get_latin1_char((unsigned char)*u);
1425
1426 /* If not empty and not single character, copy the Unicode data
1427 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001428 if (find_maxchar_surrogates(u, u + size,
1429 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001430 return NULL;
1431
1432 unicode = (PyUnicodeObject *) PyUnicode_New(size - num_surrogates,
1433 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001434 if (!unicode)
1435 return NULL;
1436
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001437 switch (PyUnicode_KIND(unicode)) {
1438 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001439 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001440 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1441 break;
1442 case PyUnicode_2BYTE_KIND:
1443#if Py_UNICODE_SIZE == 2
1444 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1445#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001446 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001447 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1448#endif
1449 break;
1450 case PyUnicode_4BYTE_KIND:
1451#if SIZEOF_WCHAR_T == 2
1452 /* This is the only case which has to process surrogates, thus
1453 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001454 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001455#else
1456 assert(num_surrogates == 0);
1457 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1458#endif
1459 break;
1460 default:
1461 assert(0 && "Impossible state");
1462 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001463
1464 return (PyObject *)unicode;
1465}
1466
Alexander Belopolsky40018472011-02-26 01:02:56 +00001467PyObject *
1468PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001469{
1470 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +00001471
Benjamin Peterson14339b62009-01-31 16:36:08 +00001472 if (size < 0) {
1473 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001474 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001475 return NULL;
1476 }
Christian Heimes33fe8092008-04-13 13:53:33 +00001477
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001478 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +00001479 some optimizations which share commonly used objects.
1480 Also, this means the input must be UTF-8, so fall back to the
1481 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001482 if (u != NULL) {
1483
Benjamin Peterson29060642009-01-31 22:14:21 +00001484 /* Optimization for empty strings */
1485 if (size == 0 && unicode_empty != NULL) {
1486 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001487 return unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001488 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001489
1490 /* Single characters are shared when using this constructor.
1491 Restrict to ASCII, since the input must be UTF-8. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001492 if (size == 1 && Py_CHARMASK(*u) < 128)
1493 return get_latin1_char(Py_CHARMASK(*u));
Martin v. Löwis9c121062007-08-05 20:26:11 +00001494
1495 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001496 }
1497
Walter Dörwald55507312007-05-18 13:12:10 +00001498 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001499 if (!unicode)
1500 return NULL;
1501
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001502 return (PyObject *)unicode;
1503}
1504
Alexander Belopolsky40018472011-02-26 01:02:56 +00001505PyObject *
1506PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001507{
1508 size_t size = strlen(u);
1509 if (size > PY_SSIZE_T_MAX) {
1510 PyErr_SetString(PyExc_OverflowError, "input too long");
1511 return NULL;
1512 }
1513
1514 return PyUnicode_FromStringAndSize(u, size);
1515}
1516
Victor Stinnere57b1c02011-09-28 22:20:48 +02001517static PyObject*
1518_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001519{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001520 PyObject *res;
1521 unsigned char max = 127;
1522 Py_ssize_t i;
1523 for (i = 0; i < size; i++) {
1524 if (u[i] & 0x80) {
1525 max = 255;
1526 break;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001527 }
1528 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001529 res = PyUnicode_New(size, max);
1530 if (!res)
1531 return NULL;
1532 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
1533 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001534}
1535
Victor Stinnere57b1c02011-09-28 22:20:48 +02001536static PyObject*
1537_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001538{
1539 PyObject *res;
1540 Py_UCS2 max = 0;
1541 Py_ssize_t i;
1542 for (i = 0; i < size; i++)
1543 if (u[i] > max)
1544 max = u[i];
1545 res = PyUnicode_New(size, max);
1546 if (!res)
1547 return NULL;
1548 if (max >= 256)
1549 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
1550 else
1551 for (i = 0; i < size; i++)
1552 PyUnicode_1BYTE_DATA(res)[i] = (Py_UCS1)u[i];
1553 return res;
1554}
1555
Victor Stinnere57b1c02011-09-28 22:20:48 +02001556static PyObject*
1557_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001558{
1559 PyObject *res;
1560 Py_UCS4 max = 0;
1561 Py_ssize_t i;
1562 for (i = 0; i < size; i++)
1563 if (u[i] > max)
1564 max = u[i];
1565 res = PyUnicode_New(size, max);
1566 if (!res)
1567 return NULL;
1568 if (max >= 0x10000)
1569 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
1570 else {
1571 int kind = PyUnicode_KIND(res);
1572 void *data = PyUnicode_DATA(res);
1573 for (i = 0; i < size; i++)
1574 PyUnicode_WRITE(kind, data, i, u[i]);
1575 }
1576 return res;
1577}
1578
1579PyObject*
1580PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1581{
1582 switch(kind) {
1583 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001584 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001585 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001586 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001587 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001588 return _PyUnicode_FromUCS4(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001589 }
Victor Stinner01698042011-10-04 00:04:26 +02001590 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001591 return NULL;
1592}
1593
Victor Stinner034f6cf2011-09-30 02:26:44 +02001594PyObject*
1595PyUnicode_Copy(PyObject *unicode)
1596{
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001597 Py_ssize_t size;
1598 PyObject *copy;
1599 void *data;
1600
Victor Stinner034f6cf2011-09-30 02:26:44 +02001601 if (!PyUnicode_Check(unicode)) {
1602 PyErr_BadInternalCall();
1603 return NULL;
1604 }
1605 if (PyUnicode_READY(unicode))
1606 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001607
1608 size = PyUnicode_GET_LENGTH(unicode);
1609 copy = PyUnicode_New(size, PyUnicode_MAX_CHAR_VALUE(unicode));
1610 if (!copy)
1611 return NULL;
1612 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1613
1614 data = PyUnicode_DATA(unicode);
1615 switch (PyUnicode_KIND(unicode))
1616 {
1617 case PyUnicode_1BYTE_KIND:
1618 memcpy(PyUnicode_1BYTE_DATA(copy), data, size);
1619 break;
1620 case PyUnicode_2BYTE_KIND:
1621 memcpy(PyUnicode_2BYTE_DATA(copy), data, sizeof(Py_UCS2) * size);
1622 break;
1623 case PyUnicode_4BYTE_KIND:
1624 memcpy(PyUnicode_4BYTE_DATA(copy), data, sizeof(Py_UCS4) * size);
1625 break;
1626 default:
1627 assert(0);
1628 break;
1629 }
1630 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02001631}
1632
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001633
Victor Stinnerbc603d12011-10-02 01:00:40 +02001634/* Widen Unicode objects to larger buffers. Don't write terminating null
1635 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001636
1637void*
1638_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1639{
Victor Stinnerbc603d12011-10-02 01:00:40 +02001640 Py_ssize_t len;
1641 void *result;
1642 unsigned int skind;
1643
1644 if (PyUnicode_READY(s))
1645 return NULL;
1646
1647 len = PyUnicode_GET_LENGTH(s);
1648 skind = PyUnicode_KIND(s);
1649 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02001650 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001651 return NULL;
1652 }
1653 switch(kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02001654 case PyUnicode_2BYTE_KIND:
1655 result = PyMem_Malloc(len * sizeof(Py_UCS2));
1656 if (!result)
1657 return PyErr_NoMemory();
1658 assert(skind == PyUnicode_1BYTE_KIND);
1659 _PyUnicode_CONVERT_BYTES(
1660 Py_UCS1, Py_UCS2,
1661 PyUnicode_1BYTE_DATA(s),
1662 PyUnicode_1BYTE_DATA(s) + len,
1663 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001664 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001665 case PyUnicode_4BYTE_KIND:
1666 result = PyMem_Malloc(len * sizeof(Py_UCS4));
1667 if (!result)
1668 return PyErr_NoMemory();
1669 if (skind == PyUnicode_2BYTE_KIND) {
1670 _PyUnicode_CONVERT_BYTES(
1671 Py_UCS2, Py_UCS4,
1672 PyUnicode_2BYTE_DATA(s),
1673 PyUnicode_2BYTE_DATA(s) + len,
1674 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001675 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02001676 else {
1677 assert(skind == PyUnicode_1BYTE_KIND);
1678 _PyUnicode_CONVERT_BYTES(
1679 Py_UCS1, Py_UCS4,
1680 PyUnicode_1BYTE_DATA(s),
1681 PyUnicode_1BYTE_DATA(s) + len,
1682 result);
1683 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001684 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001685 default:
1686 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001687 }
Victor Stinner01698042011-10-04 00:04:26 +02001688 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001689 return NULL;
1690}
1691
1692static Py_UCS4*
1693as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1694 int copy_null)
1695{
1696 int kind;
1697 void *data;
1698 Py_ssize_t len, targetlen;
1699 if (PyUnicode_READY(string) == -1)
1700 return NULL;
1701 kind = PyUnicode_KIND(string);
1702 data = PyUnicode_DATA(string);
1703 len = PyUnicode_GET_LENGTH(string);
1704 targetlen = len;
1705 if (copy_null)
1706 targetlen++;
1707 if (!target) {
1708 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
1709 PyErr_NoMemory();
1710 return NULL;
1711 }
1712 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
1713 if (!target) {
1714 PyErr_NoMemory();
1715 return NULL;
1716 }
1717 }
1718 else {
1719 if (targetsize < targetlen) {
1720 PyErr_Format(PyExc_SystemError,
1721 "string is longer than the buffer");
1722 if (copy_null && 0 < targetsize)
1723 target[0] = 0;
1724 return NULL;
1725 }
1726 }
1727 if (kind != PyUnicode_4BYTE_KIND) {
1728 Py_ssize_t i;
1729 for (i = 0; i < len; i++)
1730 target[i] = PyUnicode_READ(kind, data, i);
1731 }
1732 else
1733 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
1734 if (copy_null)
1735 target[len] = 0;
1736 return target;
1737}
1738
1739Py_UCS4*
1740PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1741 int copy_null)
1742{
1743 if (target == NULL || targetsize < 1) {
1744 PyErr_BadInternalCall();
1745 return NULL;
1746 }
1747 return as_ucs4(string, target, targetsize, copy_null);
1748}
1749
1750Py_UCS4*
1751PyUnicode_AsUCS4Copy(PyObject *string)
1752{
1753 return as_ucs4(string, NULL, 0, 1);
1754}
1755
#ifdef HAVE_WCHAR_H

/* Create a Unicode object from the wide-character buffer `w`.

   w    -- source buffer.  NULL is accepted only together with size == 0,
           in which case PyUnicode_New(0, 0) is returned; any other size
           with a NULL buffer is a bad internal call.
   size -- number of wchar_t units, or -1 to measure `w` with wcslen().

   Returns a new reference, or NULL with an exception set. */
PyObject *
PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
{
    if (w != NULL) {
        Py_ssize_t length = (size == -1) ? (Py_ssize_t)wcslen(w) : size;

        return PyUnicode_FromUnicode(w, length);
    }

    if (size != 0) {
        PyErr_BadInternalCall();
        return NULL;
    }
    return PyUnicode_New(0, 0);
}

#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00001776
Walter Dörwald346737f2007-05-31 10:44:43 +00001777static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001778makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
1779 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00001780{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001781 *fmt++ = '%';
1782 if (width) {
1783 if (zeropad)
1784 *fmt++ = '0';
1785 fmt += sprintf(fmt, "%d", width);
1786 }
1787 if (precision)
1788 fmt += sprintf(fmt, ".%d", precision);
1789 if (longflag)
1790 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001791 else if (longlongflag) {
1792 /* longlongflag should only ever be nonzero on machines with
1793 HAVE_LONG_LONG defined */
1794#ifdef HAVE_LONG_LONG
1795 char *f = PY_FORMAT_LONG_LONG;
1796 while (*f)
1797 *fmt++ = *f++;
1798#else
1799 /* we shouldn't ever get here */
1800 assert(0);
1801 *fmt++ = 'l';
1802#endif
1803 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001804 else if (size_tflag) {
1805 char *f = PY_FORMAT_SIZE_T;
1806 while (*f)
1807 *fmt++ = *f++;
1808 }
1809 *fmt++ = c;
1810 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00001811}
1812
Victor Stinner96865452011-03-01 23:44:09 +00001813/* helper for PyUnicode_FromFormatV() */
1814
1815static const char*
1816parse_format_flags(const char *f,
1817 int *p_width, int *p_precision,
1818 int *p_longflag, int *p_longlongflag, int *p_size_tflag)
1819{
1820 int width, precision, longflag, longlongflag, size_tflag;
1821
1822 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
1823 f++;
1824 width = 0;
1825 while (Py_ISDIGIT((unsigned)*f))
1826 width = (width*10) + *f++ - '0';
1827 precision = 0;
1828 if (*f == '.') {
1829 f++;
1830 while (Py_ISDIGIT((unsigned)*f))
1831 precision = (precision*10) + *f++ - '0';
1832 if (*f == '%') {
1833 /* "%.3%s" => f points to "3" */
1834 f--;
1835 }
1836 }
1837 if (*f == '\0') {
1838 /* bogus format "%.1" => go backward, f points to "1" */
1839 f--;
1840 }
1841 if (p_width != NULL)
1842 *p_width = width;
1843 if (p_precision != NULL)
1844 *p_precision = precision;
1845
1846 /* Handle %ld, %lu, %lld and %llu. */
1847 longflag = 0;
1848 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00001849 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00001850
1851 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00001852 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00001853 longflag = 1;
1854 ++f;
1855 }
1856#ifdef HAVE_LONG_LONG
1857 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00001858 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00001859 longlongflag = 1;
1860 f += 2;
1861 }
1862#endif
1863 }
1864 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00001865 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00001866 size_tflag = 1;
1867 ++f;
1868 }
1869 if (p_longflag != NULL)
1870 *p_longflag = longflag;
1871 if (p_longlongflag != NULL)
1872 *p_longlongflag = longlongflag;
1873 if (p_size_tflag != NULL)
1874 *p_size_tflag = size_tflag;
1875 return f;
1876}
1877
/* Upper bound on the number of characters produced by %ld: 21 characters
   are enough for a 64-bit integer in decimal plus an optional sign. */
#define MAX_LONG_CHARS 21
/* Upper bound on the number of characters produced by %lld.
   At most ceil(log10(256) * SIZEOF_LONG_LONG) digits are needed, plus 1
   for the sign; 53/22 is used as an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
1885
Walter Dörwaldd2034312007-05-18 16:29:38 +00001886PyObject *
1887PyUnicode_FromFormatV(const char *format, va_list vargs)
1888{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001889 va_list count;
1890 Py_ssize_t callcount = 0;
1891 PyObject **callresults = NULL;
1892 PyObject **callresult = NULL;
1893 Py_ssize_t n = 0;
1894 int width = 0;
1895 int precision = 0;
1896 int zeropad;
1897 const char* f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001898 PyUnicodeObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001899 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001900 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001901 Py_UCS4 maxchar = 127; /* result is ASCII by default */
1902 Py_UCS4 argmaxchar;
1903 Py_ssize_t numbersize = 0;
1904 char *numberresults = NULL;
1905 char *numberresult = NULL;
1906 Py_ssize_t i;
1907 int kind;
1908 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001909
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001910 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001911 /* step 1: count the number of %S/%R/%A/%s format specifications
1912 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
1913 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001914 * result in an array)
1915 * also esimate a upper bound for all the number formats in the string,
1916 * numbers will be formated in step 3 and be keept in a '\0'-separated
1917 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00001918 for (f = format; *f; f++) {
1919 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00001920 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001921 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
1922 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
1923 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
1924 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001925
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001926 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001927#ifdef HAVE_LONG_LONG
1928 if (longlongflag) {
1929 if (width < MAX_LONG_LONG_CHARS)
1930 width = MAX_LONG_LONG_CHARS;
1931 }
1932 else
1933#endif
1934 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
1935 including sign. Decimal takes the most space. This
1936 isn't enough for octal. If a width is specified we
1937 need more (which we allocate later). */
1938 if (width < MAX_LONG_CHARS)
1939 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001940
1941 /* account for the size + '\0' to separate numbers
1942 inside of the numberresults buffer */
1943 numbersize += (width + 1);
1944 }
1945 }
1946 else if ((unsigned char)*f > 127) {
1947 PyErr_Format(PyExc_ValueError,
1948 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
1949 "string, got a non-ASCII byte: 0x%02x",
1950 (unsigned char)*f);
1951 return NULL;
1952 }
1953 }
1954 /* step 2: allocate memory for the results of
1955 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
1956 if (callcount) {
1957 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
1958 if (!callresults) {
1959 PyErr_NoMemory();
1960 return NULL;
1961 }
1962 callresult = callresults;
1963 }
1964 /* step 2.5: allocate memory for the results of formating numbers */
1965 if (numbersize) {
1966 numberresults = PyObject_Malloc(numbersize);
1967 if (!numberresults) {
1968 PyErr_NoMemory();
1969 goto fail;
1970 }
1971 numberresult = numberresults;
1972 }
1973
1974 /* step 3: format numbers and figure out how large a buffer we need */
1975 for (f = format; *f; f++) {
1976 if (*f == '%') {
1977 const char* p;
1978 int longflag;
1979 int longlongflag;
1980 int size_tflag;
1981 int numprinted;
1982
1983 p = f;
1984 zeropad = (f[1] == '0');
1985 f = parse_format_flags(f, &width, &precision,
1986 &longflag, &longlongflag, &size_tflag);
1987 switch (*f) {
1988 case 'c':
1989 {
1990 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001991 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001992 n++;
1993 break;
1994 }
1995 case '%':
1996 n++;
1997 break;
1998 case 'i':
1999 case 'd':
2000 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2001 width, precision, *f);
2002 if (longflag)
2003 numprinted = sprintf(numberresult, fmt,
2004 va_arg(count, long));
2005#ifdef HAVE_LONG_LONG
2006 else if (longlongflag)
2007 numprinted = sprintf(numberresult, fmt,
2008 va_arg(count, PY_LONG_LONG));
2009#endif
2010 else if (size_tflag)
2011 numprinted = sprintf(numberresult, fmt,
2012 va_arg(count, Py_ssize_t));
2013 else
2014 numprinted = sprintf(numberresult, fmt,
2015 va_arg(count, int));
2016 n += numprinted;
2017 /* advance by +1 to skip over the '\0' */
2018 numberresult += (numprinted + 1);
2019 assert(*(numberresult - 1) == '\0');
2020 assert(*(numberresult - 2) != '\0');
2021 assert(numprinted >= 0);
2022 assert(numberresult <= numberresults + numbersize);
2023 break;
2024 case 'u':
2025 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2026 width, precision, 'u');
2027 if (longflag)
2028 numprinted = sprintf(numberresult, fmt,
2029 va_arg(count, unsigned long));
2030#ifdef HAVE_LONG_LONG
2031 else if (longlongflag)
2032 numprinted = sprintf(numberresult, fmt,
2033 va_arg(count, unsigned PY_LONG_LONG));
2034#endif
2035 else if (size_tflag)
2036 numprinted = sprintf(numberresult, fmt,
2037 va_arg(count, size_t));
2038 else
2039 numprinted = sprintf(numberresult, fmt,
2040 va_arg(count, unsigned int));
2041 n += numprinted;
2042 numberresult += (numprinted + 1);
2043 assert(*(numberresult - 1) == '\0');
2044 assert(*(numberresult - 2) != '\0');
2045 assert(numprinted >= 0);
2046 assert(numberresult <= numberresults + numbersize);
2047 break;
2048 case 'x':
2049 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2050 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2051 n += numprinted;
2052 numberresult += (numprinted + 1);
2053 assert(*(numberresult - 1) == '\0');
2054 assert(*(numberresult - 2) != '\0');
2055 assert(numprinted >= 0);
2056 assert(numberresult <= numberresults + numbersize);
2057 break;
2058 case 'p':
2059 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2060 /* %p is ill-defined: ensure leading 0x. */
2061 if (numberresult[1] == 'X')
2062 numberresult[1] = 'x';
2063 else if (numberresult[1] != 'x') {
2064 memmove(numberresult + 2, numberresult,
2065 strlen(numberresult) + 1);
2066 numberresult[0] = '0';
2067 numberresult[1] = 'x';
2068 numprinted += 2;
2069 }
2070 n += numprinted;
2071 numberresult += (numprinted + 1);
2072 assert(*(numberresult - 1) == '\0');
2073 assert(*(numberresult - 2) != '\0');
2074 assert(numprinted >= 0);
2075 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002076 break;
2077 case 's':
2078 {
2079 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002080 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002081 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
2082 if (!str)
2083 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002084 /* since PyUnicode_DecodeUTF8 returns already flexible
2085 unicode objects, there is no need to call ready on them */
2086 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002087 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002088 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002089 /* Remember the str and switch to the next slot */
2090 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002091 break;
2092 }
2093 case 'U':
2094 {
2095 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002096 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002097 if (PyUnicode_READY(obj) == -1)
2098 goto fail;
2099 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002100 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002101 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002102 break;
2103 }
2104 case 'V':
2105 {
2106 PyObject *obj = va_arg(count, PyObject *);
2107 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002108 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002109 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002110 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002111 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002112 if (PyUnicode_READY(obj) == -1)
2113 goto fail;
2114 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002115 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002116 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002117 *callresult++ = NULL;
2118 }
2119 else {
2120 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
2121 if (!str_obj)
2122 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002123 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002124 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002125 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002126 *callresult++ = str_obj;
2127 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002128 break;
2129 }
2130 case 'S':
2131 {
2132 PyObject *obj = va_arg(count, PyObject *);
2133 PyObject *str;
2134 assert(obj);
2135 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002136 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002137 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002138 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002139 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002140 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002141 /* Remember the str and switch to the next slot */
2142 *callresult++ = str;
2143 break;
2144 }
2145 case 'R':
2146 {
2147 PyObject *obj = va_arg(count, PyObject *);
2148 PyObject *repr;
2149 assert(obj);
2150 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002151 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002152 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002153 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002154 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002155 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002156 /* Remember the repr and switch to the next slot */
2157 *callresult++ = repr;
2158 break;
2159 }
2160 case 'A':
2161 {
2162 PyObject *obj = va_arg(count, PyObject *);
2163 PyObject *ascii;
2164 assert(obj);
2165 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002166 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002167 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002168 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002169 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002170 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002171 /* Remember the repr and switch to the next slot */
2172 *callresult++ = ascii;
2173 break;
2174 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002175 default:
2176 /* if we stumble upon an unknown
2177 formatting code, copy the rest of
2178 the format string to the output
2179 string. (we cannot just skip the
2180 code, since there's no way to know
2181 what's in the argument list) */
2182 n += strlen(p);
2183 goto expand;
2184 }
2185 } else
2186 n++;
2187 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002188 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002189 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002190 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002191 we don't have to resize the string.
2192 There can be no errors beyond this point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002193 string = (PyUnicodeObject *)PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002194 if (!string)
2195 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002196 kind = PyUnicode_KIND(string);
2197 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002198 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002199 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002200
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002201 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002202 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002203 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002204
2205 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002206 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2207 /* checking for == because the last argument could be a empty
2208 string, which causes i to point to end, the assert at the end of
2209 the loop */
2210 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002211
Benjamin Peterson14339b62009-01-31 16:36:08 +00002212 switch (*f) {
2213 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002214 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002215 const int ordinal = va_arg(vargs, int);
2216 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002217 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002218 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002219 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002220 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002221 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002222 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002223 case 'p':
2224 /* unused, since we already have the result */
2225 if (*f == 'p')
2226 (void) va_arg(vargs, void *);
2227 else
2228 (void) va_arg(vargs, int);
2229 /* extract the result from numberresults and append. */
2230 for (; *numberresult; ++i, ++numberresult)
2231 PyUnicode_WRITE(kind, data, i, *numberresult);
2232 /* skip over the separating '\0' */
2233 assert(*numberresult == '\0');
2234 numberresult++;
2235 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002236 break;
2237 case 's':
2238 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002239 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002240 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002241 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002242 size = PyUnicode_GET_LENGTH(*callresult);
2243 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002244 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2245 *callresult, 0,
2246 size) < 0)
2247 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002248 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002249 /* We're done with the unicode()/repr() => forget it */
2250 Py_DECREF(*callresult);
2251 /* switch to next unicode()/repr() result */
2252 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002253 break;
2254 }
2255 case 'U':
2256 {
2257 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002258 Py_ssize_t size;
2259 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2260 size = PyUnicode_GET_LENGTH(obj);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002261 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2262 obj, 0,
2263 size) < 0)
2264 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002265 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002266 break;
2267 }
2268 case 'V':
2269 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002270 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002271 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002272 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002273 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002274 size = PyUnicode_GET_LENGTH(obj);
2275 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002276 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2277 obj, 0,
2278 size) < 0)
2279 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002280 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002281 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002282 size = PyUnicode_GET_LENGTH(*callresult);
2283 assert(PyUnicode_KIND(*callresult) <=
2284 PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002285 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2286 *callresult,
2287 0, size) < 0)
2288 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002289 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002290 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002291 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002292 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002293 break;
2294 }
2295 case 'S':
2296 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002297 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002298 {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002299 /* unused, since we already have the result */
2300 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002301 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002302 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2303 *callresult, 0,
2304 PyUnicode_GET_LENGTH(*callresult)) < 0)
2305 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002306 i += PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002307 /* We're done with the unicode()/repr() => forget it */
2308 Py_DECREF(*callresult);
2309 /* switch to next unicode()/repr() result */
2310 ++callresult;
2311 break;
2312 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002313 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002314 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002315 break;
2316 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002317 for (; *p; ++p, ++i)
2318 PyUnicode_WRITE(kind, data, i, *p);
2319 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002320 goto end;
2321 }
Victor Stinner1205f272010-09-11 00:54:47 +00002322 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002323 else {
2324 assert(i < PyUnicode_GET_LENGTH(string));
2325 PyUnicode_WRITE(kind, data, i++, *f);
2326 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002327 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002328 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002329
Benjamin Peterson29060642009-01-31 22:14:21 +00002330 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002331 if (callresults)
2332 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002333 if (numberresults)
2334 PyObject_Free(numberresults);
2335 return (PyObject *)string;
Benjamin Peterson29060642009-01-31 22:14:21 +00002336 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002337 if (callresults) {
2338 PyObject **callresult2 = callresults;
2339 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002340 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002341 ++callresult2;
2342 }
2343 PyObject_Free(callresults);
2344 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002345 if (numberresults)
2346 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002347 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002348}
2349
Walter Dörwaldd2034312007-05-18 16:29:38 +00002350PyObject *
2351PyUnicode_FromFormat(const char *format, ...)
2352{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002353 PyObject* ret;
2354 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002355
2356#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002357 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002358#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002359 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002360#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002361 ret = PyUnicode_FromFormatV(format, vargs);
2362 va_end(vargs);
2363 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002364}
2365
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002366#ifdef HAVE_WCHAR_H
2367
Victor Stinner5593d8a2010-10-02 11:11:27 +00002368/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2369 convert a Unicode object to a wide character string.
2370
Victor Stinnerd88d9832011-09-06 02:00:05 +02002371 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002372 character) required to convert the unicode object. Ignore size argument.
2373
Victor Stinnerd88d9832011-09-06 02:00:05 +02002374 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002375 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002376 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002377static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00002378unicode_aswidechar(PyUnicodeObject *unicode,
2379 wchar_t *w,
2380 Py_ssize_t size)
2381{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002382 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002383 const wchar_t *wstr;
2384
2385 wstr = PyUnicode_AsUnicodeAndSize((PyObject *)unicode, &res);
2386 if (wstr == NULL)
2387 return -1;
2388
Victor Stinner5593d8a2010-10-02 11:11:27 +00002389 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002390 if (size > res)
2391 size = res + 1;
2392 else
2393 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002394 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002395 return res;
2396 }
2397 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002398 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002399}
2400
2401Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002402PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002403 wchar_t *w,
2404 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002405{
2406 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002407 PyErr_BadInternalCall();
2408 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002409 }
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002410 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002411}
2412
Victor Stinner137c34c2010-09-29 10:25:54 +00002413wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002414PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002415 Py_ssize_t *size)
2416{
2417 wchar_t* buffer;
2418 Py_ssize_t buflen;
2419
2420 if (unicode == NULL) {
2421 PyErr_BadInternalCall();
2422 return NULL;
2423 }
2424
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002425 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002426 if (buflen == -1)
2427 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002428 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002429 PyErr_NoMemory();
2430 return NULL;
2431 }
2432
Victor Stinner137c34c2010-09-29 10:25:54 +00002433 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2434 if (buffer == NULL) {
2435 PyErr_NoMemory();
2436 return NULL;
2437 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002438 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002439 if (buflen == -1)
2440 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002441 if (size != NULL)
2442 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002443 return buffer;
2444}
2445
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002446#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002447
Alexander Belopolsky40018472011-02-26 01:02:56 +00002448PyObject *
2449PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002450{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002451 PyObject *v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002452 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002453 PyErr_SetString(PyExc_ValueError,
2454 "chr() arg not in range(0x110000)");
2455 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002456 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002457
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002458 if (ordinal < 256)
2459 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002460
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002461 v = PyUnicode_New(1, ordinal);
2462 if (v == NULL)
2463 return NULL;
2464 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
2465 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002466}
2467
Alexander Belopolsky40018472011-02-26 01:02:56 +00002468PyObject *
2469PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002470{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002471 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002472 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002473 if (PyUnicode_CheckExact(obj)) {
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002474 if (PyUnicode_READY(obj))
2475 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002476 Py_INCREF(obj);
2477 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002478 }
2479 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002480 /* For a Unicode subtype that's not a Unicode object,
2481 return a true Unicode object with the same data. */
Victor Stinner2219e0a2011-10-01 01:16:59 +02002482 return PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002483 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002484 PyErr_Format(PyExc_TypeError,
2485 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002486 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002487 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002488}
2489
Alexander Belopolsky40018472011-02-26 01:02:56 +00002490PyObject *
2491PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002492 const char *encoding,
2493 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002494{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002495 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002496 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002497
Guido van Rossumd57fd912000-03-10 22:53:23 +00002498 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002499 PyErr_BadInternalCall();
2500 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002501 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002502
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002503 /* Decoding bytes objects is the most common case and should be fast */
2504 if (PyBytes_Check(obj)) {
2505 if (PyBytes_GET_SIZE(obj) == 0) {
2506 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002507 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002508 }
2509 else {
2510 v = PyUnicode_Decode(
2511 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2512 encoding, errors);
2513 }
2514 return v;
2515 }
2516
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002517 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002518 PyErr_SetString(PyExc_TypeError,
2519 "decoding str is not supported");
2520 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002521 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002522
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002523 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2524 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2525 PyErr_Format(PyExc_TypeError,
2526 "coercing to str: need bytes, bytearray "
2527 "or buffer-like object, %.80s found",
2528 Py_TYPE(obj)->tp_name);
2529 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002530 }
Tim Petersced69f82003-09-16 20:30:58 +00002531
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002532 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002533 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002534 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002535 }
Tim Petersced69f82003-09-16 20:30:58 +00002536 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002537 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002538
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002539 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002540 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002541}
2542
/* Copy encoding into lower, folding upper-case letters to lower case and
   turning '_' into '-', so that e.g. UTF_8 is recognised.

   lower_len is the capacity of lower, terminating null included.
   Returns 1 on success and 0 when encoding is longer than lower_len-1
   characters (lower is left unterminated in that case). */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src = encoding;
    char *dst = lower;
    /* Last usable slot, reserved for the terminating null. */
    char *const limit = lower + (lower_len - 1);

    for (; *src != '\0'; src++) {
        if (dst == limit)
            return 0;

        if (Py_ISUPPER(*src))
            *dst = Py_TOLOWER(*src);
        else if (*src == '_')
            *dst = '-';
        else
            *dst = *src;
        dst++;
    }
    *dst = '\0';
    return 1;
}
2575
Alexander Belopolsky40018472011-02-26 01:02:56 +00002576PyObject *
2577PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002578 Py_ssize_t size,
2579 const char *encoding,
2580 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002581{
2582 PyObject *buffer = NULL, *unicode;
2583 Py_buffer info;
2584 char lower[11]; /* Enough for any encoding shortcut */
2585
2586 if (encoding == NULL)
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002587 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00002588
2589 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002590 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002591 if ((strcmp(lower, "utf-8") == 0) ||
2592 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002593 return PyUnicode_DecodeUTF8(s, size, errors);
2594 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002595 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002596 (strcmp(lower, "iso-8859-1") == 0))
2597 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002598#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002599 else if (strcmp(lower, "mbcs") == 0)
2600 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002601#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002602 else if (strcmp(lower, "ascii") == 0)
2603 return PyUnicode_DecodeASCII(s, size, errors);
2604 else if (strcmp(lower, "utf-16") == 0)
2605 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2606 else if (strcmp(lower, "utf-32") == 0)
2607 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2608 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002609
2610 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002611 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002612 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002613 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002614 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002615 if (buffer == NULL)
2616 goto onError;
2617 unicode = PyCodec_Decode(buffer, encoding, errors);
2618 if (unicode == NULL)
2619 goto onError;
2620 if (!PyUnicode_Check(unicode)) {
2621 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002622 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002623 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002624 Py_DECREF(unicode);
2625 goto onError;
2626 }
2627 Py_DECREF(buffer);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02002628 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002629 Py_DECREF(unicode);
2630 return NULL;
2631 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002632 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002633
Benjamin Peterson29060642009-01-31 22:14:21 +00002634 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002635 Py_XDECREF(buffer);
2636 return NULL;
2637}
2638
Alexander Belopolsky40018472011-02-26 01:02:56 +00002639PyObject *
2640PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002641 const char *encoding,
2642 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002643{
2644 PyObject *v;
2645
2646 if (!PyUnicode_Check(unicode)) {
2647 PyErr_BadArgument();
2648 goto onError;
2649 }
2650
2651 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002652 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002653
2654 /* Decode via the codec registry */
2655 v = PyCodec_Decode(unicode, encoding, errors);
2656 if (v == NULL)
2657 goto onError;
2658 return v;
2659
Benjamin Peterson29060642009-01-31 22:14:21 +00002660 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002661 return NULL;
2662}
2663
Alexander Belopolsky40018472011-02-26 01:02:56 +00002664PyObject *
2665PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002666 const char *encoding,
2667 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002668{
2669 PyObject *v;
2670
2671 if (!PyUnicode_Check(unicode)) {
2672 PyErr_BadArgument();
2673 goto onError;
2674 }
2675
2676 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002677 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002678
2679 /* Decode via the codec registry */
2680 v = PyCodec_Decode(unicode, encoding, errors);
2681 if (v == NULL)
2682 goto onError;
2683 if (!PyUnicode_Check(v)) {
2684 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002685 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002686 Py_TYPE(v)->tp_name);
2687 Py_DECREF(v);
2688 goto onError;
2689 }
2690 return v;
2691
Benjamin Peterson29060642009-01-31 22:14:21 +00002692 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002693 return NULL;
2694}
2695
Alexander Belopolsky40018472011-02-26 01:02:56 +00002696PyObject *
2697PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002698 Py_ssize_t size,
2699 const char *encoding,
2700 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002701{
2702 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002703
Guido van Rossumd57fd912000-03-10 22:53:23 +00002704 unicode = PyUnicode_FromUnicode(s, size);
2705 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002706 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002707 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
2708 Py_DECREF(unicode);
2709 return v;
2710}
2711
Alexander Belopolsky40018472011-02-26 01:02:56 +00002712PyObject *
2713PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002714 const char *encoding,
2715 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002716{
2717 PyObject *v;
2718
2719 if (!PyUnicode_Check(unicode)) {
2720 PyErr_BadArgument();
2721 goto onError;
2722 }
2723
2724 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002725 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002726
2727 /* Encode via the codec registry */
2728 v = PyCodec_Encode(unicode, encoding, errors);
2729 if (v == NULL)
2730 goto onError;
2731 return v;
2732
Benjamin Peterson29060642009-01-31 22:14:21 +00002733 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002734 return NULL;
2735}
2736
Victor Stinnerad158722010-10-27 00:25:46 +00002737PyObject *
2738PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00002739{
Victor Stinner99b95382011-07-04 14:23:54 +02002740#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002741 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2742 PyUnicode_GET_SIZE(unicode),
2743 NULL);
2744#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002745 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00002746#else
Victor Stinner793b5312011-04-27 00:24:21 +02002747 PyInterpreterState *interp = PyThreadState_GET()->interp;
2748 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2749 cannot use it to encode and decode filenames before it is loaded. Load
2750 the Python codec requires to encode at least its own filename. Use the C
2751 version of the locale codec until the codec registry is initialized and
2752 the Python codec is loaded.
2753
2754 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2755 cannot only rely on it: check also interp->fscodec_initialized for
2756 subinterpreters. */
2757 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00002758 return PyUnicode_AsEncodedString(unicode,
2759 Py_FileSystemDefaultEncoding,
2760 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00002761 }
2762 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002763 /* locale encoding with surrogateescape */
2764 wchar_t *wchar;
2765 char *bytes;
2766 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00002767 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002768
2769 wchar = PyUnicode_AsWideCharString(unicode, NULL);
2770 if (wchar == NULL)
2771 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002772 bytes = _Py_wchar2char(wchar, &error_pos);
2773 if (bytes == NULL) {
2774 if (error_pos != (size_t)-1) {
2775 char *errmsg = strerror(errno);
2776 PyObject *exc = NULL;
2777 if (errmsg == NULL)
2778 errmsg = "Py_wchar2char() failed";
2779 raise_encode_exception(&exc,
2780 "filesystemencoding",
2781 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
2782 error_pos, error_pos+1,
2783 errmsg);
2784 Py_XDECREF(exc);
2785 }
2786 else
2787 PyErr_NoMemory();
2788 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002789 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002790 }
2791 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002792
2793 bytes_obj = PyBytes_FromString(bytes);
2794 PyMem_Free(bytes);
2795 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00002796 }
Victor Stinnerad158722010-10-27 00:25:46 +00002797#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00002798}
2799
Alexander Belopolsky40018472011-02-26 01:02:56 +00002800PyObject *
2801PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002802 const char *encoding,
2803 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002804{
2805 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00002806 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00002807
Guido van Rossumd57fd912000-03-10 22:53:23 +00002808 if (!PyUnicode_Check(unicode)) {
2809 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002810 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002811 }
Fred Drakee4315f52000-05-09 19:53:39 +00002812
Victor Stinner2f283c22011-03-02 01:21:46 +00002813 if (encoding == NULL) {
2814 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002815 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002816 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002817 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner2f283c22011-03-02 01:21:46 +00002818 }
Fred Drakee4315f52000-05-09 19:53:39 +00002819
2820 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002821 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002822 if ((strcmp(lower, "utf-8") == 0) ||
2823 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00002824 {
Victor Stinner2f283c22011-03-02 01:21:46 +00002825 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002826 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002827 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002828 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00002829 }
Victor Stinner37296e82010-06-10 13:36:23 +00002830 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002831 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002832 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002833 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002834#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002835 else if (strcmp(lower, "mbcs") == 0)
2836 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2837 PyUnicode_GET_SIZE(unicode),
2838 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002839#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002840 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002841 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00002842 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002843
2844 /* Encode via the codec registry */
2845 v = PyCodec_Encode(unicode, encoding, errors);
2846 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002847 return NULL;
2848
2849 /* The normal path */
2850 if (PyBytes_Check(v))
2851 return v;
2852
2853 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002854 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002855 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002856 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002857
2858 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
2859 "encoder %s returned bytearray instead of bytes",
2860 encoding);
2861 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002862 Py_DECREF(v);
2863 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002864 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002865
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002866 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
2867 Py_DECREF(v);
2868 return b;
2869 }
2870
2871 PyErr_Format(PyExc_TypeError,
2872 "encoder did not return a bytes object (type=%.400s)",
2873 Py_TYPE(v)->tp_name);
2874 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002875 return NULL;
2876}
2877
Alexander Belopolsky40018472011-02-26 01:02:56 +00002878PyObject *
2879PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002880 const char *encoding,
2881 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002882{
2883 PyObject *v;
2884
2885 if (!PyUnicode_Check(unicode)) {
2886 PyErr_BadArgument();
2887 goto onError;
2888 }
2889
2890 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002891 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002892
2893 /* Encode via the codec registry */
2894 v = PyCodec_Encode(unicode, encoding, errors);
2895 if (v == NULL)
2896 goto onError;
2897 if (!PyUnicode_Check(v)) {
2898 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002899 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002900 Py_TYPE(v)->tp_name);
2901 Py_DECREF(v);
2902 goto onError;
2903 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002904 return v;
Tim Petersced69f82003-09-16 20:30:58 +00002905
Benjamin Peterson29060642009-01-31 22:14:21 +00002906 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002907 return NULL;
2908}
2909
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002910PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00002911PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002912 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00002913 return PyUnicode_DecodeFSDefaultAndSize(s, size);
2914}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002915
Christian Heimes5894ba72007-11-04 11:43:14 +00002916PyObject*
2917PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
2918{
Victor Stinner99b95382011-07-04 14:23:54 +02002919#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002920 return PyUnicode_DecodeMBCS(s, size, NULL);
2921#elif defined(__APPLE__)
2922 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
2923#else
Victor Stinner793b5312011-04-27 00:24:21 +02002924 PyInterpreterState *interp = PyThreadState_GET()->interp;
2925 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2926 cannot use it to encode and decode filenames before it is loaded. Load
2927 the Python codec requires to encode at least its own filename. Use the C
2928 version of the locale codec until the codec registry is initialized and
2929 the Python codec is loaded.
2930
2931 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2932 cannot only rely on it: check also interp->fscodec_initialized for
2933 subinterpreters. */
2934 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002935 return PyUnicode_Decode(s, size,
2936 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00002937 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002938 }
2939 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002940 /* locale encoding with surrogateescape */
2941 wchar_t *wchar;
2942 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00002943 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002944
2945 if (s[size] != '\0' || size != strlen(s)) {
2946 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
2947 return NULL;
2948 }
2949
Victor Stinner168e1172010-10-16 23:16:16 +00002950 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002951 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00002952 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002953
Victor Stinner168e1172010-10-16 23:16:16 +00002954 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002955 PyMem_Free(wchar);
2956 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002957 }
Victor Stinnerad158722010-10-27 00:25:46 +00002958#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002959}
2960
Martin v. Löwis011e8422009-05-05 04:43:17 +00002961
2962int
2963PyUnicode_FSConverter(PyObject* arg, void* addr)
2964{
2965 PyObject *output = NULL;
2966 Py_ssize_t size;
2967 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00002968 if (arg == NULL) {
2969 Py_DECREF(*(PyObject**)addr);
2970 return 1;
2971 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00002972 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00002973 output = arg;
2974 Py_INCREF(output);
2975 }
2976 else {
2977 arg = PyUnicode_FromObject(arg);
2978 if (!arg)
2979 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00002980 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00002981 Py_DECREF(arg);
2982 if (!output)
2983 return 0;
2984 if (!PyBytes_Check(output)) {
2985 Py_DECREF(output);
2986 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
2987 return 0;
2988 }
2989 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00002990 size = PyBytes_GET_SIZE(output);
2991 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00002992 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05002993 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00002994 Py_DECREF(output);
2995 return 0;
2996 }
2997 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00002998 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00002999}
3000
3001
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003002int
3003PyUnicode_FSDecoder(PyObject* arg, void* addr)
3004{
3005 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003006 if (arg == NULL) {
3007 Py_DECREF(*(PyObject**)addr);
3008 return 1;
3009 }
3010 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003011 if (PyUnicode_READY(arg))
3012 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003013 output = arg;
3014 Py_INCREF(output);
3015 }
3016 else {
3017 arg = PyBytes_FromObject(arg);
3018 if (!arg)
3019 return 0;
3020 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3021 PyBytes_GET_SIZE(arg));
3022 Py_DECREF(arg);
3023 if (!output)
3024 return 0;
3025 if (!PyUnicode_Check(output)) {
3026 Py_DECREF(output);
3027 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3028 return 0;
3029 }
3030 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003031 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
3032 PyUnicode_GET_LENGTH(output), 0, 1)) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003033 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3034 Py_DECREF(output);
3035 return 0;
3036 }
3037 *(PyObject**)addr = output;
3038 return Py_CLEANUP_SUPPORTED;
3039}
3040
3041
Martin v. Löwis5b222132007-06-10 09:51:05 +00003042char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003043PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003044{
Christian Heimesf3863112007-11-22 07:46:41 +00003045 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003046 PyUnicodeObject *u = (PyUnicodeObject *)unicode;
3047
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003048 if (!PyUnicode_Check(unicode)) {
3049 PyErr_BadArgument();
3050 return NULL;
3051 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003052 if (PyUnicode_READY(u) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003053 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003054
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003055 if (PyUnicode_UTF8(unicode) == NULL) {
3056 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003057 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3058 if (bytes == NULL)
3059 return NULL;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003060 _PyUnicode_UTF8(u) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3061 if (_PyUnicode_UTF8(u) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003062 Py_DECREF(bytes);
3063 return NULL;
3064 }
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003065 _PyUnicode_UTF8_LENGTH(u) = PyBytes_GET_SIZE(bytes);
3066 Py_MEMCPY(_PyUnicode_UTF8(u), PyBytes_AS_STRING(bytes), _PyUnicode_UTF8_LENGTH(u) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003067 Py_DECREF(bytes);
3068 }
3069
3070 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003071 *psize = PyUnicode_UTF8_LENGTH(unicode);
3072 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003073}
3074
3075char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003076PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003077{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003078 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3079}
3080
3081#ifdef Py_DEBUG
3082int unicode_as_unicode_calls = 0;
3083#endif
3084
3085
3086Py_UNICODE *
3087PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3088{
3089 PyUnicodeObject *u;
3090 const unsigned char *one_byte;
3091#if SIZEOF_WCHAR_T == 4
3092 const Py_UCS2 *two_bytes;
3093#else
3094 const Py_UCS4 *four_bytes;
3095 const Py_UCS4 *ucs4_end;
3096 Py_ssize_t num_surrogates;
3097#endif
3098 wchar_t *w;
3099 wchar_t *wchar_end;
3100
3101 if (!PyUnicode_Check(unicode)) {
3102 PyErr_BadArgument();
3103 return NULL;
3104 }
3105 u = (PyUnicodeObject*)unicode;
3106 if (_PyUnicode_WSTR(u) == NULL) {
3107 /* Non-ASCII compact unicode object */
3108 assert(_PyUnicode_KIND(u) != 0);
3109 assert(PyUnicode_IS_READY(u));
3110
3111#ifdef Py_DEBUG
3112 ++unicode_as_unicode_calls;
3113#endif
3114
3115 if (PyUnicode_KIND(u) == PyUnicode_4BYTE_KIND) {
3116#if SIZEOF_WCHAR_T == 2
3117 four_bytes = PyUnicode_4BYTE_DATA(u);
3118 ucs4_end = four_bytes + _PyUnicode_LENGTH(u);
3119 num_surrogates = 0;
3120
3121 for (; four_bytes < ucs4_end; ++four_bytes) {
3122 if (*four_bytes > 0xFFFF)
3123 ++num_surrogates;
3124 }
3125
3126 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(
3127 sizeof(wchar_t) * (_PyUnicode_LENGTH(u) + 1 + num_surrogates));
3128 if (!_PyUnicode_WSTR(u)) {
3129 PyErr_NoMemory();
3130 return NULL;
3131 }
3132 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u) + num_surrogates;
3133
3134 w = _PyUnicode_WSTR(u);
3135 wchar_end = w + _PyUnicode_WSTR_LENGTH(u);
3136 four_bytes = PyUnicode_4BYTE_DATA(u);
3137 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3138 if (*four_bytes > 0xFFFF) {
3139 /* encode surrogate pair in this case */
3140 *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10);
3141 *w = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF);
3142 }
3143 else
3144 *w = *four_bytes;
3145
3146 if (w > wchar_end) {
3147 assert(0 && "Miscalculated string end");
3148 }
3149 }
3150 *w = 0;
3151#else
3152 /* sizeof(wchar_t) == 4 */
3153 Py_FatalError("Impossible unicode object state, wstr and str "
3154 "should share memory already.");
3155 return NULL;
3156#endif
3157 }
3158 else {
3159 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3160 (_PyUnicode_LENGTH(u) + 1));
3161 if (!_PyUnicode_WSTR(u)) {
3162 PyErr_NoMemory();
3163 return NULL;
3164 }
3165 if (!PyUnicode_IS_COMPACT_ASCII(u))
3166 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u);
3167 w = _PyUnicode_WSTR(u);
3168 wchar_end = w + _PyUnicode_LENGTH(u);
3169
3170 if (PyUnicode_KIND(u) == PyUnicode_1BYTE_KIND) {
3171 one_byte = PyUnicode_1BYTE_DATA(u);
3172 for (; w < wchar_end; ++one_byte, ++w)
3173 *w = *one_byte;
3174 /* null-terminate the wstr */
3175 *w = 0;
3176 }
3177 else if (PyUnicode_KIND(u) == PyUnicode_2BYTE_KIND) {
3178#if SIZEOF_WCHAR_T == 4
3179 two_bytes = PyUnicode_2BYTE_DATA(u);
3180 for (; w < wchar_end; ++two_bytes, ++w)
3181 *w = *two_bytes;
3182 /* null-terminate the wstr */
3183 *w = 0;
3184#else
3185 /* sizeof(wchar_t) == 2 */
3186 PyObject_FREE(_PyUnicode_WSTR(u));
3187 _PyUnicode_WSTR(u) = NULL;
3188 Py_FatalError("Impossible unicode object state, wstr "
3189 "and str should share memory already.");
3190 return NULL;
3191#endif
3192 }
3193 else {
3194 assert(0 && "This should never happen.");
3195 }
3196 }
3197 }
3198 if (size != NULL)
3199 *size = PyUnicode_WSTR_LENGTH(u);
3200 return _PyUnicode_WSTR(u);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003201}
3202
Alexander Belopolsky40018472011-02-26 01:02:56 +00003203Py_UNICODE *
3204PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003205{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003206 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003207}
3208
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003209
Alexander Belopolsky40018472011-02-26 01:02:56 +00003210Py_ssize_t
3211PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003212{
3213 if (!PyUnicode_Check(unicode)) {
3214 PyErr_BadArgument();
3215 goto onError;
3216 }
3217 return PyUnicode_GET_SIZE(unicode);
3218
Benjamin Peterson29060642009-01-31 22:14:21 +00003219 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003220 return -1;
3221}
3222
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003223Py_ssize_t
3224PyUnicode_GetLength(PyObject *unicode)
3225{
Victor Stinner5a706cf2011-10-02 00:36:53 +02003226 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003227 PyErr_BadArgument();
3228 return -1;
3229 }
3230
3231 return PyUnicode_GET_LENGTH(unicode);
3232}
3233
3234Py_UCS4
3235PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3236{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003237 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3238 PyErr_BadArgument();
3239 return (Py_UCS4)-1;
3240 }
3241 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3242 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003243 return (Py_UCS4)-1;
3244 }
3245 return PyUnicode_READ_CHAR(unicode, index);
3246}
3247
3248int
3249PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3250{
3251 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003252 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003253 return -1;
3254 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02003255 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3256 PyErr_SetString(PyExc_IndexError, "string index out of range");
3257 return -1;
3258 }
3259 if (_PyUnicode_Dirty(unicode))
3260 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003261 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3262 index, ch);
3263 return 0;
3264}
3265
/* Return the name of the default encoding, which is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3271
Victor Stinner554f3f02010-06-16 23:33:54 +00003272/* create or adjust a UnicodeDecodeError */
3273static void
3274make_decode_exception(PyObject **exceptionObject,
3275 const char *encoding,
3276 const char *input, Py_ssize_t length,
3277 Py_ssize_t startpos, Py_ssize_t endpos,
3278 const char *reason)
3279{
3280 if (*exceptionObject == NULL) {
3281 *exceptionObject = PyUnicodeDecodeError_Create(
3282 encoding, input, length, startpos, endpos, reason);
3283 }
3284 else {
3285 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3286 goto onError;
3287 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3288 goto onError;
3289 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3290 goto onError;
3291 }
3292 return;
3293
3294onError:
3295 Py_DECREF(*exceptionObject);
3296 *exceptionObject = NULL;
3297}
3298
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003299/* error handling callback helper:
3300 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00003301 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003302 and adjust various state variables.
3303 return 0 on success, -1 on error
3304*/
3305
Alexander Belopolsky40018472011-02-26 01:02:56 +00003306static int
3307unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003308 const char *encoding, const char *reason,
3309 const char **input, const char **inend, Py_ssize_t *startinpos,
3310 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
3311 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003312{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003313 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003314
3315 PyObject *restuple = NULL;
3316 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003317 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003318 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003319 Py_ssize_t requiredsize;
3320 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003321 const Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003322 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003323 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003324 int res = -1;
3325
3326 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003327 *errorHandler = PyCodec_LookupError(errors);
3328 if (*errorHandler == NULL)
3329 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003330 }
3331
Victor Stinner554f3f02010-06-16 23:33:54 +00003332 make_decode_exception(exceptionObject,
3333 encoding,
3334 *input, *inend - *input,
3335 *startinpos, *endinpos,
3336 reason);
3337 if (*exceptionObject == NULL)
3338 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003339
3340 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3341 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003342 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003343 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00003344 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00003345 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003346 }
3347 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003348 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003349
3350 /* Copy back the bytes variables, which might have been modified by the
3351 callback */
3352 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3353 if (!inputobj)
3354 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00003355 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003356 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00003357 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003358 *input = PyBytes_AS_STRING(inputobj);
3359 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003360 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00003361 /* we can DECREF safely, as the exception has another reference,
3362 so the object won't go away. */
3363 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003364
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003365 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003366 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003367 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003368 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3369 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003370 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003371
3372 /* need more space? (at least enough for what we
3373 have+the replacement+the rest of the string (starting
3374 at the new input position), so we won't have to check space
3375 when there are no errors in the rest of the string) */
3376 repptr = PyUnicode_AS_UNICODE(repunicode);
3377 repsize = PyUnicode_GET_SIZE(repunicode);
3378 requiredsize = *outpos + repsize + insize-newpos;
3379 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003380 if (requiredsize<2*outsize)
3381 requiredsize = 2*outsize;
Victor Stinnerfe226c02011-10-03 03:52:20 +02003382 if (PyUnicode_Resize((PyObject**)output, requiredsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003383 goto onError;
3384 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003385 }
3386 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003387 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003388 Py_UNICODE_COPY(*outptr, repptr, repsize);
3389 *outptr += repsize;
3390 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003391
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003392 /* we made it! */
3393 res = 0;
3394
Benjamin Peterson29060642009-01-31 22:14:21 +00003395 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003396 Py_XDECREF(restuple);
3397 return res;
3398}
3399
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three simple macros defining base-64. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')

/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

/* Read-only lookup table, indexed by ASCII code (0..127). */
static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * directO and directWS are parenthesized in the expansion so that
 * compound argument expressions (e.g. `a || b`) keep their meaning. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003480
Alexander Belopolsky40018472011-02-26 01:02:56 +00003481PyObject *
3482PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003483 Py_ssize_t size,
3484 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003485{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003486 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3487}
3488
Antoine Pitrou244651a2009-05-04 18:56:13 +00003489/* The decoder. The only state we preserve is our read position,
3490 * i.e. how many characters we have consumed. So if we end in the
3491 * middle of a shift sequence we have to back off the read position
3492 * and the output to the beginning of the sequence, otherwise we lose
3493 * all the shift state (seen bits, number of bits seen, high
3494 * surrogate). */
3495
Alexander Belopolsky40018472011-02-26 01:02:56 +00003496PyObject *
3497PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003498 Py_ssize_t size,
3499 const char *errors,
3500 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003501{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003502 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003503 Py_ssize_t startinpos;
3504 Py_ssize_t endinpos;
3505 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003506 const char *e;
3507 PyUnicodeObject *unicode;
3508 Py_UNICODE *p;
3509 const char *errmsg = "";
3510 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003511 Py_UNICODE *shiftOutStart;
3512 unsigned int base64bits = 0;
3513 unsigned long base64buffer = 0;
3514 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003515 PyObject *errorHandler = NULL;
3516 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003517
3518 unicode = _PyUnicode_New(size);
3519 if (!unicode)
3520 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003521 if (size == 0) {
3522 if (consumed)
3523 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003524 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003525 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003526
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003527 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitrou244651a2009-05-04 18:56:13 +00003528 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003529 e = s + size;
3530
3531 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003532 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00003533 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00003534 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003535
Antoine Pitrou244651a2009-05-04 18:56:13 +00003536 if (inShift) { /* in a base-64 section */
3537 if (IS_BASE64(ch)) { /* consume a base-64 character */
3538 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3539 base64bits += 6;
3540 s++;
3541 if (base64bits >= 16) {
3542 /* we have enough bits for a UTF-16 value */
3543 Py_UNICODE outCh = (Py_UNICODE)
3544 (base64buffer >> (base64bits-16));
3545 base64bits -= 16;
3546 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3547 if (surrogate) {
3548 /* expecting a second surrogate */
3549 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3550#ifdef Py_UNICODE_WIDE
3551 *p++ = (((surrogate & 0x3FF)<<10)
3552 | (outCh & 0x3FF)) + 0x10000;
3553#else
3554 *p++ = surrogate;
3555 *p++ = outCh;
3556#endif
3557 surrogate = 0;
3558 }
3559 else {
3560 surrogate = 0;
3561 errmsg = "second surrogate missing";
3562 goto utf7Error;
3563 }
3564 }
3565 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
3566 /* first surrogate */
3567 surrogate = outCh;
3568 }
3569 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3570 errmsg = "unexpected second surrogate";
3571 goto utf7Error;
3572 }
3573 else {
3574 *p++ = outCh;
3575 }
3576 }
3577 }
3578 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003579 inShift = 0;
3580 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003581 if (surrogate) {
3582 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00003583 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003584 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003585 if (base64bits > 0) { /* left-over bits */
3586 if (base64bits >= 6) {
3587 /* We've seen at least one base-64 character */
3588 errmsg = "partial character in shift sequence";
3589 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003590 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003591 else {
3592 /* Some bits remain; they should be zero */
3593 if (base64buffer != 0) {
3594 errmsg = "non-zero padding bits in shift sequence";
3595 goto utf7Error;
3596 }
3597 }
3598 }
3599 if (ch != '-') {
3600 /* '-' is absorbed; other terminating
3601 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003602 *p++ = ch;
3603 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003604 }
3605 }
3606 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003607 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003608 s++; /* consume '+' */
3609 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003610 s++;
3611 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00003612 }
3613 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003614 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003615 shiftOutStart = p;
3616 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003617 }
3618 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003619 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003620 *p++ = ch;
3621 s++;
3622 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003623 else {
3624 startinpos = s-starts;
3625 s++;
3626 errmsg = "unexpected special character";
3627 goto utf7Error;
3628 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003629 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003630utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003631 outpos = p-PyUnicode_AS_UNICODE(unicode);
3632 endinpos = s-starts;
3633 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003634 errors, &errorHandler,
3635 "utf7", errmsg,
3636 &starts, &e, &startinpos, &endinpos, &exc, &s,
3637 &unicode, &outpos, &p))
3638 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003639 }
3640
Antoine Pitrou244651a2009-05-04 18:56:13 +00003641 /* end of string */
3642
3643 if (inShift && !consumed) { /* in shift sequence, no more to follow */
3644 /* if we're in an inconsistent state, that's an error */
3645 if (surrogate ||
3646 (base64bits >= 6) ||
3647 (base64bits > 0 && base64buffer != 0)) {
3648 outpos = p-PyUnicode_AS_UNICODE(unicode);
3649 endinpos = size;
3650 if (unicode_decode_call_errorhandler(
3651 errors, &errorHandler,
3652 "utf7", "unterminated shift sequence",
3653 &starts, &e, &startinpos, &endinpos, &exc, &s,
3654 &unicode, &outpos, &p))
3655 goto onError;
3656 if (s < e)
3657 goto restart;
3658 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003659 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003660
3661 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003662 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003663 if (inShift) {
3664 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003665 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003666 }
3667 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003668 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003669 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003670 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003671
Victor Stinnerfe226c02011-10-03 03:52:20 +02003672 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003673 goto onError;
3674
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003675 Py_XDECREF(errorHandler);
3676 Py_XDECREF(exc);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02003677 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003678 Py_DECREF(unicode);
3679 return NULL;
3680 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003681 return (PyObject *)unicode;
3682
Benjamin Peterson29060642009-01-31 22:14:21 +00003683 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003684 Py_XDECREF(errorHandler);
3685 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003686 Py_DECREF(unicode);
3687 return NULL;
3688}
3689
3690
Alexander Belopolsky40018472011-02-26 01:02:56 +00003691PyObject *
3692PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003693 Py_ssize_t size,
3694 int base64SetO,
3695 int base64WhiteSpace,
3696 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003697{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003698 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003699 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003700 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003701 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003702 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003703 unsigned int base64bits = 0;
3704 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003705 char * out;
3706 char * start;
3707
3708 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003709 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003710
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003711 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003712 return PyErr_NoMemory();
3713
Antoine Pitrou244651a2009-05-04 18:56:13 +00003714 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003715 if (v == NULL)
3716 return NULL;
3717
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003718 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003719 for (;i < size; ++i) {
3720 Py_UNICODE ch = s[i];
3721
Antoine Pitrou244651a2009-05-04 18:56:13 +00003722 if (inShift) {
3723 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3724 /* shifting out */
3725 if (base64bits) { /* output remaining bits */
3726 *out++ = TO_BASE64(base64buffer << (6-base64bits));
3727 base64buffer = 0;
3728 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003729 }
3730 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003731 /* Characters not in the BASE64 set implicitly unshift the sequence
3732 so no '-' is required, except if the character is itself a '-' */
3733 if (IS_BASE64(ch) || ch == '-') {
3734 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003735 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003736 *out++ = (char) ch;
3737 }
3738 else {
3739 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00003740 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003741 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003742 else { /* not in a shift sequence */
3743 if (ch == '+') {
3744 *out++ = '+';
3745 *out++ = '-';
3746 }
3747 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3748 *out++ = (char) ch;
3749 }
3750 else {
3751 *out++ = '+';
3752 inShift = 1;
3753 goto encode_char;
3754 }
3755 }
3756 continue;
3757encode_char:
3758#ifdef Py_UNICODE_WIDE
3759 if (ch >= 0x10000) {
3760 /* code first surrogate */
3761 base64bits += 16;
3762 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
3763 while (base64bits >= 6) {
3764 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3765 base64bits -= 6;
3766 }
3767 /* prepare second surrogate */
3768 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
3769 }
3770#endif
3771 base64bits += 16;
3772 base64buffer = (base64buffer << 16) | ch;
3773 while (base64bits >= 6) {
3774 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3775 base64bits -= 6;
3776 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00003777 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003778 if (base64bits)
3779 *out++= TO_BASE64(base64buffer << (6-base64bits) );
3780 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003781 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003782 if (_PyBytes_Resize(&v, out - start) < 0)
3783 return NULL;
3784 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003785}
3786
Antoine Pitrou244651a2009-05-04 18:56:13 +00003787#undef IS_BASE64
3788#undef FROM_BASE64
3789#undef TO_BASE64
3790#undef DECODE_DIRECT
3791#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003792
Guido van Rossumd57fd912000-03-10 22:53:23 +00003793/* --- UTF-8 Codec -------------------------------------------------------- */
3794
/* Map a UTF-8 lead (prefix) byte to the total length of the sequence it
   introduces.  Zero means the byte is not a legal prefix.  See RFC 3629
   for details. */
static char utf8_code_length[256] = {
    /* 00-7F: single-byte (ASCII) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10-1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 20-2F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 30-3F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 40-4F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 50-5F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 60-6F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    /* 80-BF: never valid as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 90-9F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* A0-AF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    /* C0-C1 are illegal; C2-DF start 2-byte sequences */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    /* E0-EF start 3-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    /* F0-F4 start 4-byte sequences; F5-FF are illegal */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
3816
Alexander Belopolsky40018472011-02-26 01:02:56 +00003817PyObject *
3818PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003819 Py_ssize_t size,
3820 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003821{
Walter Dörwald69652032004-09-07 20:24:22 +00003822 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3823}
3824
Antoine Pitrouab868312009-01-10 15:40:25 +00003825/* Mask to check or force alignment of a pointer to C 'long' boundaries */
3826#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
3827
3828/* Mask to quickly check whether a C 'long' contains a
3829 non-ASCII, UTF8-encoded char. */
3830#if (SIZEOF_LONG == 8)
3831# define ASCII_CHAR_MASK 0x8080808080808080L
3832#elif (SIZEOF_LONG == 4)
3833# define ASCII_CHAR_MASK 0x80808080L
3834#else
3835# error C 'long' size should be either 4 or 8!
3836#endif
3837
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003838/* Scans a UTF-8 string and returns the maximum character to be expected,
3839 the size of the decoded unicode string and if any major errors were
3840 encountered.
3841
3842 This function does check basic UTF-8 sanity, it does however NOT CHECK
3843 if the string contains surrogates, and if all continuation bytes are
3844 within the correct ranges, these checks are performed in
3845 PyUnicode_DecodeUTF8Stateful.
3846
3847 If it sets has_errors to 1, it means the value of unicode_size and max_char
3848 will be bogus and you should not rely on useful information in them.
3849 */
3850static Py_UCS4
3851utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
3852 Py_ssize_t *unicode_size, Py_ssize_t* consumed,
3853 int *has_errors)
3854{
3855 Py_ssize_t n;
3856 Py_ssize_t char_count = 0;
3857 Py_UCS4 max_char = 127, new_max;
3858 Py_UCS4 upper_bound;
3859 const unsigned char *p = (const unsigned char *)s;
3860 const unsigned char *end = p + string_size;
3861 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
3862 int err = 0;
3863
3864 for (; p < end && !err; ++p, ++char_count) {
3865 /* Only check value if it's not a ASCII char... */
3866 if (*p < 0x80) {
3867 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
3868 an explanation. */
3869 if (!((size_t) p & LONG_PTR_MASK)) {
3870 /* Help register allocation */
3871 register const unsigned char *_p = p;
3872 while (_p < aligned_end) {
3873 unsigned long value = *(unsigned long *) _p;
3874 if (value & ASCII_CHAR_MASK)
3875 break;
3876 _p += SIZEOF_LONG;
3877 char_count += SIZEOF_LONG;
3878 }
3879 p = _p;
3880 if (p == end)
3881 break;
3882 }
3883 }
3884 if (*p >= 0x80) {
3885 n = utf8_code_length[*p];
3886 new_max = max_char;
3887 switch (n) {
3888 /* invalid start byte */
3889 case 0:
3890 err = 1;
3891 break;
3892 case 2:
3893 /* Code points between 0x00FF and 0x07FF inclusive.
3894 Approximate the upper bound of the code point,
3895 if this flips over 255 we can be sure it will be more
3896 than 255 and the string will need 2 bytes per code coint,
3897 if it stays under or equal to 255, we can be sure 1 byte
3898 is enough.
3899 ((*p & 0b00011111) << 6) | 0b00111111 */
3900 upper_bound = ((*p & 0x1F) << 6) | 0x3F;
3901 if (max_char < upper_bound)
3902 new_max = upper_bound;
3903 /* Ensure we track at least that we left ASCII space. */
3904 if (new_max < 128)
3905 new_max = 128;
3906 break;
3907 case 3:
3908 /* Between 0x0FFF and 0xFFFF inclusive, so values are
3909 always > 255 and <= 65535 and will always need 2 bytes. */
3910 if (max_char < 65535)
3911 new_max = 65535;
3912 break;
3913 case 4:
3914 /* Code point will be above 0xFFFF for sure in this case. */
3915 new_max = 65537;
3916 break;
3917 /* Internal error, this should be caught by the first if */
3918 case 1:
3919 default:
3920 assert(0 && "Impossible case in utf8_max_char_and_size");
3921 err = 1;
3922 }
3923 /* Instead of number of overall bytes for this code point,
3924 n containts the number of following bytes: */
3925 --n;
3926 /* Check if the follow up chars are all valid continuation bytes */
3927 if (n >= 1) {
3928 const unsigned char *cont;
3929 if ((p + n) >= end) {
3930 if (consumed == 0)
3931 /* incomplete data, non-incremental decoding */
3932 err = 1;
3933 break;
3934 }
3935 for (cont = p + 1; cont < (p + n); ++cont) {
3936 if ((*cont & 0xc0) != 0x80) {
3937 err = 1;
3938 break;
3939 }
3940 }
3941 p += n;
3942 }
3943 else
3944 err = 1;
3945 max_char = new_max;
3946 }
3947 }
3948
3949 if (unicode_size)
3950 *unicode_size = char_count;
3951 if (has_errors)
3952 *has_errors = err;
3953 return max_char;
3954}
3955
/* Similar to PyUnicode_WRITE but can also write into wstr field
   of the legacy unicode representation.

   `kind` selects the element type of `buf`: Py_UNICODE for
   PyUnicode_WCHAR_KIND, unsigned char for PyUnicode_1BYTE_KIND, Py_UCS2
   for PyUnicode_2BYTE_KIND and Py_UCS4 for anything else.  `index` and
   `value` are each evaluated exactly once. */
#define WRITE_FLEXIBLE_OR_WSTR(kind, buf, index, value)                     \
    do {                                                                    \
        const int wkind_ = (kind);                                          \
        if (wkind_ == PyUnicode_WCHAR_KIND) {                               \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value);           \
        }                                                                   \
        else if (wkind_ == PyUnicode_1BYTE_KIND) {                          \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value);     \
        }                                                                   \
        else if (wkind_ == PyUnicode_2BYTE_KIND) {                          \
            ((Py_UCS2 *)(buf))[(index)] = (Py_UCS2)(value);                 \
        }                                                                   \
        else {                                                              \
            ((Py_UCS4 *)(buf))[(index)] = (Py_UCS4)(value);                 \
        }                                                                   \
    } while (0)
3970
Alexander Belopolsky40018472011-02-26 01:02:56 +00003971PyObject *
3972PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003973 Py_ssize_t size,
3974 const char *errors,
3975 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00003976{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003977 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003978 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00003979 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003980 Py_ssize_t startinpos;
3981 Py_ssize_t endinpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00003982 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003983 PyUnicodeObject *unicode;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003984 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003985 PyObject *errorHandler = NULL;
3986 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003987 Py_UCS4 maxchar = 0;
3988 Py_ssize_t unicode_size;
3989 Py_ssize_t i;
3990 int kind;
3991 void *data;
3992 int has_errors;
3993 Py_UNICODE *error_outptr;
3994#if SIZEOF_WCHAR_T == 2
3995 Py_ssize_t wchar_offset = 0;
3996#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003997
Walter Dörwald69652032004-09-07 20:24:22 +00003998 if (size == 0) {
3999 if (consumed)
4000 *consumed = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004001 return (PyObject *)PyUnicode_New(0, 0);
Walter Dörwald69652032004-09-07 20:24:22 +00004002 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004003 maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size,
4004 consumed, &has_errors);
4005 if (has_errors) {
4006 unicode = _PyUnicode_New(size);
4007 if (!unicode)
4008 return NULL;
4009 kind = PyUnicode_WCHAR_KIND;
4010 data = PyUnicode_AS_UNICODE(unicode);
4011 assert(data != NULL);
4012 }
4013 else {
4014 unicode = (PyUnicodeObject *)PyUnicode_New(unicode_size, maxchar);
4015 if (!unicode)
4016 return NULL;
4017 /* When the string is ASCII only, just use memcpy and return.
4018 unicode_size may be != size if there is an incomplete UTF-8
4019 sequence at the end of the ASCII block. */
4020 if (maxchar < 128 && size == unicode_size) {
4021 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
4022 return (PyObject *)unicode;
4023 }
4024 kind = PyUnicode_KIND(unicode);
4025 data = PyUnicode_DATA(unicode);
4026 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004027 /* Unpack UTF-8 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004028 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004029 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00004030 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004031
4032 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004033 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004034
4035 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00004036 /* Fast path for runs of ASCII characters. Given that common UTF-8
4037 input will consist of an overwhelming majority of ASCII
4038 characters, we try to optimize for this case by checking
4039 as many characters as a C 'long' can contain.
4040 First, check if we can do an aligned read, as most CPUs have
4041 a penalty for unaligned reads.
4042 */
4043 if (!((size_t) s & LONG_PTR_MASK)) {
4044 /* Help register allocation */
4045 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004046 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004047 while (_s < aligned_end) {
4048 /* Read a whole long at a time (either 4 or 8 bytes),
4049 and do a fast unrolled copy if it only contains ASCII
4050 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004051 unsigned long value = *(unsigned long *) _s;
4052 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00004053 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004054 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+0, _s[0]);
4055 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+1, _s[1]);
4056 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+2, _s[2]);
4057 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004058#if (SIZEOF_LONG == 8)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004059 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+4, _s[4]);
4060 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+5, _s[5]);
4061 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+6, _s[6]);
4062 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004063#endif
4064 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004065 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00004066 }
4067 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004068 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004069 if (s == e)
4070 break;
4071 ch = (unsigned char)*s;
4072 }
4073 }
4074
4075 if (ch < 0x80) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004076 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004077 s++;
4078 continue;
4079 }
4080
4081 n = utf8_code_length[ch];
4082
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004083 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004084 if (consumed)
4085 break;
4086 else {
4087 errmsg = "unexpected end of data";
4088 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004089 endinpos = startinpos+1;
4090 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
4091 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004092 goto utf8Error;
4093 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004094 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004095
4096 switch (n) {
4097
4098 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00004099 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004100 startinpos = s-starts;
4101 endinpos = startinpos+1;
4102 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004103
4104 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004105 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00004106 startinpos = s-starts;
4107 endinpos = startinpos+1;
4108 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004109
4110 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004111 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00004112 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004113 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004114 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00004115 goto utf8Error;
4116 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004117 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004118 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004119 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004120 break;
4121
4122 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00004123 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4124 will result in surrogates in range d800-dfff. Surrogates are
4125 not valid UTF-8 so they are rejected.
4126 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4127 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00004128 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004129 (s[2] & 0xc0) != 0x80 ||
4130 ((unsigned char)s[0] == 0xE0 &&
4131 (unsigned char)s[1] < 0xA0) ||
4132 ((unsigned char)s[0] == 0xED &&
4133 (unsigned char)s[1] > 0x9F)) {
4134 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004135 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004136 endinpos = startinpos + 1;
4137
4138 /* if s[1] first two bits are 1 and 0, then the invalid
4139 continuation byte is s[2], so increment endinpos by 1,
4140 if not, s[1] is invalid and endinpos doesn't need to
4141 be incremented. */
4142 if ((s[1] & 0xC0) == 0x80)
4143 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004144 goto utf8Error;
4145 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004146 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004147 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004148 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004149 break;
4150
4151 case 4:
4152 if ((s[1] & 0xc0) != 0x80 ||
4153 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004154 (s[3] & 0xc0) != 0x80 ||
4155 ((unsigned char)s[0] == 0xF0 &&
4156 (unsigned char)s[1] < 0x90) ||
4157 ((unsigned char)s[0] == 0xF4 &&
4158 (unsigned char)s[1] > 0x8F)) {
4159 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004160 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004161 endinpos = startinpos + 1;
4162 if ((s[1] & 0xC0) == 0x80) {
4163 endinpos++;
4164 if ((s[2] & 0xC0) == 0x80)
4165 endinpos++;
4166 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004167 goto utf8Error;
4168 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004169 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00004170 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4171 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4172
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004173 /* If the string is flexible or we have native UCS-4, write
4174 directly.. */
4175 if (sizeof(Py_UNICODE) > 2 || kind != PyUnicode_WCHAR_KIND)
4176 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Tim Petersced69f82003-09-16 20:30:58 +00004177
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004178 else {
4179 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00004180
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004181 /* translate from 10000..10FFFF to 0..FFFF */
4182 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00004183
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004184 /* high surrogate = top 10 bits added to D800 */
4185 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4186 (Py_UNICODE)(0xD800 + (ch >> 10)));
4187
4188 /* low surrogate = bottom 10 bits added to DC00 */
4189 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4190 (Py_UNICODE)(0xDC00 + (ch & 0x03FF)));
4191 }
4192#if SIZEOF_WCHAR_T == 2
4193 wchar_offset++;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004194#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00004195 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004196 }
4197 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00004198 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004199
Benjamin Peterson29060642009-01-31 22:14:21 +00004200 utf8Error:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004201 /* If this is not yet a resizable string, make it one.. */
4202 if (kind != PyUnicode_WCHAR_KIND) {
4203 const Py_UNICODE *u;
4204 PyUnicodeObject *new_unicode = _PyUnicode_New(size);
4205 if (!new_unicode)
4206 goto onError;
4207 u = PyUnicode_AsUnicode((PyObject *)unicode);
4208 if (!u)
4209 goto onError;
4210#if SIZEOF_WCHAR_T == 2
4211 i += wchar_offset;
4212#endif
4213 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(new_unicode), u, i);
4214 Py_DECREF(unicode);
4215 unicode = new_unicode;
4216 kind = 0;
4217 data = PyUnicode_AS_UNICODE(new_unicode);
4218 assert(data != NULL);
4219 }
4220 error_outptr = PyUnicode_AS_UNICODE(unicode) + i;
Benjamin Peterson29060642009-01-31 22:14:21 +00004221 if (unicode_decode_call_errorhandler(
4222 errors, &errorHandler,
4223 "utf8", errmsg,
4224 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004225 &unicode, &i, &error_outptr))
Benjamin Peterson29060642009-01-31 22:14:21 +00004226 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004227 /* Update data because unicode_decode_call_errorhandler might have
4228 re-created or resized the unicode object. */
4229 data = PyUnicode_AS_UNICODE(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00004230 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004231 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004232 /* Ensure the unicode_size calculation above was correct: */
4233 assert(kind == PyUnicode_WCHAR_KIND || i == unicode_size);
4234
Walter Dörwald69652032004-09-07 20:24:22 +00004235 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004236 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004237
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004238 /* Adjust length and ready string when it contained errors and
4239 is of the old resizable kind. */
4240 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004241 if (PyUnicode_Resize((PyObject**)&unicode, i) < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004242 goto onError;
4243 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004244
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004245 Py_XDECREF(errorHandler);
4246 Py_XDECREF(exc);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004247 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004248 Py_DECREF(unicode);
4249 return NULL;
4250 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004251 return (PyObject *)unicode;
4252
Benjamin Peterson29060642009-01-31 22:14:21 +00004253 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004254 Py_XDECREF(errorHandler);
4255 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004256 Py_DECREF(unicode);
4257 return NULL;
4258}
4259
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004260#undef WRITE_FLEXIBLE_OR_WSTR
Antoine Pitrouab868312009-01-10 15:40:25 +00004261
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004262#ifdef __APPLE__
4263
4264/* Simplified UTF-8 decoder using surrogateescape error handler,
4265 used to decode the command line arguments on Mac OS X. */
4266
4267wchar_t*
4268_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
4269{
4270 int n;
4271 const char *e;
4272 wchar_t *unicode, *p;
4273
4274 /* Note: size will always be longer than the resulting Unicode
4275 character count */
4276 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
4277 PyErr_NoMemory();
4278 return NULL;
4279 }
4280 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
4281 if (!unicode)
4282 return NULL;
4283
4284 /* Unpack UTF-8 encoded data */
4285 p = unicode;
4286 e = s + size;
4287 while (s < e) {
4288 Py_UCS4 ch = (unsigned char)*s;
4289
4290 if (ch < 0x80) {
4291 *p++ = (wchar_t)ch;
4292 s++;
4293 continue;
4294 }
4295
4296 n = utf8_code_length[ch];
4297 if (s + n > e) {
4298 goto surrogateescape;
4299 }
4300
4301 switch (n) {
4302 case 0:
4303 case 1:
4304 goto surrogateescape;
4305
4306 case 2:
4307 if ((s[1] & 0xc0) != 0x80)
4308 goto surrogateescape;
4309 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
4310 assert ((ch > 0x007F) && (ch <= 0x07FF));
4311 *p++ = (wchar_t)ch;
4312 break;
4313
4314 case 3:
4315 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4316 will result in surrogates in range d800-dfff. Surrogates are
4317 not valid UTF-8 so they are rejected.
4318 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4319 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
4320 if ((s[1] & 0xc0) != 0x80 ||
4321 (s[2] & 0xc0) != 0x80 ||
4322 ((unsigned char)s[0] == 0xE0 &&
4323 (unsigned char)s[1] < 0xA0) ||
4324 ((unsigned char)s[0] == 0xED &&
4325 (unsigned char)s[1] > 0x9F)) {
4326
4327 goto surrogateescape;
4328 }
4329 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
4330 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004331 *p++ = (wchar_t)ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004332 break;
4333
4334 case 4:
4335 if ((s[1] & 0xc0) != 0x80 ||
4336 (s[2] & 0xc0) != 0x80 ||
4337 (s[3] & 0xc0) != 0x80 ||
4338 ((unsigned char)s[0] == 0xF0 &&
4339 (unsigned char)s[1] < 0x90) ||
4340 ((unsigned char)s[0] == 0xF4 &&
4341 (unsigned char)s[1] > 0x8F)) {
4342 goto surrogateescape;
4343 }
4344 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
4345 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4346 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4347
4348#if SIZEOF_WCHAR_T == 4
4349 *p++ = (wchar_t)ch;
4350#else
4351 /* compute and append the two surrogates: */
4352
4353 /* translate from 10000..10FFFF to 0..FFFF */
4354 ch -= 0x10000;
4355
4356 /* high surrogate = top 10 bits added to D800 */
4357 *p++ = (wchar_t)(0xD800 + (ch >> 10));
4358
4359 /* low surrogate = bottom 10 bits added to DC00 */
4360 *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
4361#endif
4362 break;
4363 }
4364 s += n;
4365 continue;
4366
4367 surrogateescape:
4368 *p++ = 0xDC00 + ch;
4369 s++;
4370 }
4371 *p = L'\0';
4372 return unicode;
4373}
4374
4375#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004376
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004377/* Primary internal function which creates utf8 encoded bytes objects.
4378
4379 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004380 and allocate exactly as much space needed at the end. Else allocate the
4381 maximum possible needed (4 result bytes per Unicode character), and return
4382 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004383*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004384PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004385_PyUnicode_AsUTF8String(PyObject *obj, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004386{
Tim Peters602f7402002-04-27 18:03:26 +00004387#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00004388
Guido van Rossum98297ee2007-11-06 21:34:58 +00004389 Py_ssize_t i; /* index into s of next input byte */
4390 PyObject *result; /* result string object */
4391 char *p; /* next free byte in output buffer */
4392 Py_ssize_t nallocated; /* number of result bytes allocated */
4393 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00004394 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004395 PyObject *errorHandler = NULL;
4396 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004397 int kind;
4398 void *data;
4399 Py_ssize_t size;
4400 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
4401#if SIZEOF_WCHAR_T == 2
4402 Py_ssize_t wchar_offset = 0;
4403#endif
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004404
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004405 if (!PyUnicode_Check(unicode)) {
4406 PyErr_BadArgument();
4407 return NULL;
4408 }
4409
4410 if (PyUnicode_READY(unicode) == -1)
4411 return NULL;
4412
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004413 if (PyUnicode_UTF8(unicode))
4414 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4415 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004416
4417 kind = PyUnicode_KIND(unicode);
4418 data = PyUnicode_DATA(unicode);
4419 size = PyUnicode_GET_LENGTH(unicode);
4420
Tim Peters602f7402002-04-27 18:03:26 +00004421 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004422
Tim Peters602f7402002-04-27 18:03:26 +00004423 if (size <= MAX_SHORT_UNICHARS) {
4424 /* Write into the stack buffer; nallocated can't overflow.
4425 * At the end, we'll allocate exactly as much heap space as it
4426 * turns out we need.
4427 */
4428 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004429 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00004430 p = stackbuf;
4431 }
4432 else {
4433 /* Overallocate on the heap, and give the excess back at the end. */
4434 nallocated = size * 4;
4435 if (nallocated / 4 != size) /* overflow! */
4436 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00004437 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004438 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00004439 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00004440 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004441 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004442
Tim Peters602f7402002-04-27 18:03:26 +00004443 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004444 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004445
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004446 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00004447 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004448 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004449
Guido van Rossumd57fd912000-03-10 22:53:23 +00004450 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00004451 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00004452 *p++ = (char)(0xc0 | (ch >> 6));
4453 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00004454 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004455 Py_ssize_t newpos;
4456 PyObject *rep;
4457 Py_ssize_t repsize, k, startpos;
4458 startpos = i-1;
4459#if SIZEOF_WCHAR_T == 2
4460 startpos += wchar_offset;
Victor Stinner445a6232010-04-22 20:01:57 +00004461#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004462 rep = unicode_encode_call_errorhandler(
4463 errors, &errorHandler, "utf-8", "surrogates not allowed",
4464 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
4465 &exc, startpos, startpos+1, &newpos);
4466 if (!rep)
4467 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004468
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004469 if (PyBytes_Check(rep))
4470 repsize = PyBytes_GET_SIZE(rep);
4471 else
4472 repsize = PyUnicode_GET_SIZE(rep);
4473
4474 if (repsize > 4) {
4475 Py_ssize_t offset;
4476
4477 if (result == NULL)
4478 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004479 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004480 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004481
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004482 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4483 /* integer overflow */
4484 PyErr_NoMemory();
4485 goto error;
4486 }
4487 nallocated += repsize - 4;
4488 if (result != NULL) {
4489 if (_PyBytes_Resize(&result, nallocated) < 0)
4490 goto error;
4491 } else {
4492 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004493 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004494 goto error;
4495 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4496 }
4497 p = PyBytes_AS_STRING(result) + offset;
4498 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004499
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004500 if (PyBytes_Check(rep)) {
4501 char *prep = PyBytes_AS_STRING(rep);
4502 for(k = repsize; k > 0; k--)
4503 *p++ = *prep++;
4504 } else /* rep is unicode */ {
4505 const Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
4506 Py_UNICODE c;
4507
4508 for(k=0; k<repsize; k++) {
4509 c = prep[k];
4510 if (0x80 <= c) {
4511 raise_encode_exception(&exc, "utf-8",
4512 PyUnicode_AS_UNICODE(unicode),
4513 size, i-1, i,
4514 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004515 goto error;
4516 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004517 *p++ = (char)prep[k];
Victor Stinner31be90b2010-04-22 19:38:16 +00004518 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004519 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004520 Py_DECREF(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004521 } else if (ch < 0x10000) {
4522 *p++ = (char)(0xe0 | (ch >> 12));
4523 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4524 *p++ = (char)(0x80 | (ch & 0x3f));
4525 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00004526 /* Encode UCS4 Unicode ordinals */
4527 *p++ = (char)(0xf0 | (ch >> 18));
4528 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4529 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4530 *p++ = (char)(0x80 | (ch & 0x3f));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004531#if SIZEOF_WCHAR_T == 2
4532 wchar_offset++;
4533#endif
Tim Peters602f7402002-04-27 18:03:26 +00004534 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004535 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004536
Guido van Rossum98297ee2007-11-06 21:34:58 +00004537 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004538 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004539 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004540 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004541 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004542 }
4543 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004544 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004545 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004546 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004547 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004548 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004549
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004550 Py_XDECREF(errorHandler);
4551 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004552 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004553 error:
4554 Py_XDECREF(errorHandler);
4555 Py_XDECREF(exc);
4556 Py_XDECREF(result);
4557 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004558
Tim Peters602f7402002-04-27 18:03:26 +00004559#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004560}
4561
Alexander Belopolsky40018472011-02-26 01:02:56 +00004562PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004563PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4564 Py_ssize_t size,
4565 const char *errors)
4566{
4567 PyObject *v, *unicode;
4568
4569 unicode = PyUnicode_FromUnicode(s, size);
4570 if (unicode == NULL)
4571 return NULL;
4572 v = _PyUnicode_AsUTF8String(unicode, errors);
4573 Py_DECREF(unicode);
4574 return v;
4575}
4576
4577PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004578PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004579{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004580 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004581}
4582
Walter Dörwald41980ca2007-08-16 21:55:45 +00004583/* --- UTF-32 Codec ------------------------------------------------------- */
4584
4585PyObject *
4586PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004587 Py_ssize_t size,
4588 const char *errors,
4589 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004590{
4591 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4592}
4593
4594PyObject *
4595PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004596 Py_ssize_t size,
4597 const char *errors,
4598 int *byteorder,
4599 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004600{
4601 const char *starts = s;
4602 Py_ssize_t startinpos;
4603 Py_ssize_t endinpos;
4604 Py_ssize_t outpos;
4605 PyUnicodeObject *unicode;
4606 Py_UNICODE *p;
4607#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004608 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004609 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004610#else
4611 const int pairs = 0;
4612#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00004613 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004614 int bo = 0; /* assume native ordering by default */
4615 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004616 /* Offsets from q for retrieving bytes in the right order. */
4617#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4618 int iorder[] = {0, 1, 2, 3};
4619#else
4620 int iorder[] = {3, 2, 1, 0};
4621#endif
4622 PyObject *errorHandler = NULL;
4623 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004624
Walter Dörwald41980ca2007-08-16 21:55:45 +00004625 q = (unsigned char *)s;
4626 e = q + size;
4627
4628 if (byteorder)
4629 bo = *byteorder;
4630
4631 /* Check for BOM marks (U+FEFF) in the input and adjust current
4632 byte order setting accordingly. In native mode, the leading BOM
4633 mark is skipped, in all other modes, it is copied to the output
4634 stream as-is (giving a ZWNBSP character). */
4635 if (bo == 0) {
4636 if (size >= 4) {
4637 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004638 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004639#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004640 if (bom == 0x0000FEFF) {
4641 q += 4;
4642 bo = -1;
4643 }
4644 else if (bom == 0xFFFE0000) {
4645 q += 4;
4646 bo = 1;
4647 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004648#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004649 if (bom == 0x0000FEFF) {
4650 q += 4;
4651 bo = 1;
4652 }
4653 else if (bom == 0xFFFE0000) {
4654 q += 4;
4655 bo = -1;
4656 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004657#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004658 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004659 }
4660
4661 if (bo == -1) {
4662 /* force LE */
4663 iorder[0] = 0;
4664 iorder[1] = 1;
4665 iorder[2] = 2;
4666 iorder[3] = 3;
4667 }
4668 else if (bo == 1) {
4669 /* force BE */
4670 iorder[0] = 3;
4671 iorder[1] = 2;
4672 iorder[2] = 1;
4673 iorder[3] = 0;
4674 }
4675
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004676 /* On narrow builds we split characters outside the BMP into two
4677 codepoints => count how much extra space we need. */
4678#ifndef Py_UNICODE_WIDE
4679 for (qq = q; qq < e; qq += 4)
4680 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
4681 pairs++;
4682#endif
4683
4684 /* This might be one to much, because of a BOM */
4685 unicode = _PyUnicode_New((size+3)/4+pairs);
4686 if (!unicode)
4687 return NULL;
4688 if (size == 0)
4689 return (PyObject *)unicode;
4690
4691 /* Unpack UTF-32 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004692 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004693
Walter Dörwald41980ca2007-08-16 21:55:45 +00004694 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004695 Py_UCS4 ch;
4696 /* remaining bytes at the end? (size should be divisible by 4) */
4697 if (e-q<4) {
4698 if (consumed)
4699 break;
4700 errmsg = "truncated data";
4701 startinpos = ((const char *)q)-starts;
4702 endinpos = ((const char *)e)-starts;
4703 goto utf32Error;
4704 /* The remaining input chars are ignored if the callback
4705 chooses to skip the input */
4706 }
4707 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4708 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004709
Benjamin Peterson29060642009-01-31 22:14:21 +00004710 if (ch >= 0x110000)
4711 {
4712 errmsg = "codepoint not in range(0x110000)";
4713 startinpos = ((const char *)q)-starts;
4714 endinpos = startinpos+4;
4715 goto utf32Error;
4716 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004717#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004718 if (ch >= 0x10000)
4719 {
4720 *p++ = 0xD800 | ((ch-0x10000) >> 10);
4721 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
4722 }
4723 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00004724#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004725 *p++ = ch;
4726 q += 4;
4727 continue;
4728 utf32Error:
4729 outpos = p-PyUnicode_AS_UNICODE(unicode);
4730 if (unicode_decode_call_errorhandler(
4731 errors, &errorHandler,
4732 "utf32", errmsg,
4733 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
4734 &unicode, &outpos, &p))
4735 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004736 }
4737
4738 if (byteorder)
4739 *byteorder = bo;
4740
4741 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004742 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004743
4744 /* Adjust length */
Victor Stinnerfe226c02011-10-03 03:52:20 +02004745 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004746 goto onError;
4747
4748 Py_XDECREF(errorHandler);
4749 Py_XDECREF(exc);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004750 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004751 Py_DECREF(unicode);
4752 return NULL;
4753 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004754 return (PyObject *)unicode;
4755
Benjamin Peterson29060642009-01-31 22:14:21 +00004756 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00004757 Py_DECREF(unicode);
4758 Py_XDECREF(errorHandler);
4759 Py_XDECREF(exc);
4760 return NULL;
4761}
4762
4763PyObject *
4764PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004765 Py_ssize_t size,
4766 const char *errors,
4767 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004768{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004769 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004770 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004771 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004772#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004773 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004774#else
4775 const int pairs = 0;
4776#endif
4777 /* Offsets from p for storing byte pairs in the right order. */
4778#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4779 int iorder[] = {0, 1, 2, 3};
4780#else
4781 int iorder[] = {3, 2, 1, 0};
4782#endif
4783
Benjamin Peterson29060642009-01-31 22:14:21 +00004784#define STORECHAR(CH) \
4785 do { \
4786 p[iorder[3]] = ((CH) >> 24) & 0xff; \
4787 p[iorder[2]] = ((CH) >> 16) & 0xff; \
4788 p[iorder[1]] = ((CH) >> 8) & 0xff; \
4789 p[iorder[0]] = (CH) & 0xff; \
4790 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00004791 } while(0)
4792
4793 /* In narrow builds we can output surrogate pairs as one codepoint,
4794 so we need less space. */
4795#ifndef Py_UNICODE_WIDE
4796 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00004797 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
4798 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
4799 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004800#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004801 nsize = (size - pairs + (byteorder == 0));
4802 bytesize = nsize * 4;
4803 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004804 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004805 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004806 if (v == NULL)
4807 return NULL;
4808
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004809 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004810 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004811 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004812 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004813 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004814
4815 if (byteorder == -1) {
4816 /* force LE */
4817 iorder[0] = 0;
4818 iorder[1] = 1;
4819 iorder[2] = 2;
4820 iorder[3] = 3;
4821 }
4822 else if (byteorder == 1) {
4823 /* force BE */
4824 iorder[0] = 3;
4825 iorder[1] = 2;
4826 iorder[2] = 1;
4827 iorder[3] = 0;
4828 }
4829
4830 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004831 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004832#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004833 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
4834 Py_UCS4 ch2 = *s;
4835 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
4836 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
4837 s++;
4838 size--;
4839 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004840 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004841#endif
4842 STORECHAR(ch);
4843 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004844
4845 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004846 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004847#undef STORECHAR
4848}
4849
Alexander Belopolsky40018472011-02-26 01:02:56 +00004850PyObject *
4851PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004852{
4853 if (!PyUnicode_Check(unicode)) {
4854 PyErr_BadArgument();
4855 return NULL;
4856 }
4857 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004858 PyUnicode_GET_SIZE(unicode),
4859 NULL,
4860 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004861}
4862
Guido van Rossumd57fd912000-03-10 22:53:23 +00004863/* --- UTF-16 Codec ------------------------------------------------------- */
4864
Tim Peters772747b2001-08-09 22:21:55 +00004865PyObject *
4866PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004867 Py_ssize_t size,
4868 const char *errors,
4869 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004870{
Walter Dörwald69652032004-09-07 20:24:22 +00004871 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
4872}
4873
Antoine Pitrouab868312009-01-10 15:40:25 +00004874/* Two masks for fast checking of whether a C 'long' may contain
4875 UTF16-encoded surrogate characters. This is an efficient heuristic,
4876 assuming that non-surrogate characters with a code point >= 0x8000 are
4877 rare in most input.
4878 FAST_CHAR_MASK is used when the input is in native byte ordering,
4879 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00004880*/
Antoine Pitrouab868312009-01-10 15:40:25 +00004881#if (SIZEOF_LONG == 8)
4882# define FAST_CHAR_MASK 0x8000800080008000L
4883# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
4884#elif (SIZEOF_LONG == 4)
4885# define FAST_CHAR_MASK 0x80008000L
4886# define SWAPPED_FAST_CHAR_MASK 0x00800080L
4887#else
4888# error C 'long' size should be either 4 or 8!
4889#endif
4890
Walter Dörwald69652032004-09-07 20:24:22 +00004891PyObject *
4892PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004893 Py_ssize_t size,
4894 const char *errors,
4895 int *byteorder,
4896 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00004897{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004898 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004899 Py_ssize_t startinpos;
4900 Py_ssize_t endinpos;
4901 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004902 PyUnicodeObject *unicode;
4903 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004904 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00004905 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00004906 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004907 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00004908 /* Offsets from q for retrieving byte pairs in the right order. */
4909#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4910 int ihi = 1, ilo = 0;
4911#else
4912 int ihi = 0, ilo = 1;
4913#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004914 PyObject *errorHandler = NULL;
4915 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004916
4917 /* Note: size will always be longer than the resulting Unicode
4918 character count */
4919 unicode = _PyUnicode_New(size);
4920 if (!unicode)
4921 return NULL;
4922 if (size == 0)
4923 return (PyObject *)unicode;
4924
4925 /* Unpack UTF-16 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004926 p = PyUnicode_AS_UNICODE(unicode);
Tim Peters772747b2001-08-09 22:21:55 +00004927 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00004928 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004929
4930 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00004931 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004932
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004933 /* Check for BOM marks (U+FEFF) in the input and adjust current
4934 byte order setting accordingly. In native mode, the leading BOM
4935 mark is skipped, in all other modes, it is copied to the output
4936 stream as-is (giving a ZWNBSP character). */
4937 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00004938 if (size >= 2) {
4939 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004940#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004941 if (bom == 0xFEFF) {
4942 q += 2;
4943 bo = -1;
4944 }
4945 else if (bom == 0xFFFE) {
4946 q += 2;
4947 bo = 1;
4948 }
Tim Petersced69f82003-09-16 20:30:58 +00004949#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004950 if (bom == 0xFEFF) {
4951 q += 2;
4952 bo = 1;
4953 }
4954 else if (bom == 0xFFFE) {
4955 q += 2;
4956 bo = -1;
4957 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004958#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004959 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004960 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004961
Tim Peters772747b2001-08-09 22:21:55 +00004962 if (bo == -1) {
4963 /* force LE */
4964 ihi = 1;
4965 ilo = 0;
4966 }
4967 else if (bo == 1) {
4968 /* force BE */
4969 ihi = 0;
4970 ilo = 1;
4971 }
Antoine Pitrouab868312009-01-10 15:40:25 +00004972#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4973 native_ordering = ilo < ihi;
4974#else
4975 native_ordering = ilo > ihi;
4976#endif
Tim Peters772747b2001-08-09 22:21:55 +00004977
Antoine Pitrouab868312009-01-10 15:40:25 +00004978 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00004979 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004980 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00004981 /* First check for possible aligned read of a C 'long'. Unaligned
4982 reads are more expensive, better to defer to another iteration. */
4983 if (!((size_t) q & LONG_PTR_MASK)) {
4984 /* Fast path for runs of non-surrogate chars. */
4985 register const unsigned char *_q = q;
4986 Py_UNICODE *_p = p;
4987 if (native_ordering) {
4988 /* Native ordering is simple: as long as the input cannot
4989 possibly contain a surrogate char, do an unrolled copy
4990 of several 16-bit code points to the target object.
4991 The non-surrogate check is done on several input bytes
4992 at a time (as many as a C 'long' can contain). */
4993 while (_q < aligned_end) {
4994 unsigned long data = * (unsigned long *) _q;
4995 if (data & FAST_CHAR_MASK)
4996 break;
4997 _p[0] = ((unsigned short *) _q)[0];
4998 _p[1] = ((unsigned short *) _q)[1];
4999#if (SIZEOF_LONG == 8)
5000 _p[2] = ((unsigned short *) _q)[2];
5001 _p[3] = ((unsigned short *) _q)[3];
5002#endif
5003 _q += SIZEOF_LONG;
5004 _p += SIZEOF_LONG / 2;
5005 }
5006 }
5007 else {
5008 /* Byteswapped ordering is similar, but we must decompose
5009 the copy bytewise, and take care of zero'ing out the
5010 upper bytes if the target object is in 32-bit units
5011 (that is, in UCS-4 builds). */
5012 while (_q < aligned_end) {
5013 unsigned long data = * (unsigned long *) _q;
5014 if (data & SWAPPED_FAST_CHAR_MASK)
5015 break;
5016 /* Zero upper bytes in UCS-4 builds */
5017#if (Py_UNICODE_SIZE > 2)
5018 _p[0] = 0;
5019 _p[1] = 0;
5020#if (SIZEOF_LONG == 8)
5021 _p[2] = 0;
5022 _p[3] = 0;
5023#endif
5024#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00005025 /* Issue #4916; UCS-4 builds on big endian machines must
5026 fill the two last bytes of each 4-byte unit. */
5027#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
5028# define OFF 2
5029#else
5030# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00005031#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00005032 ((unsigned char *) _p)[OFF + 1] = _q[0];
5033 ((unsigned char *) _p)[OFF + 0] = _q[1];
5034 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
5035 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
5036#if (SIZEOF_LONG == 8)
5037 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
5038 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
5039 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
5040 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
5041#endif
5042#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00005043 _q += SIZEOF_LONG;
5044 _p += SIZEOF_LONG / 2;
5045 }
5046 }
5047 p = _p;
5048 q = _q;
5049 if (q >= e)
5050 break;
5051 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005052 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005053
Benjamin Peterson14339b62009-01-31 16:36:08 +00005054 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00005055
5056 if (ch < 0xD800 || ch > 0xDFFF) {
5057 *p++ = ch;
5058 continue;
5059 }
5060
5061 /* UTF-16 code pair: */
5062 if (q > e) {
5063 errmsg = "unexpected end of data";
5064 startinpos = (((const char *)q) - 2) - starts;
5065 endinpos = ((const char *)e) + 1 - starts;
5066 goto utf16Error;
5067 }
5068 if (0xD800 <= ch && ch <= 0xDBFF) {
5069 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
5070 q += 2;
5071 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00005072#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005073 *p++ = ch;
5074 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005075#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005076 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005077#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005078 continue;
5079 }
5080 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005081 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00005082 startinpos = (((const char *)q)-4)-starts;
5083 endinpos = startinpos+2;
5084 goto utf16Error;
5085 }
5086
Benjamin Peterson14339b62009-01-31 16:36:08 +00005087 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005088 errmsg = "illegal encoding";
5089 startinpos = (((const char *)q)-2)-starts;
5090 endinpos = startinpos+2;
5091 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005092
Benjamin Peterson29060642009-01-31 22:14:21 +00005093 utf16Error:
5094 outpos = p - PyUnicode_AS_UNICODE(unicode);
5095 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005096 errors,
5097 &errorHandler,
5098 "utf16", errmsg,
5099 &starts,
5100 (const char **)&e,
5101 &startinpos,
5102 &endinpos,
5103 &exc,
5104 (const char **)&q,
5105 &unicode,
5106 &outpos,
5107 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00005108 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005109 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005110 /* remaining byte at the end? (size should be even) */
5111 if (e == q) {
5112 if (!consumed) {
5113 errmsg = "truncated data";
5114 startinpos = ((const char *)q) - starts;
5115 endinpos = ((const char *)e) + 1 - starts;
5116 outpos = p - PyUnicode_AS_UNICODE(unicode);
5117 if (unicode_decode_call_errorhandler(
5118 errors,
5119 &errorHandler,
5120 "utf16", errmsg,
5121 &starts,
5122 (const char **)&e,
5123 &startinpos,
5124 &endinpos,
5125 &exc,
5126 (const char **)&q,
5127 &unicode,
5128 &outpos,
5129 &p))
5130 goto onError;
5131 /* The remaining input chars are ignored if the callback
5132 chooses to skip the input */
5133 }
5134 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005135
5136 if (byteorder)
5137 *byteorder = bo;
5138
Walter Dörwald69652032004-09-07 20:24:22 +00005139 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005140 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005141
Guido van Rossumd57fd912000-03-10 22:53:23 +00005142 /* Adjust length */
Victor Stinnerfe226c02011-10-03 03:52:20 +02005143 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005144 goto onError;
5145
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005146 Py_XDECREF(errorHandler);
5147 Py_XDECREF(exc);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005148 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005149 Py_DECREF(unicode);
5150 return NULL;
5151 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005152 return (PyObject *)unicode;
5153
Benjamin Peterson29060642009-01-31 22:14:21 +00005154 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005155 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005156 Py_XDECREF(errorHandler);
5157 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005158 return NULL;
5159}
5160
Antoine Pitrouab868312009-01-10 15:40:25 +00005161#undef FAST_CHAR_MASK
5162#undef SWAPPED_FAST_CHAR_MASK
5163
Tim Peters772747b2001-08-09 22:21:55 +00005164PyObject *
5165PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005166 Py_ssize_t size,
5167 const char *errors,
5168 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005169{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005170 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00005171 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005172 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005173#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005174 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005175#else
5176 const int pairs = 0;
5177#endif
Tim Peters772747b2001-08-09 22:21:55 +00005178 /* Offsets from p for storing byte pairs in the right order. */
5179#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5180 int ihi = 1, ilo = 0;
5181#else
5182 int ihi = 0, ilo = 1;
5183#endif
5184
Benjamin Peterson29060642009-01-31 22:14:21 +00005185#define STORECHAR(CH) \
5186 do { \
5187 p[ihi] = ((CH) >> 8) & 0xff; \
5188 p[ilo] = (CH) & 0xff; \
5189 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00005190 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005191
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005192#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005193 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00005194 if (s[i] >= 0x10000)
5195 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005196#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005197 /* 2 * (size + pairs + (byteorder == 0)) */
5198 if (size > PY_SSIZE_T_MAX ||
5199 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005200 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005201 nsize = size + pairs + (byteorder == 0);
5202 bytesize = nsize * 2;
5203 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005204 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005205 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005206 if (v == NULL)
5207 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005208
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005209 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005210 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005211 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005212 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005213 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005214
5215 if (byteorder == -1) {
5216 /* force LE */
5217 ihi = 1;
5218 ilo = 0;
5219 }
5220 else if (byteorder == 1) {
5221 /* force BE */
5222 ihi = 0;
5223 ilo = 1;
5224 }
5225
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005226 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005227 Py_UNICODE ch = *s++;
5228 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005229#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005230 if (ch >= 0x10000) {
5231 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
5232 ch = 0xD800 | ((ch-0x10000) >> 10);
5233 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005234#endif
Tim Peters772747b2001-08-09 22:21:55 +00005235 STORECHAR(ch);
5236 if (ch2)
5237 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005238 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005239
5240 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005241 return v;
Tim Peters772747b2001-08-09 22:21:55 +00005242#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005243}
5244
Alexander Belopolsky40018472011-02-26 01:02:56 +00005245PyObject *
5246PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005247{
5248 if (!PyUnicode_Check(unicode)) {
5249 PyErr_BadArgument();
5250 return NULL;
5251 }
5252 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005253 PyUnicode_GET_SIZE(unicode),
5254 NULL,
5255 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005256}
5257
5258/* --- Unicode Escape Codec ----------------------------------------------- */
5259
/* Helper function for PyUnicode_DecodeUnicodeEscape: determines whether
   the escaped input decodes to a pure ASCII string and, if it does, how
   many characters the decoded string has.

   s    - input bytes; exactly size bytes are examined, no terminator is
          required
   size - number of bytes in s

   Returns the number of characters of the decoded string, or -1 when
     - size is negative,
     - a byte above 127 is present,
     - the input ends in a lone backslash, or
     - an octal, x, u, U or N escape is present (their result is not
       guaranteed to be ASCII).
   A backslash followed by a newline contributes no characters; an
   unrecognized escape contributes two (the backslash and the character
   after it). */
Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *p;
    const unsigned char *end;
    Py_ssize_t length = 0;

    /* Validate size before doing any pointer arithmetic with it: forming
       s + size for a negative size would point outside the buffer. */
    if (size < 0)
        return -1;

    p = (const unsigned char *)s;
    end = p + size;

    for (; p < end; ++p) {
        if (*p > 127) {
            /* Non-ASCII */
            return -1;
        }
        else if (*p != '\\') {
            /* Normal character */
            ++length;
        }
        else {
            /* Backslash-escape, check next char */
            ++p;
            /* Escape sequence reaches till end of string or
               non-ASCII follow-up. */
            if (p >= end || *p > 127)
                return -1;
            switch (*p) {
            case '\n':
                /* backslash + \n result in zero characters */
                break;
            case '\\': case '\'': case '\"':
            case 'b': case 'f': case 't':
            case 'n': case 'r': case 'v': case 'a':
                ++length;
                break;
            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7':
            case 'x': case 'u': case 'U': case 'N':
                /* these do not guarantee ASCII characters */
                return -1;
            default:
                /* count the backslash + the other character */
                length += 2;
                break;
            }
        }
    }
    return length;
}
5314
/* Similar to PyUnicode_WRITE but either write into wstr field
   or treat string as ASCII.

   kind  - PyUnicode_WCHAR_KIND selects a Py_UNICODE store; any other
           kind selects a single-byte (unsigned char) store
   buf   - start of the output buffer
   index - element index to write (evaluated once)
   value - value to store, cast to the element type (evaluated once) */
#define WRITE_ASCII_OR_WSTR(kind, buf, index, value) \
    do { \
        if ((kind) != PyUnicode_WCHAR_KIND) \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
        else \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \
    } while (0)

/* Store value as a Py_UNICODE at buf[index].

   Relies on a variable named `kind` being in scope at the expansion site
   and asserts that it equals PyUnicode_WCHAR_KIND.  The expansion is
   wrapped in parentheses so the comma expression stays a single
   expression wherever the macro is used; its value is the stored
   Py_UNICODE. */
#define WRITE_WSTR(buf, index, value) \
    (assert(kind == PyUnicode_WCHAR_KIND), \
     ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value))
5328
5329
Fredrik Lundh06d12682001-01-24 07:59:11 +00005330static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005331
Alexander Belopolsky40018472011-02-26 01:02:56 +00005332PyObject *
5333PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005334 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005335 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005336{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005337 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005338 Py_ssize_t startinpos;
5339 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005340 int j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005341 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005342 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005343 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005344 char* message;
5345 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005346 PyObject *errorHandler = NULL;
5347 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005348 Py_ssize_t ascii_length;
5349 Py_ssize_t i;
5350 int kind;
5351 void *data;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005352
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005353 ascii_length = length_of_escaped_ascii_string(s, size);
5354
5355 /* After length_of_escaped_ascii_string() there are two alternatives,
5356 either the string is pure ASCII with named escapes like \n, etc.
5357 and we determined it's exact size (common case)
5358 or it contains \x, \u, ... escape sequences. then we create a
5359 legacy wchar string and resize it at the end of this function. */
5360 if (ascii_length >= 0) {
5361 v = (PyUnicodeObject *)PyUnicode_New(ascii_length, 127);
5362 if (!v)
5363 goto onError;
5364 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
5365 kind = PyUnicode_1BYTE_KIND;
5366 data = PyUnicode_DATA(v);
5367 }
5368 else {
5369 /* Escaped strings will always be longer than the resulting
5370 Unicode string, so we start with size here and then reduce the
5371 length after conversion to the true value.
5372 (but if the error callback returns a long replacement string
5373 we'll have to allocate more space) */
5374 v = _PyUnicode_New(size);
5375 if (!v)
5376 goto onError;
5377 kind = PyUnicode_WCHAR_KIND;
5378 data = PyUnicode_AS_UNICODE(v);
5379 }
5380
Guido van Rossumd57fd912000-03-10 22:53:23 +00005381 if (size == 0)
5382 return (PyObject *)v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005383 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005384 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005385
Guido van Rossumd57fd912000-03-10 22:53:23 +00005386 while (s < end) {
5387 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005388 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005389 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005390
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005391 if (kind == PyUnicode_WCHAR_KIND) {
5392 assert(i < _PyUnicode_WSTR_LENGTH(v));
5393 }
5394 else {
5395 /* The only case in which i == ascii_length is a backslash
5396 followed by a newline. */
5397 assert(i <= ascii_length);
5398 }
5399
Guido van Rossumd57fd912000-03-10 22:53:23 +00005400 /* Non-escape characters are interpreted as Unicode ordinals */
5401 if (*s != '\\') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005402 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char) *s++);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005403 continue;
5404 }
5405
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005406 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005407 /* \ - Escapes */
5408 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005409 c = *s++;
5410 if (s > end)
5411 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005412
5413 if (kind == PyUnicode_WCHAR_KIND) {
5414 assert(i < _PyUnicode_WSTR_LENGTH(v));
5415 }
5416 else {
5417 /* The only case in which i == ascii_length is a backslash
5418 followed by a newline. */
5419 assert(i < ascii_length || (i == ascii_length && c == '\n'));
5420 }
5421
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005422 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005423
Benjamin Peterson29060642009-01-31 22:14:21 +00005424 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005425 case '\n': break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005426 case '\\': WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); break;
5427 case '\'': WRITE_ASCII_OR_WSTR(kind, data, i++, '\''); break;
5428 case '\"': WRITE_ASCII_OR_WSTR(kind, data, i++, '\"'); break;
5429 case 'b': WRITE_ASCII_OR_WSTR(kind, data, i++, '\b'); break;
5430 /* FF */
5431 case 'f': WRITE_ASCII_OR_WSTR(kind, data, i++, '\014'); break;
5432 case 't': WRITE_ASCII_OR_WSTR(kind, data, i++, '\t'); break;
5433 case 'n': WRITE_ASCII_OR_WSTR(kind, data, i++, '\n'); break;
5434 case 'r': WRITE_ASCII_OR_WSTR(kind, data, i++, '\r'); break;
5435 /* VT */
5436 case 'v': WRITE_ASCII_OR_WSTR(kind, data, i++, '\013'); break;
5437 /* BEL, not classic C */
5438 case 'a': WRITE_ASCII_OR_WSTR(kind, data, i++, '\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005439
Benjamin Peterson29060642009-01-31 22:14:21 +00005440 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005441 case '0': case '1': case '2': case '3':
5442 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005443 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005444 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005445 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005446 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005447 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005448 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005449 WRITE_WSTR(data, i++, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005450 break;
5451
Benjamin Peterson29060642009-01-31 22:14:21 +00005452 /* hex escapes */
5453 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005454 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005455 digits = 2;
5456 message = "truncated \\xXX escape";
5457 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005458
Benjamin Peterson29060642009-01-31 22:14:21 +00005459 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005460 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005461 digits = 4;
5462 message = "truncated \\uXXXX escape";
5463 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005464
Benjamin Peterson29060642009-01-31 22:14:21 +00005465 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005466 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005467 digits = 8;
5468 message = "truncated \\UXXXXXXXX escape";
5469 hexescape:
5470 chr = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005471 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005472 if (s+digits>end) {
5473 endinpos = size;
5474 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005475 errors, &errorHandler,
5476 "unicodeescape", "end of string in escape sequence",
5477 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005478 &v, &i, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005479 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005480 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005481 goto nextByte;
5482 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005483 for (j = 0; j < digits; ++j) {
5484 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005485 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005486 endinpos = (s+j+1)-starts;
5487 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005488 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005489 errors, &errorHandler,
5490 "unicodeescape", message,
5491 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005492 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005493 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005494 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005495 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005496 }
5497 chr = (chr<<4) & ~0xF;
5498 if (c >= '0' && c <= '9')
5499 chr += c - '0';
5500 else if (c >= 'a' && c <= 'f')
5501 chr += 10 + c - 'a';
5502 else
5503 chr += 10 + c - 'A';
5504 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005505 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005506 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005507 /* _decoding_error will have already written into the
5508 target buffer. */
5509 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005510 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005511 /* when we get here, chr is a 32-bit unicode character */
5512 if (chr <= 0xffff)
5513 /* UCS-2 character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005514 WRITE_WSTR(data, i++, chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005515 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005516 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00005517 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00005518#ifdef Py_UNICODE_WIDE
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005519 WRITE_WSTR(data, i++, chr);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005520#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00005521 chr -= 0x10000L;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005522 WRITE_WSTR(data, i++, 0xD800 + (Py_UNICODE) (chr >> 10));
5523 WRITE_WSTR(data, i++, 0xDC00 + (Py_UNICODE) (chr & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005524#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00005525 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005526 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005527 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005528 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005529 errors, &errorHandler,
5530 "unicodeescape", "illegal Unicode character",
5531 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005532 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005533 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005534 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005535 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005536 break;
5537
Benjamin Peterson29060642009-01-31 22:14:21 +00005538 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005539 case 'N':
5540 message = "malformed \\N character escape";
5541 if (ucnhash_CAPI == NULL) {
5542 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005543 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5544 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005545 if (ucnhash_CAPI == NULL)
5546 goto ucnhashError;
5547 }
5548 if (*s == '{') {
5549 const char *start = s+1;
5550 /* look for the closing brace */
5551 while (*s != '}' && s < end)
5552 s++;
5553 if (s > start && s < end && *s == '}') {
5554 /* found a name. look it up in the unicode database */
5555 message = "unknown Unicode character name";
5556 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005557 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
5558 &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005559 goto store;
5560 }
5561 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005562 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005563 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005564 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005565 errors, &errorHandler,
5566 "unicodeescape", message,
5567 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005568 &v, &i, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005569 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005570 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005571 break;
5572
5573 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005574 if (s > end) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005575 assert(kind == PyUnicode_WCHAR_KIND);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005576 message = "\\ at end of string";
5577 s--;
5578 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005579 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005580 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005581 errors, &errorHandler,
5582 "unicodeescape", message,
5583 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005584 &v, &i, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00005585 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005586 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005587 }
5588 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005589 WRITE_ASCII_OR_WSTR(kind, data, i++, '\\');
5590 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005591 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005592 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005593 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005594 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005595 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005596 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005597 /* Ensure the length prediction worked in case of ASCII strings */
5598 assert(kind == PyUnicode_WCHAR_KIND || i == ascii_length);
5599
Victor Stinnerfe226c02011-10-03 03:52:20 +02005600 if (kind == PyUnicode_WCHAR_KIND)
5601 {
5602 if (PyUnicode_Resize((PyObject**)&v, i) < 0)
5603 goto onError;
Victor Stinnerfe226c02011-10-03 03:52:20 +02005604 }
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005605 Py_XDECREF(errorHandler);
5606 Py_XDECREF(exc);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005607 if (_PyUnicode_READY_REPLACE(&v)) {
5608 Py_DECREF(v);
5609 return NULL;
5610 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005611 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00005612
Benjamin Peterson29060642009-01-31 22:14:21 +00005613 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005614 PyErr_SetString(
5615 PyExc_UnicodeError,
5616 "\\N escapes not supported (can't load unicodedata module)"
5617 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005618 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005619 Py_XDECREF(errorHandler);
5620 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005621 return NULL;
5622
Benjamin Peterson29060642009-01-31 22:14:21 +00005623 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005624 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005625 Py_XDECREF(errorHandler);
5626 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005627 return NULL;
5628}
5629
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005630#undef WRITE_ASCII_OR_WSTR
5631#undef WRITE_WSTR
5632
Guido van Rossumd57fd912000-03-10 22:53:23 +00005633/* Return a Unicode-Escape string version of the Unicode object.
5634
5635 If quotes is true, the string is enclosed in u"" or u'' quotes as
5636 appropriate.
5637
5638*/
5639
Walter Dörwald79e913e2007-05-12 11:08:06 +00005640static const char *hexdigits = "0123456789abcdef";
5641
Alexander Belopolsky40018472011-02-26 01:02:56 +00005642PyObject *
5643PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005644 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005645{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005646 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005647 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005648
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005649#ifdef Py_UNICODE_WIDE
5650 const Py_ssize_t expandsize = 10;
5651#else
5652 const Py_ssize_t expandsize = 6;
5653#endif
5654
Thomas Wouters89f507f2006-12-13 04:49:30 +00005655 /* XXX(nnorwitz): rather than over-allocating, it would be
5656 better to choose a different scheme. Perhaps scan the
5657 first N-chars of the string and allocate based on that size.
5658 */
5659 /* Initial allocation is based on the longest-possible unichr
5660 escape.
5661
5662 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5663 unichr, so in this case it's the longest unichr escape. In
5664 narrow (UTF-16) builds this is five chars per source unichr
5665 since there are two unichrs in the surrogate pair, so in narrow
5666 (UTF-16) builds it's not the longest unichr escape.
5667
5668 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5669 so in the narrow (UTF-16) build case it's the longest unichr
5670 escape.
5671 */
5672
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005673 if (size == 0)
5674 return PyBytes_FromStringAndSize(NULL, 0);
5675
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005676 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005677 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005678
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005679 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005680 2
5681 + expandsize*size
5682 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005683 if (repr == NULL)
5684 return NULL;
5685
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005686 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005687
Guido van Rossumd57fd912000-03-10 22:53:23 +00005688 while (size-- > 0) {
5689 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005690
Walter Dörwald79e913e2007-05-12 11:08:06 +00005691 /* Escape backslashes */
5692 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005693 *p++ = '\\';
5694 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005695 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005696 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005697
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00005698#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005699 /* Map 21-bit characters to '\U00xxxxxx' */
5700 else if (ch >= 0x10000) {
5701 *p++ = '\\';
5702 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005703 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
5704 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
5705 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
5706 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
5707 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
5708 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
5709 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
5710 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005711 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005712 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005713#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005714 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5715 else if (ch >= 0xD800 && ch < 0xDC00) {
5716 Py_UNICODE ch2;
5717 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00005718
Benjamin Peterson29060642009-01-31 22:14:21 +00005719 ch2 = *s++;
5720 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005721 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005722 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5723 *p++ = '\\';
5724 *p++ = 'U';
5725 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
5726 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
5727 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
5728 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
5729 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
5730 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
5731 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
5732 *p++ = hexdigits[ucs & 0x0000000F];
5733 continue;
5734 }
5735 /* Fall through: isolated surrogates are copied as-is */
5736 s--;
5737 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005738 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005739#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005740
Guido van Rossumd57fd912000-03-10 22:53:23 +00005741 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005742 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005743 *p++ = '\\';
5744 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005745 *p++ = hexdigits[(ch >> 12) & 0x000F];
5746 *p++ = hexdigits[(ch >> 8) & 0x000F];
5747 *p++ = hexdigits[(ch >> 4) & 0x000F];
5748 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005749 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005750
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005751 /* Map special whitespace to '\t', \n', '\r' */
5752 else if (ch == '\t') {
5753 *p++ = '\\';
5754 *p++ = 't';
5755 }
5756 else if (ch == '\n') {
5757 *p++ = '\\';
5758 *p++ = 'n';
5759 }
5760 else if (ch == '\r') {
5761 *p++ = '\\';
5762 *p++ = 'r';
5763 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005764
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005765 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005766 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005767 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005768 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005769 *p++ = hexdigits[(ch >> 4) & 0x000F];
5770 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005771 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005772
Guido van Rossumd57fd912000-03-10 22:53:23 +00005773 /* Copy everything else as-is */
5774 else
5775 *p++ = (char) ch;
5776 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005777
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005778 assert(p - PyBytes_AS_STRING(repr) > 0);
5779 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5780 return NULL;
5781 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005782}
5783
Alexander Belopolsky40018472011-02-26 01:02:56 +00005784PyObject *
5785PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005786{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005787 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005788 if (!PyUnicode_Check(unicode)) {
5789 PyErr_BadArgument();
5790 return NULL;
5791 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00005792 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5793 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005794 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005795}
5796
5797/* --- Raw Unicode Escape Codec ------------------------------------------- */
5798
Alexander Belopolsky40018472011-02-26 01:02:56 +00005799PyObject *
5800PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005801 Py_ssize_t size,
5802 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005803{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005804 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005805 Py_ssize_t startinpos;
5806 Py_ssize_t endinpos;
5807 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005808 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005809 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005810 const char *end;
5811 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005812 PyObject *errorHandler = NULL;
5813 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005814
Guido van Rossumd57fd912000-03-10 22:53:23 +00005815 /* Escaped strings will always be longer than the resulting
5816 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005817 length after conversion to the true value. (But decoding error
5818 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005819 v = _PyUnicode_New(size);
5820 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005821 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005822 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005823 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005824 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005825 end = s + size;
5826 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005827 unsigned char c;
5828 Py_UCS4 x;
5829 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005830 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005831
Benjamin Peterson29060642009-01-31 22:14:21 +00005832 /* Non-escape characters are interpreted as Unicode ordinals */
5833 if (*s != '\\') {
5834 *p++ = (unsigned char)*s++;
5835 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005836 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005837 startinpos = s-starts;
5838
5839 /* \u-escapes are only interpreted iff the number of leading
5840 backslashes if odd */
5841 bs = s;
5842 for (;s < end;) {
5843 if (*s != '\\')
5844 break;
5845 *p++ = (unsigned char)*s++;
5846 }
5847 if (((s - bs) & 1) == 0 ||
5848 s >= end ||
5849 (*s != 'u' && *s != 'U')) {
5850 continue;
5851 }
5852 p--;
5853 count = *s=='u' ? 4 : 8;
5854 s++;
5855
5856 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
5857 outpos = p-PyUnicode_AS_UNICODE(v);
5858 for (x = 0, i = 0; i < count; ++i, ++s) {
5859 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00005860 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005861 endinpos = s-starts;
5862 if (unicode_decode_call_errorhandler(
5863 errors, &errorHandler,
5864 "rawunicodeescape", "truncated \\uXXXX",
5865 &starts, &end, &startinpos, &endinpos, &exc, &s,
5866 &v, &outpos, &p))
5867 goto onError;
5868 goto nextByte;
5869 }
5870 x = (x<<4) & ~0xF;
5871 if (c >= '0' && c <= '9')
5872 x += c - '0';
5873 else if (c >= 'a' && c <= 'f')
5874 x += 10 + c - 'a';
5875 else
5876 x += 10 + c - 'A';
5877 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00005878 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00005879 /* UCS-2 character */
5880 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005881 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005882 /* UCS-4 character. Either store directly, or as
5883 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00005884#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005885 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005886#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005887 x -= 0x10000L;
5888 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
5889 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00005890#endif
5891 } else {
5892 endinpos = s-starts;
5893 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005894 if (unicode_decode_call_errorhandler(
5895 errors, &errorHandler,
5896 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00005897 &starts, &end, &startinpos, &endinpos, &exc, &s,
5898 &v, &outpos, &p))
5899 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005900 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005901 nextByte:
5902 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005903 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02005904 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005905 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005906 Py_XDECREF(errorHandler);
5907 Py_XDECREF(exc);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005908 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005909 Py_DECREF(v);
5910 return NULL;
5911 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005912 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00005913
Benjamin Peterson29060642009-01-31 22:14:21 +00005914 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005915 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005916 Py_XDECREF(errorHandler);
5917 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005918 return NULL;
5919}
5920
Alexander Belopolsky40018472011-02-26 01:02:56 +00005921PyObject *
5922PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005923 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005924{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005925 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005926 char *p;
5927 char *q;
5928
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005929#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005930 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005931#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005932 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005933#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00005934
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005935 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005936 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00005937
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005938 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005939 if (repr == NULL)
5940 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005941 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005942 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005943
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005944 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005945 while (size-- > 0) {
5946 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005947#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005948 /* Map 32-bit characters to '\Uxxxxxxxx' */
5949 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005950 *p++ = '\\';
5951 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00005952 *p++ = hexdigits[(ch >> 28) & 0xf];
5953 *p++ = hexdigits[(ch >> 24) & 0xf];
5954 *p++ = hexdigits[(ch >> 20) & 0xf];
5955 *p++ = hexdigits[(ch >> 16) & 0xf];
5956 *p++ = hexdigits[(ch >> 12) & 0xf];
5957 *p++ = hexdigits[(ch >> 8) & 0xf];
5958 *p++ = hexdigits[(ch >> 4) & 0xf];
5959 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00005960 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005961 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00005962#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005963 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5964 if (ch >= 0xD800 && ch < 0xDC00) {
5965 Py_UNICODE ch2;
5966 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005967
Benjamin Peterson29060642009-01-31 22:14:21 +00005968 ch2 = *s++;
5969 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005970 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005971 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5972 *p++ = '\\';
5973 *p++ = 'U';
5974 *p++ = hexdigits[(ucs >> 28) & 0xf];
5975 *p++ = hexdigits[(ucs >> 24) & 0xf];
5976 *p++ = hexdigits[(ucs >> 20) & 0xf];
5977 *p++ = hexdigits[(ucs >> 16) & 0xf];
5978 *p++ = hexdigits[(ucs >> 12) & 0xf];
5979 *p++ = hexdigits[(ucs >> 8) & 0xf];
5980 *p++ = hexdigits[(ucs >> 4) & 0xf];
5981 *p++ = hexdigits[ucs & 0xf];
5982 continue;
5983 }
5984 /* Fall through: isolated surrogates are copied as-is */
5985 s--;
5986 size++;
5987 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005988#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005989 /* Map 16-bit characters to '\uxxxx' */
5990 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005991 *p++ = '\\';
5992 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00005993 *p++ = hexdigits[(ch >> 12) & 0xf];
5994 *p++ = hexdigits[(ch >> 8) & 0xf];
5995 *p++ = hexdigits[(ch >> 4) & 0xf];
5996 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005997 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005998 /* Copy everything else as-is */
5999 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006000 *p++ = (char) ch;
6001 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006002 size = p - q;
6003
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006004 assert(size > 0);
6005 if (_PyBytes_Resize(&repr, size) < 0)
6006 return NULL;
6007 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006008}
6009
Alexander Belopolsky40018472011-02-26 01:02:56 +00006010PyObject *
6011PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006012{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006013 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006014 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00006015 PyErr_BadArgument();
6016 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006017 }
Walter Dörwald711005d2007-05-12 12:03:26 +00006018 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
6019 PyUnicode_GET_SIZE(unicode));
6020
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006021 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006022}
6023
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006024/* --- Unicode Internal Codec ------------------------------------------- */
6025
Alexander Belopolsky40018472011-02-26 01:02:56 +00006026PyObject *
6027_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006028 Py_ssize_t size,
6029 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006030{
6031 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006032 Py_ssize_t startinpos;
6033 Py_ssize_t endinpos;
6034 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006035 PyUnicodeObject *v;
6036 Py_UNICODE *p;
6037 const char *end;
6038 const char *reason;
6039 PyObject *errorHandler = NULL;
6040 PyObject *exc = NULL;
6041
Neal Norwitzd43069c2006-01-08 01:12:10 +00006042#ifdef Py_UNICODE_WIDE
6043 Py_UNICODE unimax = PyUnicode_GetMax();
6044#endif
6045
Thomas Wouters89f507f2006-12-13 04:49:30 +00006046 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006047 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
6048 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006049 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006050 /* Intentionally PyUnicode_GET_SIZE instead of PyUnicode_GET_LENGTH
6051 as string was created with the old API. */
6052 if (PyUnicode_GET_SIZE(v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006053 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006054 p = PyUnicode_AS_UNICODE(v);
6055 end = s + size;
6056
6057 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006058 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006059 /* We have to sanity check the raw data, otherwise doom looms for
6060 some malformed UCS-4 data. */
6061 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006062#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006063 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006064#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006065 end-s < Py_UNICODE_SIZE
6066 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006067 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006068 startinpos = s - starts;
6069 if (end-s < Py_UNICODE_SIZE) {
6070 endinpos = end-starts;
6071 reason = "truncated input";
6072 }
6073 else {
6074 endinpos = s - starts + Py_UNICODE_SIZE;
6075 reason = "illegal code point (> 0x10FFFF)";
6076 }
6077 outpos = p - PyUnicode_AS_UNICODE(v);
6078 if (unicode_decode_call_errorhandler(
6079 errors, &errorHandler,
6080 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006081 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00006082 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006083 goto onError;
6084 }
6085 }
6086 else {
6087 p++;
6088 s += Py_UNICODE_SIZE;
6089 }
6090 }
6091
Victor Stinnerfe226c02011-10-03 03:52:20 +02006092 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006093 goto onError;
6094 Py_XDECREF(errorHandler);
6095 Py_XDECREF(exc);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006096 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006097 Py_DECREF(v);
6098 return NULL;
6099 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006100 return (PyObject *)v;
6101
Benjamin Peterson29060642009-01-31 22:14:21 +00006102 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006103 Py_XDECREF(v);
6104 Py_XDECREF(errorHandler);
6105 Py_XDECREF(exc);
6106 return NULL;
6107}
6108
Guido van Rossumd57fd912000-03-10 22:53:23 +00006109/* --- Latin-1 Codec ------------------------------------------------------ */
6110
Alexander Belopolsky40018472011-02-26 01:02:56 +00006111PyObject *
6112PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006113 Py_ssize_t size,
6114 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006115{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006116 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006117 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006118}
6119
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006120/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006121static void
6122make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006123 const char *encoding,
6124 const Py_UNICODE *unicode, Py_ssize_t size,
6125 Py_ssize_t startpos, Py_ssize_t endpos,
6126 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006127{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006128 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006129 *exceptionObject = PyUnicodeEncodeError_Create(
6130 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006131 }
6132 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006133 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6134 goto onError;
6135 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6136 goto onError;
6137 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6138 goto onError;
6139 return;
6140 onError:
6141 Py_DECREF(*exceptionObject);
6142 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006143 }
6144}
6145
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006146/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006147static void
6148raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006149 const char *encoding,
6150 const Py_UNICODE *unicode, Py_ssize_t size,
6151 Py_ssize_t startpos, Py_ssize_t endpos,
6152 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006153{
6154 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006155 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006156 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006157 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006158}
6159
6160/* error handling callback helper:
6161 build arguments, call the callback and check the arguments,
6162 put the result into newpos and return the replacement string, which
6163 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006164static PyObject *
6165unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006166 PyObject **errorHandler,
6167 const char *encoding, const char *reason,
6168 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
6169 Py_ssize_t startpos, Py_ssize_t endpos,
6170 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006171{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006172 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006173
6174 PyObject *restuple;
6175 PyObject *resunicode;
6176
6177 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006178 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006179 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006180 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006181 }
6182
6183 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006184 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006185 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006186 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006187
6188 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006189 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006190 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006191 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006192 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006193 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006194 Py_DECREF(restuple);
6195 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006196 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006197 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006198 &resunicode, newpos)) {
6199 Py_DECREF(restuple);
6200 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006201 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006202 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6203 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6204 Py_DECREF(restuple);
6205 return NULL;
6206 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006207 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006208 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006209 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006210 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6211 Py_DECREF(restuple);
6212 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006213 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006214 Py_INCREF(resunicode);
6215 Py_DECREF(restuple);
6216 return resunicode;
6217}
6218
Alexander Belopolsky40018472011-02-26 01:02:56 +00006219static PyObject *
6220unicode_encode_ucs1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006221 Py_ssize_t size,
6222 const char *errors,
6223 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006224{
6225 /* output object */
6226 PyObject *res;
6227 /* pointers to the beginning and end+1 of input */
6228 const Py_UNICODE *startp = p;
6229 const Py_UNICODE *endp = p + size;
6230 /* pointer to the beginning of the unencodable characters */
6231 /* const Py_UNICODE *badp = NULL; */
6232 /* pointer into the output */
6233 char *str;
6234 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006235 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006236 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6237 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006238 PyObject *errorHandler = NULL;
6239 PyObject *exc = NULL;
6240 /* the following variable is used for caching string comparisons
6241 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6242 int known_errorHandler = -1;
6243
6244 /* allocate enough for a simple encoding without
6245 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006246 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006247 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006248 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006249 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006250 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006251 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006252 ressize = size;
6253
6254 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006255 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006256
Benjamin Peterson29060642009-01-31 22:14:21 +00006257 /* can we encode this? */
6258 if (c<limit) {
6259 /* no overflow check, because we know that the space is enough */
6260 *str++ = (char)c;
6261 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006262 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006263 else {
6264 Py_ssize_t unicodepos = p-startp;
6265 Py_ssize_t requiredsize;
6266 PyObject *repunicode;
6267 Py_ssize_t repsize;
6268 Py_ssize_t newpos;
6269 Py_ssize_t respos;
6270 Py_UNICODE *uni2;
6271 /* startpos for collecting unencodable chars */
6272 const Py_UNICODE *collstart = p;
6273 const Py_UNICODE *collend = p;
6274 /* find all unecodable characters */
6275 while ((collend < endp) && ((*collend)>=limit))
6276 ++collend;
6277 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6278 if (known_errorHandler==-1) {
6279 if ((errors==NULL) || (!strcmp(errors, "strict")))
6280 known_errorHandler = 1;
6281 else if (!strcmp(errors, "replace"))
6282 known_errorHandler = 2;
6283 else if (!strcmp(errors, "ignore"))
6284 known_errorHandler = 3;
6285 else if (!strcmp(errors, "xmlcharrefreplace"))
6286 known_errorHandler = 4;
6287 else
6288 known_errorHandler = 0;
6289 }
6290 switch (known_errorHandler) {
6291 case 1: /* strict */
6292 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
6293 goto onError;
6294 case 2: /* replace */
6295 while (collstart++<collend)
6296 *str++ = '?'; /* fall through */
6297 case 3: /* ignore */
6298 p = collend;
6299 break;
6300 case 4: /* xmlcharrefreplace */
6301 respos = str - PyBytes_AS_STRING(res);
6302 /* determine replacement size (temporarily (mis)uses p) */
6303 for (p = collstart, repsize = 0; p < collend; ++p) {
6304 if (*p<10)
6305 repsize += 2+1+1;
6306 else if (*p<100)
6307 repsize += 2+2+1;
6308 else if (*p<1000)
6309 repsize += 2+3+1;
6310 else if (*p<10000)
6311 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006312#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006313 else
6314 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006315#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006316 else if (*p<100000)
6317 repsize += 2+5+1;
6318 else if (*p<1000000)
6319 repsize += 2+6+1;
6320 else
6321 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00006322#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00006323 }
6324 requiredsize = respos+repsize+(endp-collend);
6325 if (requiredsize > ressize) {
6326 if (requiredsize<2*ressize)
6327 requiredsize = 2*ressize;
6328 if (_PyBytes_Resize(&res, requiredsize))
6329 goto onError;
6330 str = PyBytes_AS_STRING(res) + respos;
6331 ressize = requiredsize;
6332 }
6333 /* generate replacement (temporarily (mis)uses p) */
6334 for (p = collstart; p < collend; ++p) {
6335 str += sprintf(str, "&#%d;", (int)*p);
6336 }
6337 p = collend;
6338 break;
6339 default:
6340 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6341 encoding, reason, startp, size, &exc,
6342 collstart-startp, collend-startp, &newpos);
6343 if (repunicode == NULL)
6344 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006345 if (PyBytes_Check(repunicode)) {
6346 /* Directly copy bytes result to output. */
6347 repsize = PyBytes_Size(repunicode);
6348 if (repsize > 1) {
6349 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006350 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006351 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6352 Py_DECREF(repunicode);
6353 goto onError;
6354 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006355 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006356 ressize += repsize-1;
6357 }
6358 memcpy(str, PyBytes_AsString(repunicode), repsize);
6359 str += repsize;
6360 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006361 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006362 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006363 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006364 /* need more space? (at least enough for what we
6365 have+the replacement+the rest of the string, so
6366 we won't have to check space for encodable characters) */
6367 respos = str - PyBytes_AS_STRING(res);
6368 repsize = PyUnicode_GET_SIZE(repunicode);
6369 requiredsize = respos+repsize+(endp-collend);
6370 if (requiredsize > ressize) {
6371 if (requiredsize<2*ressize)
6372 requiredsize = 2*ressize;
6373 if (_PyBytes_Resize(&res, requiredsize)) {
6374 Py_DECREF(repunicode);
6375 goto onError;
6376 }
6377 str = PyBytes_AS_STRING(res) + respos;
6378 ressize = requiredsize;
6379 }
6380 /* check if there is anything unencodable in the replacement
6381 and copy it to the output */
6382 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
6383 c = *uni2;
6384 if (c >= limit) {
6385 raise_encode_exception(&exc, encoding, startp, size,
6386 unicodepos, unicodepos+1, reason);
6387 Py_DECREF(repunicode);
6388 goto onError;
6389 }
6390 *str = (char)c;
6391 }
6392 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006393 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006394 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006395 }
6396 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006397 /* Resize if we allocated to much */
6398 size = str - PyBytes_AS_STRING(res);
6399 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006400 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006401 if (_PyBytes_Resize(&res, size) < 0)
6402 goto onError;
6403 }
6404
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006405 Py_XDECREF(errorHandler);
6406 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006407 return res;
6408
6409 onError:
6410 Py_XDECREF(res);
6411 Py_XDECREF(errorHandler);
6412 Py_XDECREF(exc);
6413 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006414}
6415
Alexander Belopolsky40018472011-02-26 01:02:56 +00006416PyObject *
6417PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006418 Py_ssize_t size,
6419 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006420{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006421 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006422}
6423
Alexander Belopolsky40018472011-02-26 01:02:56 +00006424PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006425_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006426{
6427 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006428 PyErr_BadArgument();
6429 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006430 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006431 if (PyUnicode_READY(unicode) == -1)
6432 return NULL;
6433 /* Fast path: if it is a one-byte string, construct
6434 bytes object directly. */
6435 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6436 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6437 PyUnicode_GET_LENGTH(unicode));
6438 /* Non-Latin-1 characters present. Defer to above function to
6439 raise the exception. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006440 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006441 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006442 errors);
6443}
6444
6445PyObject*
6446PyUnicode_AsLatin1String(PyObject *unicode)
6447{
6448 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006449}
6450
6451/* --- 7-bit ASCII Codec -------------------------------------------------- */
6452
Alexander Belopolsky40018472011-02-26 01:02:56 +00006453PyObject *
6454PyUnicode_DecodeASCII(const char *s,
6455 Py_ssize_t size,
6456 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006457{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006458 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006459 PyUnicodeObject *v;
6460 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006461 Py_ssize_t startinpos;
6462 Py_ssize_t endinpos;
6463 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006464 const char *e;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006465 unsigned char* d;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006466 PyObject *errorHandler = NULL;
6467 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006468 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00006469
Guido van Rossumd57fd912000-03-10 22:53:23 +00006470 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006471 if (size == 1 && *(unsigned char*)s < 128)
6472 return PyUnicode_FromOrdinal(*(unsigned char*)s);
6473
6474 /* Fast path. Assume the input actually *is* ASCII, and allocate
6475 a single-block Unicode object with that assumption. If there is
6476 an error, drop the object and start over. */
6477 v = (PyUnicodeObject*)PyUnicode_New(size, 127);
6478 if (v == NULL)
6479 goto onError;
6480 d = PyUnicode_1BYTE_DATA(v);
6481 for (i = 0; i < size; i++) {
6482 unsigned char ch = ((unsigned char*)s)[i];
6483 if (ch < 128)
6484 d[i] = ch;
6485 else
6486 break;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006487 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006488 if (i == size)
6489 return (PyObject*)v;
6490 Py_DECREF(v); /* start over */
Tim Petersced69f82003-09-16 20:30:58 +00006491
Guido van Rossumd57fd912000-03-10 22:53:23 +00006492 v = _PyUnicode_New(size);
6493 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006494 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006495 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006496 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006497 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006498 e = s + size;
6499 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006500 register unsigned char c = (unsigned char)*s;
6501 if (c < 128) {
6502 *p++ = c;
6503 ++s;
6504 }
6505 else {
6506 startinpos = s-starts;
6507 endinpos = startinpos + 1;
6508 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
6509 if (unicode_decode_call_errorhandler(
6510 errors, &errorHandler,
6511 "ascii", "ordinal not in range(128)",
6512 &starts, &e, &startinpos, &endinpos, &exc, &s,
6513 &v, &outpos, &p))
6514 goto onError;
6515 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006516 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00006517 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Victor Stinnerfe226c02011-10-03 03:52:20 +02006518 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006519 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006520 Py_XDECREF(errorHandler);
6521 Py_XDECREF(exc);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006522 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006523 Py_DECREF(v);
6524 return NULL;
6525 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006526 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006527
Benjamin Peterson29060642009-01-31 22:14:21 +00006528 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006529 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006530 Py_XDECREF(errorHandler);
6531 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006532 return NULL;
6533}
6534
Alexander Belopolsky40018472011-02-26 01:02:56 +00006535PyObject *
6536PyUnicode_EncodeASCII(const Py_UNICODE *p,
6537 Py_ssize_t size,
6538 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006539{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006540 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006541}
6542
Alexander Belopolsky40018472011-02-26 01:02:56 +00006543PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006544_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006545{
6546 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006547 PyErr_BadArgument();
6548 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006549 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006550 if (PyUnicode_READY(unicode) == -1)
6551 return NULL;
6552 /* Fast path: if it is an ASCII-only string, construct bytes object
6553 directly. Else defer to above function to raise the exception. */
6554 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6555 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6556 PyUnicode_GET_LENGTH(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006557 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006558 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006559 errors);
6560}
6561
6562PyObject *
6563PyUnicode_AsASCIIString(PyObject *unicode)
6564{
6565 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006566}
6567
Victor Stinner99b95382011-07-04 14:23:54 +02006568#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006569
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006570/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006571
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006572#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006573#define NEED_RETRY
6574#endif
6575
6576/* XXX This code is limited to "true" double-byte encodings, as
6577 a) it assumes an incomplete character consists of a single byte, and
6578 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00006579 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006580
/* Return 1 if the byte at s[offset] should be treated as a DBCS lead
   byte, 0 otherwise.

   The byte must satisfy IsDBCSLeadByte().  It is then accepted when
   CharPrev() does not move back from it, when the byte CharPrev()
   lands on is not itself a lead byte, or when CharPrev() stepped back
   exactly two bytes. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
6592
6593/*
6594 * Decode MBCS string into unicode object. If 'final' is set, converts
6595 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
6596 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006597static int
6598decode_mbcs(PyUnicodeObject **v,
6599 const char *s, /* MBCS string */
6600 int size, /* sizeof MBCS string */
6601 int final,
6602 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006603{
6604 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00006605 Py_ssize_t n;
6606 DWORD usize;
6607 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006608
6609 assert(size >= 0);
6610
Victor Stinner554f3f02010-06-16 23:33:54 +00006611 /* check and handle 'errors' arg */
6612 if (errors==NULL || strcmp(errors, "strict")==0)
6613 flags = MB_ERR_INVALID_CHARS;
6614 else if (strcmp(errors, "ignore")==0)
6615 flags = 0;
6616 else {
6617 PyErr_Format(PyExc_ValueError,
6618 "mbcs encoding does not support errors='%s'",
6619 errors);
6620 return -1;
6621 }
6622
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006623 /* Skip trailing lead-byte unless 'final' is set */
6624 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006625 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006626
6627 /* First get the size of the result */
6628 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006629 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
6630 if (usize==0)
6631 goto mbcs_decode_error;
6632 } else
6633 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006634
6635 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006636 /* Create unicode object */
6637 *v = _PyUnicode_New(usize);
6638 if (*v == NULL)
6639 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006640 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006641 }
6642 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006643 /* Extend unicode object */
6644 n = PyUnicode_GET_SIZE(*v);
Victor Stinner2fd82272011-10-03 04:06:05 +02006645 if (PyUnicode_Resize((PyObject**)v, n + usize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006646 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006647 }
6648
6649 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00006650 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006651 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006652 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
6653 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00006654 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006655 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006656 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00006657
6658mbcs_decode_error:
6659 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
6660 we raise a UnicodeDecodeError - else it is a 'generic'
6661 windows error
6662 */
6663 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
6664 /* Ideally, we should get reason from FormatMessage - this
6665 is the Windows 2000 English version of the message
6666 */
6667 PyObject *exc = NULL;
6668 const char *reason = "No mapping for the Unicode character exists "
6669 "in the target multi-byte code page.";
6670 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
6671 if (exc != NULL) {
6672 PyCodec_StrictErrors(exc);
6673 Py_DECREF(exc);
6674 }
6675 } else {
6676 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6677 }
6678 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006679}
6680
Alexander Belopolsky40018472011-02-26 01:02:56 +00006681PyObject *
6682PyUnicode_DecodeMBCSStateful(const char *s,
6683 Py_ssize_t size,
6684 const char *errors,
6685 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006686{
6687 PyUnicodeObject *v = NULL;
6688 int done;
6689
6690 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006691 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006692
6693#ifdef NEED_RETRY
6694 retry:
6695 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006696 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006697 else
6698#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006699 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006700
6701 if (done < 0) {
6702 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006703 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006704 }
6705
6706 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006707 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006708
6709#ifdef NEED_RETRY
6710 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006711 s += done;
6712 size -= done;
6713 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006714 }
6715#endif
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006716 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006717 Py_DECREF(v);
6718 return NULL;
6719 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006720 return (PyObject *)v;
6721}
6722
Alexander Belopolsky40018472011-02-26 01:02:56 +00006723PyObject *
6724PyUnicode_DecodeMBCS(const char *s,
6725 Py_ssize_t size,
6726 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006727{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006728 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
6729}
6730
6731/*
6732 * Convert unicode into string object (MBCS).
6733 * Returns 0 if succeed, -1 otherwise.
6734 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006735static int
6736encode_mbcs(PyObject **repr,
6737 const Py_UNICODE *p, /* unicode */
6738 int size, /* size of unicode */
6739 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006740{
Victor Stinner554f3f02010-06-16 23:33:54 +00006741 BOOL usedDefaultChar = FALSE;
6742 BOOL *pusedDefaultChar;
6743 int mbcssize;
6744 Py_ssize_t n;
6745 PyObject *exc = NULL;
6746 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006747
6748 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006749
Victor Stinner554f3f02010-06-16 23:33:54 +00006750 /* check and handle 'errors' arg */
6751 if (errors==NULL || strcmp(errors, "strict")==0) {
6752 flags = WC_NO_BEST_FIT_CHARS;
6753 pusedDefaultChar = &usedDefaultChar;
6754 } else if (strcmp(errors, "replace")==0) {
6755 flags = 0;
6756 pusedDefaultChar = NULL;
6757 } else {
6758 PyErr_Format(PyExc_ValueError,
6759 "mbcs encoding does not support errors='%s'",
6760 errors);
6761 return -1;
6762 }
6763
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006764 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006765 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006766 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
6767 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00006768 if (mbcssize == 0) {
6769 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6770 return -1;
6771 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006772 /* If we used a default char, then we failed! */
6773 if (pusedDefaultChar && *pusedDefaultChar)
6774 goto mbcs_encode_error;
6775 } else {
6776 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006777 }
6778
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006779 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006780 /* Create string object */
6781 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
6782 if (*repr == NULL)
6783 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006784 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006785 }
6786 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006787 /* Extend string object */
6788 n = PyBytes_Size(*repr);
6789 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
6790 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006791 }
6792
6793 /* Do the conversion */
6794 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006795 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006796 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
6797 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006798 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6799 return -1;
6800 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006801 if (pusedDefaultChar && *pusedDefaultChar)
6802 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006803 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006804 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00006805
6806mbcs_encode_error:
6807 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
6808 Py_XDECREF(exc);
6809 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006810}
6811
Alexander Belopolsky40018472011-02-26 01:02:56 +00006812PyObject *
6813PyUnicode_EncodeMBCS(const Py_UNICODE *p,
6814 Py_ssize_t size,
6815 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006816{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006817 PyObject *repr = NULL;
6818 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00006819
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006820#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00006821 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006822 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006823 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006824 else
6825#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006826 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006827
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006828 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006829 Py_XDECREF(repr);
6830 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006831 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006832
6833#ifdef NEED_RETRY
6834 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006835 p += INT_MAX;
6836 size -= INT_MAX;
6837 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006838 }
6839#endif
6840
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006841 return repr;
6842}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006843
Alexander Belopolsky40018472011-02-26 01:02:56 +00006844PyObject *
6845PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006846{
6847 if (!PyUnicode_Check(unicode)) {
6848 PyErr_BadArgument();
6849 return NULL;
6850 }
6851 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006852 PyUnicode_GET_SIZE(unicode),
6853 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006854}
6855
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006856#undef NEED_RETRY
6857
Victor Stinner99b95382011-07-04 14:23:54 +02006858#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006859
Guido van Rossumd57fd912000-03-10 22:53:23 +00006860/* --- Character Mapping Codec -------------------------------------------- */
6861
Alexander Belopolsky40018472011-02-26 01:02:56 +00006862PyObject *
6863PyUnicode_DecodeCharmap(const char *s,
6864 Py_ssize_t size,
6865 PyObject *mapping,
6866 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006867{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006868 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006869 Py_ssize_t startinpos;
6870 Py_ssize_t endinpos;
6871 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006872 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006873 PyUnicodeObject *v;
6874 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006875 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006876 PyObject *errorHandler = NULL;
6877 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006878 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006879 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006880
Guido van Rossumd57fd912000-03-10 22:53:23 +00006881 /* Default to Latin-1 */
6882 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006883 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006884
6885 v = _PyUnicode_New(size);
6886 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006887 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006888 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006889 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006890 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006891 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006892 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006893 mapstring = PyUnicode_AS_UNICODE(mapping);
6894 maplen = PyUnicode_GET_SIZE(mapping);
6895 while (s < e) {
6896 unsigned char ch = *s;
6897 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006898
Benjamin Peterson29060642009-01-31 22:14:21 +00006899 if (ch < maplen)
6900 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006901
Benjamin Peterson29060642009-01-31 22:14:21 +00006902 if (x == 0xfffe) {
6903 /* undefined mapping */
6904 outpos = p-PyUnicode_AS_UNICODE(v);
6905 startinpos = s-starts;
6906 endinpos = startinpos+1;
6907 if (unicode_decode_call_errorhandler(
6908 errors, &errorHandler,
6909 "charmap", "character maps to <undefined>",
6910 &starts, &e, &startinpos, &endinpos, &exc, &s,
6911 &v, &outpos, &p)) {
6912 goto onError;
6913 }
6914 continue;
6915 }
6916 *p++ = x;
6917 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006918 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006919 }
6920 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006921 while (s < e) {
6922 unsigned char ch = *s;
6923 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006924
Benjamin Peterson29060642009-01-31 22:14:21 +00006925 /* Get mapping (char ordinal -> integer, Unicode char or None) */
6926 w = PyLong_FromLong((long)ch);
6927 if (w == NULL)
6928 goto onError;
6929 x = PyObject_GetItem(mapping, w);
6930 Py_DECREF(w);
6931 if (x == NULL) {
6932 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
6933 /* No mapping found means: mapping is undefined. */
6934 PyErr_Clear();
6935 x = Py_None;
6936 Py_INCREF(x);
6937 } else
6938 goto onError;
6939 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006940
Benjamin Peterson29060642009-01-31 22:14:21 +00006941 /* Apply mapping */
6942 if (PyLong_Check(x)) {
6943 long value = PyLong_AS_LONG(x);
6944 if (value < 0 || value > 65535) {
6945 PyErr_SetString(PyExc_TypeError,
6946 "character mapping must be in range(65536)");
6947 Py_DECREF(x);
6948 goto onError;
6949 }
6950 *p++ = (Py_UNICODE)value;
6951 }
6952 else if (x == Py_None) {
6953 /* undefined mapping */
6954 outpos = p-PyUnicode_AS_UNICODE(v);
6955 startinpos = s-starts;
6956 endinpos = startinpos+1;
6957 if (unicode_decode_call_errorhandler(
6958 errors, &errorHandler,
6959 "charmap", "character maps to <undefined>",
6960 &starts, &e, &startinpos, &endinpos, &exc, &s,
6961 &v, &outpos, &p)) {
6962 Py_DECREF(x);
6963 goto onError;
6964 }
6965 Py_DECREF(x);
6966 continue;
6967 }
6968 else if (PyUnicode_Check(x)) {
6969 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006970
Benjamin Peterson29060642009-01-31 22:14:21 +00006971 if (targetsize == 1)
6972 /* 1-1 mapping */
6973 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006974
Benjamin Peterson29060642009-01-31 22:14:21 +00006975 else if (targetsize > 1) {
6976 /* 1-n mapping */
6977 if (targetsize > extrachars) {
6978 /* resize first */
6979 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
6980 Py_ssize_t needed = (targetsize - extrachars) + \
6981 (targetsize << 2);
6982 extrachars += needed;
6983 /* XXX overflow detection missing */
Victor Stinnerfe226c02011-10-03 03:52:20 +02006984 if (PyUnicode_Resize((PyObject**)&v,
Benjamin Peterson29060642009-01-31 22:14:21 +00006985 PyUnicode_GET_SIZE(v) + needed) < 0) {
6986 Py_DECREF(x);
6987 goto onError;
6988 }
6989 p = PyUnicode_AS_UNICODE(v) + oldpos;
6990 }
6991 Py_UNICODE_COPY(p,
6992 PyUnicode_AS_UNICODE(x),
6993 targetsize);
6994 p += targetsize;
6995 extrachars -= targetsize;
6996 }
6997 /* 1-0 mapping: skip the character */
6998 }
6999 else {
7000 /* wrong return value */
7001 PyErr_SetString(PyExc_TypeError,
7002 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007003 Py_DECREF(x);
7004 goto onError;
7005 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007006 Py_DECREF(x);
7007 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007008 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007009 }
7010 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Victor Stinnerfe226c02011-10-03 03:52:20 +02007011 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007012 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007013 Py_XDECREF(errorHandler);
7014 Py_XDECREF(exc);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02007015 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007016 Py_DECREF(v);
7017 return NULL;
7018 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007019 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00007020
Benjamin Peterson29060642009-01-31 22:14:21 +00007021 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007022 Py_XDECREF(errorHandler);
7023 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007024 Py_XDECREF(v);
7025 return NULL;
7026}
7027
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007028/* Charmap encoding: the lookup table */
7029
Alexander Belopolsky40018472011-02-26 01:02:56 +00007030struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007031 PyObject_HEAD
7032 unsigned char level1[32];
7033 int count2, count3;
7034 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007035};
7036
7037static PyObject*
7038encoding_map_size(PyObject *obj, PyObject* args)
7039{
7040 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007041 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007042 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007043}
7044
7045static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007046 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007047 PyDoc_STR("Return the size (in bytes) of this object") },
7048 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007049};
7050
7051static void
7052encoding_map_dealloc(PyObject* o)
7053{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007054 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007055}
7056
7057static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007058 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007059 "EncodingMap", /*tp_name*/
7060 sizeof(struct encoding_map), /*tp_basicsize*/
7061 0, /*tp_itemsize*/
7062 /* methods */
7063 encoding_map_dealloc, /*tp_dealloc*/
7064 0, /*tp_print*/
7065 0, /*tp_getattr*/
7066 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007067 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007068 0, /*tp_repr*/
7069 0, /*tp_as_number*/
7070 0, /*tp_as_sequence*/
7071 0, /*tp_as_mapping*/
7072 0, /*tp_hash*/
7073 0, /*tp_call*/
7074 0, /*tp_str*/
7075 0, /*tp_getattro*/
7076 0, /*tp_setattro*/
7077 0, /*tp_as_buffer*/
7078 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7079 0, /*tp_doc*/
7080 0, /*tp_traverse*/
7081 0, /*tp_clear*/
7082 0, /*tp_richcompare*/
7083 0, /*tp_weaklistoffset*/
7084 0, /*tp_iter*/
7085 0, /*tp_iternext*/
7086 encoding_map_methods, /*tp_methods*/
7087 0, /*tp_members*/
7088 0, /*tp_getset*/
7089 0, /*tp_base*/
7090 0, /*tp_dict*/
7091 0, /*tp_descr_get*/
7092 0, /*tp_descr_set*/
7093 0, /*tp_dictoffset*/
7094 0, /*tp_init*/
7095 0, /*tp_alloc*/
7096 0, /*tp_new*/
7097 0, /*tp_free*/
7098 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007099};
7100
7101PyObject*
7102PyUnicode_BuildEncodingMap(PyObject* string)
7103{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007104 PyObject *result;
7105 struct encoding_map *mresult;
7106 int i;
7107 int need_dict = 0;
7108 unsigned char level1[32];
7109 unsigned char level2[512];
7110 unsigned char *mlevel1, *mlevel2, *mlevel3;
7111 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007112 int kind;
7113 void *data;
7114 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007115
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007116 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007117 PyErr_BadArgument();
7118 return NULL;
7119 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007120 kind = PyUnicode_KIND(string);
7121 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007122 memset(level1, 0xFF, sizeof level1);
7123 memset(level2, 0xFF, sizeof level2);
7124
7125 /* If there isn't a one-to-one mapping of NULL to \0,
7126 or if there are non-BMP characters, we need to use
7127 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007128 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007129 need_dict = 1;
7130 for (i = 1; i < 256; i++) {
7131 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007132 ch = PyUnicode_READ(kind, data, i);
7133 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007134 need_dict = 1;
7135 break;
7136 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007137 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007138 /* unmapped character */
7139 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007140 l1 = ch >> 11;
7141 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007142 if (level1[l1] == 0xFF)
7143 level1[l1] = count2++;
7144 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007145 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007146 }
7147
7148 if (count2 >= 0xFF || count3 >= 0xFF)
7149 need_dict = 1;
7150
7151 if (need_dict) {
7152 PyObject *result = PyDict_New();
7153 PyObject *key, *value;
7154 if (!result)
7155 return NULL;
7156 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007157 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007158 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007159 if (!key || !value)
7160 goto failed1;
7161 if (PyDict_SetItem(result, key, value) == -1)
7162 goto failed1;
7163 Py_DECREF(key);
7164 Py_DECREF(value);
7165 }
7166 return result;
7167 failed1:
7168 Py_XDECREF(key);
7169 Py_XDECREF(value);
7170 Py_DECREF(result);
7171 return NULL;
7172 }
7173
7174 /* Create a three-level trie */
7175 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7176 16*count2 + 128*count3 - 1);
7177 if (!result)
7178 return PyErr_NoMemory();
7179 PyObject_Init(result, &EncodingMapType);
7180 mresult = (struct encoding_map*)result;
7181 mresult->count2 = count2;
7182 mresult->count3 = count3;
7183 mlevel1 = mresult->level1;
7184 mlevel2 = mresult->level23;
7185 mlevel3 = mresult->level23 + 16*count2;
7186 memcpy(mlevel1, level1, 32);
7187 memset(mlevel2, 0xFF, 16*count2);
7188 memset(mlevel3, 0, 128*count3);
7189 count3 = 0;
7190 for (i = 1; i < 256; i++) {
7191 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007192 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007193 /* unmapped character */
7194 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007195 o1 = PyUnicode_READ(kind, data, i)>>11;
7196 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007197 i2 = 16*mlevel1[o1] + o2;
7198 if (mlevel2[i2] == 0xFF)
7199 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007200 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007201 i3 = 128*mlevel2[i2] + o3;
7202 mlevel3[i3] = i;
7203 }
7204 return result;
7205}
7206
7207static int
7208encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
7209{
7210 struct encoding_map *map = (struct encoding_map*)mapping;
7211 int l1 = c>>11;
7212 int l2 = (c>>7) & 0xF;
7213 int l3 = c & 0x7F;
7214 int i;
7215
7216#ifdef Py_UNICODE_WIDE
7217 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007218 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007219 }
7220#endif
7221 if (c == 0)
7222 return 0;
7223 /* level 1*/
7224 i = map->level1[l1];
7225 if (i == 0xFF) {
7226 return -1;
7227 }
7228 /* level 2*/
7229 i = map->level23[16*i+l2];
7230 if (i == 0xFF) {
7231 return -1;
7232 }
7233 /* level 3 */
7234 i = map->level23[16*map->count2 + 128*i + l3];
7235 if (i == 0) {
7236 return -1;
7237 }
7238 return i;
7239}
7240
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007241/* Lookup the character ch in the mapping. If the character
7242 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007243 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007244static PyObject *
7245charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007246{
Christian Heimes217cfd12007-12-02 14:31:20 +00007247 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007248 PyObject *x;
7249
7250 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007251 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007252 x = PyObject_GetItem(mapping, w);
7253 Py_DECREF(w);
7254 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007255 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7256 /* No mapping found means: mapping is undefined. */
7257 PyErr_Clear();
7258 x = Py_None;
7259 Py_INCREF(x);
7260 return x;
7261 } else
7262 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007263 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007264 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007265 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007266 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007267 long value = PyLong_AS_LONG(x);
7268 if (value < 0 || value > 255) {
7269 PyErr_SetString(PyExc_TypeError,
7270 "character mapping must be in range(256)");
7271 Py_DECREF(x);
7272 return NULL;
7273 }
7274 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007275 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007276 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007277 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007278 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007279 /* wrong return value */
7280 PyErr_Format(PyExc_TypeError,
7281 "character mapping must return integer, bytes or None, not %.400s",
7282 x->ob_type->tp_name);
7283 Py_DECREF(x);
7284 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007285 }
7286}
7287
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007288static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00007289charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007290{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007291 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7292 /* exponentially overallocate to minimize reallocations */
7293 if (requiredsize < 2*outsize)
7294 requiredsize = 2*outsize;
7295 if (_PyBytes_Resize(outobj, requiredsize))
7296 return -1;
7297 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007298}
7299
/* Outcome of charmapencode_output() */
typedef enum charmapencode_result {
    enc_SUCCESS,        /* the character was encoded and written */
    enc_FAILED,         /* the mapping is undefined for the character */
    enc_EXCEPTION       /* an exception was raised */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007303/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00007304 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007305 space is available. Return a new reference to the object that
7306 was put in the output buffer, or Py_None, if the mapping was undefined
7307 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00007308 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007309static charmapencode_result
7310charmapencode_output(Py_UNICODE c, PyObject *mapping,
7311 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007312{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007313 PyObject *rep;
7314 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00007315 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007316
Christian Heimes90aa7642007-12-19 02:45:37 +00007317 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007318 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007319 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007320 if (res == -1)
7321 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00007322 if (outsize<requiredsize)
7323 if (charmapencode_resize(outobj, outpos, requiredsize))
7324 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00007325 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007326 outstart[(*outpos)++] = (char)res;
7327 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007328 }
7329
7330 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007331 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007332 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007333 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007334 Py_DECREF(rep);
7335 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007336 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007337 if (PyLong_Check(rep)) {
7338 Py_ssize_t requiredsize = *outpos+1;
7339 if (outsize<requiredsize)
7340 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7341 Py_DECREF(rep);
7342 return enc_EXCEPTION;
7343 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007344 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007345 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007346 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007347 else {
7348 const char *repchars = PyBytes_AS_STRING(rep);
7349 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
7350 Py_ssize_t requiredsize = *outpos+repsize;
7351 if (outsize<requiredsize)
7352 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7353 Py_DECREF(rep);
7354 return enc_EXCEPTION;
7355 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007356 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007357 memcpy(outstart + *outpos, repchars, repsize);
7358 *outpos += repsize;
7359 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007360 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007361 Py_DECREF(rep);
7362 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007363}
7364
7365/* handle an error in PyUnicode_EncodeCharmap
7366 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007367static int
7368charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00007369 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007370 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00007371 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00007372 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007373{
7374 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007375 Py_ssize_t repsize;
7376 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007377 Py_UNICODE *uni2;
7378 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007379 Py_ssize_t collstartpos = *inpos;
7380 Py_ssize_t collendpos = *inpos+1;
7381 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007382 char *encoding = "charmap";
7383 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007384 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007385
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007386 /* find all unencodable characters */
7387 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007388 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00007389 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007390 int res = encoding_map_lookup(p[collendpos], mapping);
7391 if (res != -1)
7392 break;
7393 ++collendpos;
7394 continue;
7395 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007396
Benjamin Peterson29060642009-01-31 22:14:21 +00007397 rep = charmapencode_lookup(p[collendpos], mapping);
7398 if (rep==NULL)
7399 return -1;
7400 else if (rep!=Py_None) {
7401 Py_DECREF(rep);
7402 break;
7403 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007404 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00007405 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007406 }
7407 /* cache callback name lookup
7408 * (if not done yet, i.e. it's the first error) */
7409 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007410 if ((errors==NULL) || (!strcmp(errors, "strict")))
7411 *known_errorHandler = 1;
7412 else if (!strcmp(errors, "replace"))
7413 *known_errorHandler = 2;
7414 else if (!strcmp(errors, "ignore"))
7415 *known_errorHandler = 3;
7416 else if (!strcmp(errors, "xmlcharrefreplace"))
7417 *known_errorHandler = 4;
7418 else
7419 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007420 }
7421 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007422 case 1: /* strict */
7423 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7424 return -1;
7425 case 2: /* replace */
7426 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007427 x = charmapencode_output('?', mapping, res, respos);
7428 if (x==enc_EXCEPTION) {
7429 return -1;
7430 }
7431 else if (x==enc_FAILED) {
7432 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7433 return -1;
7434 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007435 }
7436 /* fall through */
7437 case 3: /* ignore */
7438 *inpos = collendpos;
7439 break;
7440 case 4: /* xmlcharrefreplace */
7441 /* generate replacement (temporarily (mis)uses p) */
7442 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007443 char buffer[2+29+1+1];
7444 char *cp;
7445 sprintf(buffer, "&#%d;", (int)p[collpos]);
7446 for (cp = buffer; *cp; ++cp) {
7447 x = charmapencode_output(*cp, mapping, res, respos);
7448 if (x==enc_EXCEPTION)
7449 return -1;
7450 else if (x==enc_FAILED) {
7451 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7452 return -1;
7453 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007454 }
7455 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007456 *inpos = collendpos;
7457 break;
7458 default:
7459 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00007460 encoding, reason, p, size, exceptionObject,
7461 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007462 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007463 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00007464 if (PyBytes_Check(repunicode)) {
7465 /* Directly copy bytes result to output. */
7466 Py_ssize_t outsize = PyBytes_Size(*res);
7467 Py_ssize_t requiredsize;
7468 repsize = PyBytes_Size(repunicode);
7469 requiredsize = *respos + repsize;
7470 if (requiredsize > outsize)
7471 /* Make room for all additional bytes. */
7472 if (charmapencode_resize(res, respos, requiredsize)) {
7473 Py_DECREF(repunicode);
7474 return -1;
7475 }
7476 memcpy(PyBytes_AsString(*res) + *respos,
7477 PyBytes_AsString(repunicode), repsize);
7478 *respos += repsize;
7479 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007480 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00007481 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007482 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007483 /* generate replacement */
7484 repsize = PyUnicode_GET_SIZE(repunicode);
7485 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007486 x = charmapencode_output(*uni2, mapping, res, respos);
7487 if (x==enc_EXCEPTION) {
7488 return -1;
7489 }
7490 else if (x==enc_FAILED) {
7491 Py_DECREF(repunicode);
7492 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7493 return -1;
7494 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007495 }
7496 *inpos = newpos;
7497 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007498 }
7499 return 0;
7500}
7501
Alexander Belopolsky40018472011-02-26 01:02:56 +00007502PyObject *
7503PyUnicode_EncodeCharmap(const Py_UNICODE *p,
7504 Py_ssize_t size,
7505 PyObject *mapping,
7506 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007507{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007508 /* output object */
7509 PyObject *res = NULL;
7510 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007511 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007512 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007513 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007514 PyObject *errorHandler = NULL;
7515 PyObject *exc = NULL;
7516 /* the following variable is used for caching string comparisons
7517 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7518 * 3=ignore, 4=xmlcharrefreplace */
7519 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007520
7521 /* Default to Latin-1 */
7522 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007523 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007524
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007525 /* allocate enough for a simple encoding without
7526 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00007527 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007528 if (res == NULL)
7529 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00007530 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007531 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007532
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007533 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007534 /* try to encode it */
7535 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
7536 if (x==enc_EXCEPTION) /* error */
7537 goto onError;
7538 if (x==enc_FAILED) { /* unencodable character */
7539 if (charmap_encoding_error(p, size, &inpos, mapping,
7540 &exc,
7541 &known_errorHandler, &errorHandler, errors,
7542 &res, &respos)) {
7543 goto onError;
7544 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007545 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007546 else
7547 /* done with this character => adjust input position */
7548 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007549 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007550
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007551 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00007552 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007553 if (_PyBytes_Resize(&res, respos) < 0)
7554 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00007555
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007556 Py_XDECREF(exc);
7557 Py_XDECREF(errorHandler);
7558 return res;
7559
Benjamin Peterson29060642009-01-31 22:14:21 +00007560 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007561 Py_XDECREF(res);
7562 Py_XDECREF(exc);
7563 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007564 return NULL;
7565}
7566
Alexander Belopolsky40018472011-02-26 01:02:56 +00007567PyObject *
7568PyUnicode_AsCharmapString(PyObject *unicode,
7569 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007570{
7571 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007572 PyErr_BadArgument();
7573 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007574 }
7575 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00007576 PyUnicode_GET_SIZE(unicode),
7577 mapping,
7578 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007579}
7580
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007581/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007582static void
7583make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007584 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007585 Py_ssize_t startpos, Py_ssize_t endpos,
7586 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007587{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007588 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007589 *exceptionObject = _PyUnicodeTranslateError_Create(
7590 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007591 }
7592 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007593 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
7594 goto onError;
7595 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
7596 goto onError;
7597 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
7598 goto onError;
7599 return;
7600 onError:
7601 Py_DECREF(*exceptionObject);
7602 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007603 }
7604}
7605
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007606/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007607static void
7608raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007609 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007610 Py_ssize_t startpos, Py_ssize_t endpos,
7611 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007612{
7613 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007614 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007615 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007616 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007617}
7618
7619/* error handling callback helper:
7620 build arguments, call the callback and check the arguments,
7621 put the result into newpos and return the replacement string, which
7622 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007623static PyObject *
7624unicode_translate_call_errorhandler(const char *errors,
7625 PyObject **errorHandler,
7626 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007627 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007628 Py_ssize_t startpos, Py_ssize_t endpos,
7629 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007630{
Benjamin Peterson142957c2008-07-04 19:55:29 +00007631 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007632
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007633 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007634 PyObject *restuple;
7635 PyObject *resunicode;
7636
7637 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007638 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007639 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007640 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007641 }
7642
7643 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007644 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007645 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007646 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007647
7648 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00007649 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007650 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007651 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007652 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00007653 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00007654 Py_DECREF(restuple);
7655 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007656 }
7657 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00007658 &resunicode, &i_newpos)) {
7659 Py_DECREF(restuple);
7660 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007661 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007662 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007663 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007664 else
7665 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007666 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007667 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7668 Py_DECREF(restuple);
7669 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00007670 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007671 Py_INCREF(resunicode);
7672 Py_DECREF(restuple);
7673 return resunicode;
7674}
7675
7676/* Lookup the character ch in the mapping and put the result in result,
7677 which must be decrefed by the caller.
7678 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007679static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007680charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007681{
Christian Heimes217cfd12007-12-02 14:31:20 +00007682 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007683 PyObject *x;
7684
7685 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007686 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007687 x = PyObject_GetItem(mapping, w);
7688 Py_DECREF(w);
7689 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007690 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7691 /* No mapping found means: use 1:1 mapping. */
7692 PyErr_Clear();
7693 *result = NULL;
7694 return 0;
7695 } else
7696 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007697 }
7698 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007699 *result = x;
7700 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007701 }
Christian Heimes217cfd12007-12-02 14:31:20 +00007702 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007703 long value = PyLong_AS_LONG(x);
7704 long max = PyUnicode_GetMax();
7705 if (value < 0 || value > max) {
7706 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00007707 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00007708 Py_DECREF(x);
7709 return -1;
7710 }
7711 *result = x;
7712 return 0;
7713 }
7714 else if (PyUnicode_Check(x)) {
7715 *result = x;
7716 return 0;
7717 }
7718 else {
7719 /* wrong return value */
7720 PyErr_SetString(PyExc_TypeError,
7721 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007722 Py_DECREF(x);
7723 return -1;
7724 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007725}
7726/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00007727 if not reallocate and adjust various state variables.
7728 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007729static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007730charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00007731 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007732{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007733 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00007734 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007735 /* exponentially overallocate to minimize reallocations */
7736 if (requiredsize < 2 * oldsize)
7737 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007738 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
7739 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007740 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007741 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007742 }
7743 return 0;
7744}
7745/* lookup the character, put the result in the output string and adjust
7746 various state variables. Return a new reference to the object that
7747 was put in the output buffer in *result, or Py_None, if the mapping was
7748 undefined (in which case no character was written).
7749 The called must decref result.
7750 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007751static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007752charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
7753 PyObject *mapping, Py_UCS4 **output,
7754 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007755 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007756{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007757 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
7758 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00007759 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007760 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007761 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007762 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007763 }
7764 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007765 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00007766 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007767 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007768 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007769 }
7770 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007771 Py_ssize_t repsize;
7772 if (PyUnicode_READY(*res) == -1)
7773 return -1;
7774 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00007775 if (repsize==1) {
7776 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007777 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00007778 }
7779 else if (repsize!=0) {
7780 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007781 Py_ssize_t requiredsize = *opos +
7782 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00007783 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007784 Py_ssize_t i;
7785 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007786 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007787 for(i = 0; i < repsize; i++)
7788 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00007789 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007790 }
7791 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007792 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007793 return 0;
7794}
7795
Alexander Belopolsky40018472011-02-26 01:02:56 +00007796PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007797_PyUnicode_TranslateCharmap(PyObject *input,
7798 PyObject *mapping,
7799 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007800{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007801 /* input object */
7802 char *idata;
7803 Py_ssize_t size, i;
7804 int kind;
7805 /* output buffer */
7806 Py_UCS4 *output = NULL;
7807 Py_ssize_t osize;
7808 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007809 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007810 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007811 char *reason = "character maps to <undefined>";
7812 PyObject *errorHandler = NULL;
7813 PyObject *exc = NULL;
7814 /* the following variable is used for caching string comparisons
7815 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7816 * 3=ignore, 4=xmlcharrefreplace */
7817 int known_errorHandler = -1;
7818
Guido van Rossumd57fd912000-03-10 22:53:23 +00007819 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007820 PyErr_BadArgument();
7821 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007822 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007823
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007824 if (PyUnicode_READY(input) == -1)
7825 return NULL;
7826 idata = (char*)PyUnicode_DATA(input);
7827 kind = PyUnicode_KIND(input);
7828 size = PyUnicode_GET_LENGTH(input);
7829 i = 0;
7830
7831 if (size == 0) {
7832 Py_INCREF(input);
7833 return input;
7834 }
7835
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007836 /* allocate enough for a simple 1:1 translation without
7837 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007838 osize = size;
7839 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
7840 opos = 0;
7841 if (output == NULL) {
7842 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00007843 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007844 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007845
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007846 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007847 /* try to encode it */
7848 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007849 if (charmaptranslate_output(input, i, mapping,
7850 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007851 Py_XDECREF(x);
7852 goto onError;
7853 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007854 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00007855 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007856 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00007857 else { /* untranslatable character */
7858 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
7859 Py_ssize_t repsize;
7860 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007861 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00007862 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007863 Py_ssize_t collstart = i;
7864 Py_ssize_t collend = i+1;
7865 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007866
Benjamin Peterson29060642009-01-31 22:14:21 +00007867 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007868 while (collend < size) {
7869 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007870 goto onError;
7871 Py_XDECREF(x);
7872 if (x!=Py_None)
7873 break;
7874 ++collend;
7875 }
7876 /* cache callback name lookup
7877 * (if not done yet, i.e. it's the first error) */
7878 if (known_errorHandler==-1) {
7879 if ((errors==NULL) || (!strcmp(errors, "strict")))
7880 known_errorHandler = 1;
7881 else if (!strcmp(errors, "replace"))
7882 known_errorHandler = 2;
7883 else if (!strcmp(errors, "ignore"))
7884 known_errorHandler = 3;
7885 else if (!strcmp(errors, "xmlcharrefreplace"))
7886 known_errorHandler = 4;
7887 else
7888 known_errorHandler = 0;
7889 }
7890 switch (known_errorHandler) {
7891 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007892 raise_translate_exception(&exc, input, collstart,
7893 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007894 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007895 case 2: /* replace */
7896 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007897 for (coll = collstart; coll<collend; coll++)
7898 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00007899 /* fall through */
7900 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007901 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007902 break;
7903 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007904 /* generate replacement (temporarily (mis)uses i) */
7905 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007906 char buffer[2+29+1+1];
7907 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007908 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
7909 if (charmaptranslate_makespace(&output, &osize,
7910 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00007911 goto onError;
7912 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007913 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00007914 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007915 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007916 break;
7917 default:
7918 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007919 reason, input, &exc,
7920 collstart, collend, &newpos);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02007921 if (repunicode == NULL || _PyUnicode_READY_REPLACE(&repunicode))
Benjamin Peterson29060642009-01-31 22:14:21 +00007922 goto onError;
7923 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007924 repsize = PyUnicode_GET_LENGTH(repunicode);
7925 if (charmaptranslate_makespace(&output, &osize,
7926 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007927 Py_DECREF(repunicode);
7928 goto onError;
7929 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007930 for (uni2 = 0; repsize-->0; ++uni2)
7931 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
7932 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00007933 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007934 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007935 }
7936 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007937 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
7938 if (!res)
7939 goto onError;
7940 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007941 Py_XDECREF(exc);
7942 Py_XDECREF(errorHandler);
7943 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007944
Benjamin Peterson29060642009-01-31 22:14:21 +00007945 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007946 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007947 Py_XDECREF(exc);
7948 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007949 return NULL;
7950}
7951
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007952/* Deprecated. Use PyUnicode_Translate instead. */
7953PyObject *
7954PyUnicode_TranslateCharmap(const Py_UNICODE *p,
7955 Py_ssize_t size,
7956 PyObject *mapping,
7957 const char *errors)
7958{
7959 PyObject *unicode = PyUnicode_FromUnicode(p, size);
7960 if (!unicode)
7961 return NULL;
7962 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
7963}
7964
Alexander Belopolsky40018472011-02-26 01:02:56 +00007965PyObject *
7966PyUnicode_Translate(PyObject *str,
7967 PyObject *mapping,
7968 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007969{
7970 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007971
Guido van Rossumd57fd912000-03-10 22:53:23 +00007972 str = PyUnicode_FromObject(str);
7973 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007974 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007975 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007976 Py_DECREF(str);
7977 return result;
Tim Petersced69f82003-09-16 20:30:58 +00007978
Benjamin Peterson29060642009-01-31 22:14:21 +00007979 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007980 Py_XDECREF(str);
7981 return NULL;
7982}
Tim Petersced69f82003-09-16 20:30:58 +00007983
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007984static Py_UCS4
7985fix_decimal_and_space_to_ascii(PyUnicodeObject *self)
7986{
7987 /* No need to call PyUnicode_READY(self) because this function is only
7988 called as a callback from fixup() which does it already. */
7989 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
7990 const int kind = PyUnicode_KIND(self);
7991 void *data = PyUnicode_DATA(self);
7992 Py_UCS4 maxchar = 0, ch, fixed;
7993 Py_ssize_t i;
7994
7995 for (i = 0; i < len; ++i) {
7996 ch = PyUnicode_READ(kind, data, i);
7997 fixed = 0;
7998 if (ch > 127) {
7999 if (Py_UNICODE_ISSPACE(ch))
8000 fixed = ' ';
8001 else {
8002 const int decimal = Py_UNICODE_TODECIMAL(ch);
8003 if (decimal >= 0)
8004 fixed = '0' + decimal;
8005 }
8006 if (fixed != 0) {
8007 if (fixed > maxchar)
8008 maxchar = fixed;
8009 PyUnicode_WRITE(kind, data, i, fixed);
8010 }
8011 else if (ch > maxchar)
8012 maxchar = ch;
8013 }
8014 else if (ch > maxchar)
8015 maxchar = ch;
8016 }
8017
8018 return maxchar;
8019}
8020
8021PyObject *
8022_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8023{
8024 if (!PyUnicode_Check(unicode)) {
8025 PyErr_BadInternalCall();
8026 return NULL;
8027 }
8028 if (PyUnicode_READY(unicode) == -1)
8029 return NULL;
8030 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8031 /* If the string is already ASCII, just return the same string */
8032 Py_INCREF(unicode);
8033 return unicode;
8034 }
8035 return fixup((PyUnicodeObject *)unicode, fix_decimal_and_space_to_ascii);
8036}
8037
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008038PyObject *
8039PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8040 Py_ssize_t length)
8041{
8042 PyObject *result;
8043 Py_UNICODE *p; /* write pointer into result */
8044 Py_ssize_t i;
8045 /* Copy to a new string */
8046 result = (PyObject *)_PyUnicode_New(length);
8047 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
8048 if (result == NULL)
8049 return result;
8050 p = PyUnicode_AS_UNICODE(result);
8051 /* Iterate over code points */
8052 for (i = 0; i < length; i++) {
8053 Py_UNICODE ch =s[i];
8054 if (ch > 127) {
8055 int decimal = Py_UNICODE_TODECIMAL(ch);
8056 if (decimal >= 0)
8057 p[i] = '0' + decimal;
8058 }
8059 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008060 if (PyUnicode_READY((PyUnicodeObject*)result) == -1) {
8061 Py_DECREF(result);
8062 return NULL;
8063 }
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008064 return result;
8065}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008066/* --- Decimal Encoder ---------------------------------------------------- */
8067
Alexander Belopolsky40018472011-02-26 01:02:56 +00008068int
8069PyUnicode_EncodeDecimal(Py_UNICODE *s,
8070 Py_ssize_t length,
8071 char *output,
8072 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008073{
8074 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008075 PyObject *errorHandler = NULL;
8076 PyObject *exc = NULL;
8077 const char *encoding = "decimal";
8078 const char *reason = "invalid decimal Unicode string";
8079 /* the following variable is used for caching string comparisons
8080 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
8081 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008082
8083 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008084 PyErr_BadArgument();
8085 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008086 }
8087
8088 p = s;
8089 end = s + length;
8090 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008091 register Py_UNICODE ch = *p;
8092 int decimal;
8093 PyObject *repunicode;
8094 Py_ssize_t repsize;
8095 Py_ssize_t newpos;
8096 Py_UNICODE *uni2;
8097 Py_UNICODE *collstart;
8098 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00008099
Benjamin Peterson29060642009-01-31 22:14:21 +00008100 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008101 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00008102 ++p;
8103 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008104 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008105 decimal = Py_UNICODE_TODECIMAL(ch);
8106 if (decimal >= 0) {
8107 *output++ = '0' + decimal;
8108 ++p;
8109 continue;
8110 }
8111 if (0 < ch && ch < 256) {
8112 *output++ = (char)ch;
8113 ++p;
8114 continue;
8115 }
8116 /* All other characters are considered unencodable */
8117 collstart = p;
8118 collend = p+1;
8119 while (collend < end) {
8120 if ((0 < *collend && *collend < 256) ||
8121 !Py_UNICODE_ISSPACE(*collend) ||
8122 Py_UNICODE_TODECIMAL(*collend))
8123 break;
8124 }
8125 /* cache callback name lookup
8126 * (if not done yet, i.e. it's the first error) */
8127 if (known_errorHandler==-1) {
8128 if ((errors==NULL) || (!strcmp(errors, "strict")))
8129 known_errorHandler = 1;
8130 else if (!strcmp(errors, "replace"))
8131 known_errorHandler = 2;
8132 else if (!strcmp(errors, "ignore"))
8133 known_errorHandler = 3;
8134 else if (!strcmp(errors, "xmlcharrefreplace"))
8135 known_errorHandler = 4;
8136 else
8137 known_errorHandler = 0;
8138 }
8139 switch (known_errorHandler) {
8140 case 1: /* strict */
8141 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
8142 goto onError;
8143 case 2: /* replace */
8144 for (p = collstart; p < collend; ++p)
8145 *output++ = '?';
8146 /* fall through */
8147 case 3: /* ignore */
8148 p = collend;
8149 break;
8150 case 4: /* xmlcharrefreplace */
8151 /* generate replacement (temporarily (mis)uses p) */
8152 for (p = collstart; p < collend; ++p)
8153 output += sprintf(output, "&#%d;", (int)*p);
8154 p = collend;
8155 break;
8156 default:
8157 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
8158 encoding, reason, s, length, &exc,
8159 collstart-s, collend-s, &newpos);
8160 if (repunicode == NULL)
8161 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008162 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00008163 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008164 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
8165 Py_DECREF(repunicode);
8166 goto onError;
8167 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008168 /* generate replacement */
8169 repsize = PyUnicode_GET_SIZE(repunicode);
8170 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
8171 Py_UNICODE ch = *uni2;
8172 if (Py_UNICODE_ISSPACE(ch))
8173 *output++ = ' ';
8174 else {
8175 decimal = Py_UNICODE_TODECIMAL(ch);
8176 if (decimal >= 0)
8177 *output++ = '0' + decimal;
8178 else if (0 < ch && ch < 256)
8179 *output++ = (char)ch;
8180 else {
8181 Py_DECREF(repunicode);
8182 raise_encode_exception(&exc, encoding,
8183 s, length, collstart-s, collend-s, reason);
8184 goto onError;
8185 }
8186 }
8187 }
8188 p = s + newpos;
8189 Py_DECREF(repunicode);
8190 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00008191 }
8192 /* 0-terminate the output string */
8193 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008194 Py_XDECREF(exc);
8195 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008196 return 0;
8197
Benjamin Peterson29060642009-01-31 22:14:21 +00008198 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008199 Py_XDECREF(exc);
8200 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008201 return -1;
8202}
8203
Guido van Rossumd57fd912000-03-10 22:53:23 +00008204/* --- Helpers ------------------------------------------------------------ */
8205
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008206#include "stringlib/ucs1lib.h"
8207#include "stringlib/fastsearch.h"
8208#include "stringlib/partition.h"
8209#include "stringlib/split.h"
8210#include "stringlib/count.h"
8211#include "stringlib/find.h"
8212#include "stringlib/localeutil.h"
8213#include "stringlib/undef.h"
8214
8215#include "stringlib/ucs2lib.h"
8216#include "stringlib/fastsearch.h"
8217#include "stringlib/partition.h"
8218#include "stringlib/split.h"
8219#include "stringlib/count.h"
8220#include "stringlib/find.h"
8221#include "stringlib/localeutil.h"
8222#include "stringlib/undef.h"
8223
8224#include "stringlib/ucs4lib.h"
8225#include "stringlib/fastsearch.h"
8226#include "stringlib/partition.h"
8227#include "stringlib/split.h"
8228#include "stringlib/count.h"
8229#include "stringlib/find.h"
8230#include "stringlib/localeutil.h"
8231#include "stringlib/undef.h"
8232
8233static Py_ssize_t
8234any_find_slice(Py_ssize_t Py_LOCAL_CALLBACK(ucs1)(const Py_UCS1*, Py_ssize_t,
8235 const Py_UCS1*, Py_ssize_t,
8236 Py_ssize_t, Py_ssize_t),
8237 Py_ssize_t Py_LOCAL_CALLBACK(ucs2)(const Py_UCS2*, Py_ssize_t,
8238 const Py_UCS2*, Py_ssize_t,
8239 Py_ssize_t, Py_ssize_t),
8240 Py_ssize_t Py_LOCAL_CALLBACK(ucs4)(const Py_UCS4*, Py_ssize_t,
8241 const Py_UCS4*, Py_ssize_t,
8242 Py_ssize_t, Py_ssize_t),
8243 PyObject* s1, PyObject* s2,
8244 Py_ssize_t start,
8245 Py_ssize_t end)
8246{
8247 int kind1, kind2, kind;
8248 void *buf1, *buf2;
8249 Py_ssize_t len1, len2, result;
8250
8251 kind1 = PyUnicode_KIND(s1);
8252 kind2 = PyUnicode_KIND(s2);
8253 kind = kind1 > kind2 ? kind1 : kind2;
8254 buf1 = PyUnicode_DATA(s1);
8255 buf2 = PyUnicode_DATA(s2);
8256 if (kind1 != kind)
8257 buf1 = _PyUnicode_AsKind(s1, kind);
8258 if (!buf1)
8259 return -2;
8260 if (kind2 != kind)
8261 buf2 = _PyUnicode_AsKind(s2, kind);
8262 if (!buf2) {
8263 if (kind1 != kind) PyMem_Free(buf1);
8264 return -2;
8265 }
8266 len1 = PyUnicode_GET_LENGTH(s1);
8267 len2 = PyUnicode_GET_LENGTH(s2);
8268
8269 switch(kind) {
8270 case PyUnicode_1BYTE_KIND:
8271 result = ucs1(buf1, len1, buf2, len2, start, end);
8272 break;
8273 case PyUnicode_2BYTE_KIND:
8274 result = ucs2(buf1, len1, buf2, len2, start, end);
8275 break;
8276 case PyUnicode_4BYTE_KIND:
8277 result = ucs4(buf1, len1, buf2, len2, start, end);
8278 break;
8279 default:
8280 assert(0); result = -2;
8281 }
8282
8283 if (kind1 != kind)
8284 PyMem_Free(buf1);
8285 if (kind2 != kind)
8286 PyMem_Free(buf2);
8287
8288 return result;
8289}
8290
8291Py_ssize_t
8292_PyUnicode_InsertThousandsGrouping(int kind, void *data,
8293 Py_ssize_t n_buffer,
8294 void *digits, Py_ssize_t n_digits,
8295 Py_ssize_t min_width,
8296 const char *grouping,
8297 const char *thousands_sep)
8298{
8299 switch(kind) {
8300 case PyUnicode_1BYTE_KIND:
8301 return _PyUnicode_ucs1_InsertThousandsGrouping(
8302 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8303 min_width, grouping, thousands_sep);
8304 case PyUnicode_2BYTE_KIND:
8305 return _PyUnicode_ucs2_InsertThousandsGrouping(
8306 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
8307 min_width, grouping, thousands_sep);
8308 case PyUnicode_4BYTE_KIND:
8309 return _PyUnicode_ucs4_InsertThousandsGrouping(
8310 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
8311 min_width, grouping, thousands_sep);
8312 }
8313 assert(0);
8314 return -1;
8315}
8316
8317
Eric Smith8c663262007-08-25 02:26:07 +00008318#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00008319#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008320
Thomas Wouters477c8d52006-05-27 19:21:47 +00008321#include "stringlib/count.h"
8322#include "stringlib/find.h"
Eric Smith5807c412008-05-11 21:00:57 +00008323
/* Helper macro normalizing a (start, end) slice pair in place against a
   sequence of length `len`:
     - `end` greater than `len` is clamped to `len`;
     - a negative `end` or `start` has `len` added once and is then clamped
       to 0 if still negative.
   `start` is not clamped against `len`, and no ordering between `start` and
   `end` is enforced.  `start` and `end` must be modifiable lvalues; `len`
   may be evaluated several times.  The expansion is a plain statement
   sequence, so use it as a statement of its own followed by ';'. */
#define ADJUST_INDICES(start, end, len)         \
    if ((end) > (len)) {                        \
        (end) = (len);                          \
    }                                           \
    else if ((end) < 0) {                       \
        (end) += (len);                         \
        if ((end) < 0) {                        \
            (end) = 0;                          \
        }                                       \
    }                                           \
    if ((start) < 0) {                          \
        (start) += (len);                       \
        if ((start) < 0) {                      \
            (start) = 0;                        \
        }                                       \
    }
Thomas Wouters477c8d52006-05-27 19:21:47 +00008338
Alexander Belopolsky40018472011-02-26 01:02:56 +00008339Py_ssize_t
8340PyUnicode_Count(PyObject *str,
8341 PyObject *substr,
8342 Py_ssize_t start,
8343 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008344{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008345 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008346 PyUnicodeObject* str_obj;
8347 PyUnicodeObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008348 int kind1, kind2, kind;
8349 void *buf1 = NULL, *buf2 = NULL;
8350 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00008351
Thomas Wouters477c8d52006-05-27 19:21:47 +00008352 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008353 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008354 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008355 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
Victor Stinnere9a29352011-10-01 02:14:59 +02008356 if (!sub_obj || PyUnicode_READY(sub_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008357 Py_DECREF(str_obj);
8358 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008359 }
Tim Petersced69f82003-09-16 20:30:58 +00008360
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008361 kind1 = PyUnicode_KIND(str_obj);
8362 kind2 = PyUnicode_KIND(sub_obj);
8363 kind = kind1 > kind2 ? kind1 : kind2;
8364 buf1 = PyUnicode_DATA(str_obj);
8365 if (kind1 != kind)
8366 buf1 = _PyUnicode_AsKind((PyObject*)str_obj, kind);
8367 if (!buf1)
8368 goto onError;
8369 buf2 = PyUnicode_DATA(sub_obj);
8370 if (kind2 != kind)
8371 buf2 = _PyUnicode_AsKind((PyObject*)sub_obj, kind);
8372 if (!buf2)
8373 goto onError;
8374 len1 = PyUnicode_GET_LENGTH(str_obj);
8375 len2 = PyUnicode_GET_LENGTH(sub_obj);
8376
8377 ADJUST_INDICES(start, end, len1);
8378 switch(kind) {
8379 case PyUnicode_1BYTE_KIND:
8380 result = ucs1lib_count(
8381 ((Py_UCS1*)buf1) + start, end - start,
8382 buf2, len2, PY_SSIZE_T_MAX
8383 );
8384 break;
8385 case PyUnicode_2BYTE_KIND:
8386 result = ucs2lib_count(
8387 ((Py_UCS2*)buf1) + start, end - start,
8388 buf2, len2, PY_SSIZE_T_MAX
8389 );
8390 break;
8391 case PyUnicode_4BYTE_KIND:
8392 result = ucs4lib_count(
8393 ((Py_UCS4*)buf1) + start, end - start,
8394 buf2, len2, PY_SSIZE_T_MAX
8395 );
8396 break;
8397 default:
8398 assert(0); result = 0;
8399 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00008400
8401 Py_DECREF(sub_obj);
8402 Py_DECREF(str_obj);
8403
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008404 if (kind1 != kind)
8405 PyMem_Free(buf1);
8406 if (kind2 != kind)
8407 PyMem_Free(buf2);
8408
Guido van Rossumd57fd912000-03-10 22:53:23 +00008409 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008410 onError:
8411 Py_DECREF(sub_obj);
8412 Py_DECREF(str_obj);
8413 if (kind1 != kind && buf1)
8414 PyMem_Free(buf1);
8415 if (kind2 != kind && buf2)
8416 PyMem_Free(buf2);
8417 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008418}
8419
Alexander Belopolsky40018472011-02-26 01:02:56 +00008420Py_ssize_t
8421PyUnicode_Find(PyObject *str,
8422 PyObject *sub,
8423 Py_ssize_t start,
8424 Py_ssize_t end,
8425 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008426{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008427 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008428
Guido van Rossumd57fd912000-03-10 22:53:23 +00008429 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008430 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008431 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008432 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008433 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008434 Py_DECREF(str);
8435 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008436 }
Tim Petersced69f82003-09-16 20:30:58 +00008437
Thomas Wouters477c8d52006-05-27 19:21:47 +00008438 if (direction > 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008439 result = any_find_slice(
8440 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
8441 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008442 );
8443 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008444 result = any_find_slice(
8445 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
8446 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008447 );
8448
Guido van Rossumd57fd912000-03-10 22:53:23 +00008449 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008450 Py_DECREF(sub);
8451
Guido van Rossumd57fd912000-03-10 22:53:23 +00008452 return result;
8453}
8454
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008455Py_ssize_t
8456PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
8457 Py_ssize_t start, Py_ssize_t end,
8458 int direction)
8459{
8460 char *result;
8461 int kind;
8462 if (PyUnicode_READY(str) == -1)
8463 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02008464 if (start < 0 || end < 0) {
8465 PyErr_SetString(PyExc_IndexError, "string index out of range");
8466 return -2;
8467 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008468 if (end > PyUnicode_GET_LENGTH(str))
8469 end = PyUnicode_GET_LENGTH(str);
8470 kind = PyUnicode_KIND(str);
8471 result = findchar(PyUnicode_1BYTE_DATA(str)
8472 + PyUnicode_KIND_SIZE(kind, start),
8473 kind,
8474 end-start, ch, direction);
8475 if (!result)
8476 return -1;
8477 return (result-(char*)PyUnicode_DATA(str)) >> (kind-1);
8478}
8479
Alexander Belopolsky40018472011-02-26 01:02:56 +00008480static int
8481tailmatch(PyUnicodeObject *self,
8482 PyUnicodeObject *substring,
8483 Py_ssize_t start,
8484 Py_ssize_t end,
8485 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008486{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008487 int kind_self;
8488 int kind_sub;
8489 void *data_self;
8490 void *data_sub;
8491 Py_ssize_t offset;
8492 Py_ssize_t i;
8493 Py_ssize_t end_sub;
8494
8495 if (PyUnicode_READY(self) == -1 ||
8496 PyUnicode_READY(substring) == -1)
8497 return 0;
8498
8499 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008500 return 1;
8501
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008502 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
8503 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008504 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00008505 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008506
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008507 kind_self = PyUnicode_KIND(self);
8508 data_self = PyUnicode_DATA(self);
8509 kind_sub = PyUnicode_KIND(substring);
8510 data_sub = PyUnicode_DATA(substring);
8511 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
8512
8513 if (direction > 0)
8514 offset = end;
8515 else
8516 offset = start;
8517
8518 if (PyUnicode_READ(kind_self, data_self, offset) ==
8519 PyUnicode_READ(kind_sub, data_sub, 0) &&
8520 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
8521 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
8522 /* If both are of the same kind, memcmp is sufficient */
8523 if (kind_self == kind_sub) {
8524 return ! memcmp((char *)data_self +
8525 (offset * PyUnicode_CHARACTER_SIZE(substring)),
8526 data_sub,
8527 PyUnicode_GET_LENGTH(substring) *
8528 PyUnicode_CHARACTER_SIZE(substring));
8529 }
8530 /* otherwise we have to compare each character by first accesing it */
8531 else {
8532 /* We do not need to compare 0 and len(substring)-1 because
8533 the if statement above ensured already that they are equal
8534 when we end up here. */
8535 // TODO: honor direction and do a forward or backwards search
8536 for (i = 1; i < end_sub; ++i) {
8537 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
8538 PyUnicode_READ(kind_sub, data_sub, i))
8539 return 0;
8540 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008541 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008542 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008543 }
8544
8545 return 0;
8546}
8547
Alexander Belopolsky40018472011-02-26 01:02:56 +00008548Py_ssize_t
8549PyUnicode_Tailmatch(PyObject *str,
8550 PyObject *substr,
8551 Py_ssize_t start,
8552 Py_ssize_t end,
8553 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008554{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008555 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008556
Guido van Rossumd57fd912000-03-10 22:53:23 +00008557 str = PyUnicode_FromObject(str);
8558 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008559 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008560 substr = PyUnicode_FromObject(substr);
8561 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008562 Py_DECREF(str);
8563 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008564 }
Tim Petersced69f82003-09-16 20:30:58 +00008565
Guido van Rossumd57fd912000-03-10 22:53:23 +00008566 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00008567 (PyUnicodeObject *)substr,
8568 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008569 Py_DECREF(str);
8570 Py_DECREF(substr);
8571 return result;
8572}
8573
Guido van Rossumd57fd912000-03-10 22:53:23 +00008574/* Apply fixfct filter to the Unicode object self and return a
8575 reference to the modified object */
8576
Alexander Belopolsky40018472011-02-26 01:02:56 +00008577static PyObject *
8578fixup(PyUnicodeObject *self,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008579 Py_UCS4 (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008580{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008581 PyObject *u;
8582 Py_UCS4 maxchar_old, maxchar_new = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008583
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008584 if (PyUnicode_READY(self) == -1)
8585 return NULL;
8586 maxchar_old = PyUnicode_MAX_CHAR_VALUE(self);
8587 u = PyUnicode_New(PyUnicode_GET_LENGTH(self),
8588 maxchar_old);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008589 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008590 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008591
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008592 Py_MEMCPY(PyUnicode_1BYTE_DATA(u), PyUnicode_1BYTE_DATA(self),
8593 PyUnicode_GET_LENGTH(u) * PyUnicode_CHARACTER_SIZE(u));
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008594
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008595 /* fix functions return the new maximum character in a string,
8596 if the kind of the resulting unicode object does not change,
8597 everything is fine. Otherwise we need to change the string kind
8598 and re-run the fix function. */
8599 maxchar_new = fixfct((PyUnicodeObject*)u);
8600 if (maxchar_new == 0)
8601 /* do nothing, keep maxchar_new at 0 which means no changes. */;
8602 else if (maxchar_new <= 127)
8603 maxchar_new = 127;
8604 else if (maxchar_new <= 255)
8605 maxchar_new = 255;
8606 else if (maxchar_new <= 65535)
8607 maxchar_new = 65535;
8608 else
8609 maxchar_new = 1114111; /* 0x10ffff */
8610
8611 if (!maxchar_new && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008612 /* fixfct should return TRUE if it modified the buffer. If
8613 FALSE, return a reference to the original buffer instead
8614 (to save space, not time) */
8615 Py_INCREF(self);
8616 Py_DECREF(u);
8617 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008618 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008619 else if (maxchar_new == maxchar_old) {
8620 return u;
8621 }
8622 else {
8623 /* In case the maximum character changed, we need to
8624 convert the string to the new category. */
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008625 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008626 if (v == NULL) {
8627 Py_DECREF(u);
8628 return NULL;
8629 }
8630 if (maxchar_new > maxchar_old) {
8631 /* If the maxchar increased so that the kind changed, not all
8632 characters are representable anymore and we need to fix the
8633 string again. This only happens in very few cases. */
Victor Stinner157f83f2011-09-28 21:41:31 +02008634 if (PyUnicode_CopyCharacters(v, 0,
8635 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008636 PyUnicode_GET_LENGTH(self)) < 0)
8637 {
8638 Py_DECREF(u);
8639 return NULL;
8640 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008641 maxchar_old = fixfct((PyUnicodeObject*)v);
8642 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
8643 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008644 else {
Victor Stinner157f83f2011-09-28 21:41:31 +02008645 if (PyUnicode_CopyCharacters(v, 0,
8646 u, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008647 PyUnicode_GET_LENGTH(self)) < 0)
8648 {
8649 Py_DECREF(u);
8650 return NULL;
8651 }
8652 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008653
8654 Py_DECREF(u);
8655 return v;
8656 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008657}
8658
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008659static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008660fixupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008661{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008662 /* No need to call PyUnicode_READY(self) because this function is only
8663 called as a callback from fixup() which does it already. */
8664 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8665 const int kind = PyUnicode_KIND(self);
8666 void *data = PyUnicode_DATA(self);
8667 int touched = 0;
8668 Py_UCS4 maxchar = 0;
8669 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008670
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008671 for (i = 0; i < len; ++i) {
8672 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8673 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
8674 if (up != ch) {
8675 if (up > maxchar)
8676 maxchar = up;
8677 PyUnicode_WRITE(kind, data, i, up);
8678 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008679 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008680 else if (ch > maxchar)
8681 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008682 }
8683
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008684 if (touched)
8685 return maxchar;
8686 else
8687 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008688}
8689
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008690static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008691fixlower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008692{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008693 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8694 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8695 const int kind = PyUnicode_KIND(self);
8696 void *data = PyUnicode_DATA(self);
8697 int touched = 0;
8698 Py_UCS4 maxchar = 0;
8699 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008700
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008701 for(i = 0; i < len; ++i) {
8702 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8703 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8704 if (lo != ch) {
8705 if (lo > maxchar)
8706 maxchar = lo;
8707 PyUnicode_WRITE(kind, data, i, lo);
8708 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008709 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008710 else if (ch > maxchar)
8711 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008712 }
8713
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008714 if (touched)
8715 return maxchar;
8716 else
8717 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008718}
8719
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008720static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008721fixswapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008722{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008723 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8724 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8725 const int kind = PyUnicode_KIND(self);
8726 void *data = PyUnicode_DATA(self);
8727 int touched = 0;
8728 Py_UCS4 maxchar = 0;
8729 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008730
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008731 for(i = 0; i < len; ++i) {
8732 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8733 Py_UCS4 nu = 0;
8734
8735 if (Py_UNICODE_ISUPPER(ch))
8736 nu = Py_UNICODE_TOLOWER(ch);
8737 else if (Py_UNICODE_ISLOWER(ch))
8738 nu = Py_UNICODE_TOUPPER(ch);
8739
8740 if (nu != 0) {
8741 if (nu > maxchar)
8742 maxchar = nu;
8743 PyUnicode_WRITE(kind, data, i, nu);
8744 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008745 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008746 else if (ch > maxchar)
8747 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008748 }
8749
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008750 if (touched)
8751 return maxchar;
8752 else
8753 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008754}
8755
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008756static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008757fixcapitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008758{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008759 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8760 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8761 const int kind = PyUnicode_KIND(self);
8762 void *data = PyUnicode_DATA(self);
8763 int touched = 0;
8764 Py_UCS4 maxchar = 0;
8765 Py_ssize_t i = 0;
8766 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00008767
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008768 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008769 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008770
8771 ch = PyUnicode_READ(kind, data, i);
8772 if (!Py_UNICODE_ISUPPER(ch)) {
8773 maxchar = Py_UNICODE_TOUPPER(ch);
8774 PyUnicode_WRITE(kind, data, i, maxchar);
8775 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008776 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008777 ++i;
8778 for(; i < len; ++i) {
8779 ch = PyUnicode_READ(kind, data, i);
8780 if (!Py_UNICODE_ISLOWER(ch)) {
8781 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8782 if (lo > maxchar)
8783 maxchar = lo;
8784 PyUnicode_WRITE(kind, data, i, lo);
8785 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008786 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008787 else if (ch > maxchar)
8788 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008789 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008790
8791 if (touched)
8792 return maxchar;
8793 else
8794 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008795}
8796
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008797static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008798fixtitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008799{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008800 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8801 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8802 const int kind = PyUnicode_KIND(self);
8803 void *data = PyUnicode_DATA(self);
8804 Py_UCS4 maxchar = 0;
8805 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008806 int previous_is_cased;
8807
8808 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008809 if (len == 1) {
8810 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8811 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
8812 if (ti != ch) {
8813 PyUnicode_WRITE(kind, data, i, ti);
8814 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00008815 }
8816 else
8817 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008818 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008819 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008820 for(; i < len; ++i) {
8821 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8822 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00008823
Benjamin Peterson29060642009-01-31 22:14:21 +00008824 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008825 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00008826 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008827 nu = Py_UNICODE_TOTITLE(ch);
8828
8829 if (nu > maxchar)
8830 maxchar = nu;
8831 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00008832
Benjamin Peterson29060642009-01-31 22:14:21 +00008833 if (Py_UNICODE_ISLOWER(ch) ||
8834 Py_UNICODE_ISUPPER(ch) ||
8835 Py_UNICODE_ISTITLE(ch))
8836 previous_is_cased = 1;
8837 else
8838 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008839 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008840 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008841}
8842
Tim Peters8ce9f162004-08-27 01:49:32 +00008843PyObject *
8844PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008845{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008846 PyObject *sep = NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008847 Py_ssize_t seplen = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008848 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00008849 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008850 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
8851 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00008852 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008853 Py_ssize_t sz, i, res_offset;
8854 Py_UCS4 maxchar = 0;
8855 Py_UCS4 item_maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008856
Tim Peters05eba1f2004-08-27 21:32:02 +00008857 fseq = PySequence_Fast(seq, "");
8858 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008859 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00008860 }
8861
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008862 /* NOTE: the following code can't call back into Python code,
8863 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00008864 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008865
Tim Peters05eba1f2004-08-27 21:32:02 +00008866 seqlen = PySequence_Fast_GET_SIZE(fseq);
8867 /* If empty sequence, return u"". */
8868 if (seqlen == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008869 res = PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008870 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00008871 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008872 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00008873 /* If singleton sequence with an exact Unicode, return that. */
8874 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008875 item = items[0];
8876 if (PyUnicode_CheckExact(item)) {
8877 Py_INCREF(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008878 res = item;
Benjamin Peterson29060642009-01-31 22:14:21 +00008879 goto Done;
8880 }
Tim Peters8ce9f162004-08-27 01:49:32 +00008881 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008882 else {
8883 /* Set up sep and seplen */
8884 if (separator == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008885 /* fall back to a blank space separator */
8886 sep = PyUnicode_FromOrdinal(' ');
Victor Stinnere9a29352011-10-01 02:14:59 +02008887 if (!sep)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008888 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00008889 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008890 else {
8891 if (!PyUnicode_Check(separator)) {
8892 PyErr_Format(PyExc_TypeError,
8893 "separator: expected str instance,"
8894 " %.80s found",
8895 Py_TYPE(separator)->tp_name);
8896 goto onError;
8897 }
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02008898 if (PyUnicode_READY(separator))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008899 goto onError;
8900 sep = separator;
8901 seplen = PyUnicode_GET_LENGTH(separator);
8902 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
8903 /* inc refcount to keep this code path symetric with the
8904 above case of a blank separator */
8905 Py_INCREF(sep);
Tim Peters05eba1f2004-08-27 21:32:02 +00008906 }
8907 }
8908
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008909 /* There are at least two things to join, or else we have a subclass
8910 * of str in the sequence.
8911 * Do a pre-pass to figure out the total amount of space we'll
8912 * need (sz), and see whether all argument are strings.
8913 */
8914 sz = 0;
8915 for (i = 0; i < seqlen; i++) {
8916 const Py_ssize_t old_sz = sz;
8917 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00008918 if (!PyUnicode_Check(item)) {
8919 PyErr_Format(PyExc_TypeError,
8920 "sequence item %zd: expected str instance,"
8921 " %.80s found",
8922 i, Py_TYPE(item)->tp_name);
8923 goto onError;
8924 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008925 if (PyUnicode_READY(item) == -1)
8926 goto onError;
8927 sz += PyUnicode_GET_LENGTH(item);
8928 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
8929 if (item_maxchar > maxchar)
8930 maxchar = item_maxchar;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008931 if (i != 0)
8932 sz += seplen;
8933 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
8934 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008935 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008936 goto onError;
8937 }
8938 }
Tim Petersced69f82003-09-16 20:30:58 +00008939
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008940 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008941 if (res == NULL)
8942 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00008943
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008944 /* Catenate everything. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008945 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinner9ce5a832011-10-03 23:36:02 +02008946 Py_ssize_t itemlen, copied;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008947 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00008948 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02008949 if (i && seplen != 0) {
8950 copied = PyUnicode_CopyCharacters(res, res_offset,
8951 sep, 0, seplen);
8952 if (copied < 0)
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008953 goto onError;
Victor Stinner9ce5a832011-10-03 23:36:02 +02008954#ifdef Py_DEBUG
8955 res_offset += copied;
8956#else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008957 res_offset += seplen;
Victor Stinner9ce5a832011-10-03 23:36:02 +02008958#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00008959 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02008960 itemlen = PyUnicode_GET_LENGTH(item);
8961 if (itemlen != 0) {
8962 copied = PyUnicode_CopyCharacters(res, res_offset,
8963 item, 0, itemlen);
8964 if (copied < 0)
8965 goto onError;
8966#ifdef Py_DEBUG
8967 res_offset += copied;
8968#else
8969 res_offset += itemlen;
8970#endif
8971 }
Tim Peters05eba1f2004-08-27 21:32:02 +00008972 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008973 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00008974
Benjamin Peterson29060642009-01-31 22:14:21 +00008975 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00008976 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008977 Py_XDECREF(sep);
8978 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008979
Benjamin Peterson29060642009-01-31 22:14:21 +00008980 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00008981 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008982 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00008983 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008984 return NULL;
8985}
8986
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008987#define FILL(kind, data, value, start, length) \
8988 do { \
8989 Py_ssize_t i_ = 0; \
8990 assert(kind != PyUnicode_WCHAR_KIND); \
8991 switch ((kind)) { \
8992 case PyUnicode_1BYTE_KIND: { \
8993 unsigned char * to_ = (unsigned char *)((data)) + (start); \
8994 memset(to_, (unsigned char)value, length); \
8995 break; \
8996 } \
8997 case PyUnicode_2BYTE_KIND: { \
8998 Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
8999 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
9000 break; \
9001 } \
9002 default: { \
9003 Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
9004 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
9005 break; \
9006 } \
9007 } \
9008 } while (0)
9009
Alexander Belopolsky40018472011-02-26 01:02:56 +00009010static PyUnicodeObject *
9011pad(PyUnicodeObject *self,
9012 Py_ssize_t left,
9013 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009014 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009015{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009016 PyObject *u;
9017 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009018 int kind;
9019 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009020
9021 if (left < 0)
9022 left = 0;
9023 if (right < 0)
9024 right = 0;
9025
Tim Peters7a29bd52001-09-12 03:03:31 +00009026 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009027 Py_INCREF(self);
9028 return self;
9029 }
9030
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009031 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9032 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009033 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9034 return NULL;
9035 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009036 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9037 if (fill > maxchar)
9038 maxchar = fill;
9039 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009040 if (!u)
9041 return NULL;
9042
9043 kind = PyUnicode_KIND(u);
9044 data = PyUnicode_DATA(u);
9045 if (left)
9046 FILL(kind, data, fill, 0, left);
9047 if (right)
9048 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinner157f83f2011-09-28 21:41:31 +02009049 if (PyUnicode_CopyCharacters(u, left,
9050 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009051 _PyUnicode_LENGTH(self)) < 0)
9052 {
9053 Py_DECREF(u);
9054 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009055 }
9056
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009057 return (PyUnicodeObject*)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009058}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009059#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00009060
Alexander Belopolsky40018472011-02-26 01:02:56 +00009061PyObject *
9062PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009063{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009064 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009065
9066 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009067 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009068 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009069
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009070 switch(PyUnicode_KIND(string)) {
9071 case PyUnicode_1BYTE_KIND:
9072 list = ucs1lib_splitlines(
9073 (PyObject*) string, PyUnicode_1BYTE_DATA(string),
9074 PyUnicode_GET_LENGTH(string), keepends);
9075 break;
9076 case PyUnicode_2BYTE_KIND:
9077 list = ucs2lib_splitlines(
9078 (PyObject*) string, PyUnicode_2BYTE_DATA(string),
9079 PyUnicode_GET_LENGTH(string), keepends);
9080 break;
9081 case PyUnicode_4BYTE_KIND:
9082 list = ucs4lib_splitlines(
9083 (PyObject*) string, PyUnicode_4BYTE_DATA(string),
9084 PyUnicode_GET_LENGTH(string), keepends);
9085 break;
9086 default:
9087 assert(0);
9088 list = 0;
9089 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009090 Py_DECREF(string);
9091 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009092}
9093
Alexander Belopolsky40018472011-02-26 01:02:56 +00009094static PyObject *
9095split(PyUnicodeObject *self,
9096 PyUnicodeObject *substring,
9097 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009098{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009099 int kind1, kind2, kind;
9100 void *buf1, *buf2;
9101 Py_ssize_t len1, len2;
9102 PyObject* out;
9103
Guido van Rossumd57fd912000-03-10 22:53:23 +00009104 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009105 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009106
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009107 if (PyUnicode_READY(self) == -1)
9108 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009109
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009110 if (substring == NULL)
9111 switch(PyUnicode_KIND(self)) {
9112 case PyUnicode_1BYTE_KIND:
9113 return ucs1lib_split_whitespace(
9114 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9115 PyUnicode_GET_LENGTH(self), maxcount
9116 );
9117 case PyUnicode_2BYTE_KIND:
9118 return ucs2lib_split_whitespace(
9119 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
9120 PyUnicode_GET_LENGTH(self), maxcount
9121 );
9122 case PyUnicode_4BYTE_KIND:
9123 return ucs4lib_split_whitespace(
9124 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
9125 PyUnicode_GET_LENGTH(self), maxcount
9126 );
9127 default:
9128 assert(0);
9129 return NULL;
9130 }
9131
9132 if (PyUnicode_READY(substring) == -1)
9133 return NULL;
9134
9135 kind1 = PyUnicode_KIND(self);
9136 kind2 = PyUnicode_KIND(substring);
9137 kind = kind1 > kind2 ? kind1 : kind2;
9138 buf1 = PyUnicode_DATA(self);
9139 buf2 = PyUnicode_DATA(substring);
9140 if (kind1 != kind)
9141 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9142 if (!buf1)
9143 return NULL;
9144 if (kind2 != kind)
9145 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9146 if (!buf2) {
9147 if (kind1 != kind) PyMem_Free(buf1);
9148 return NULL;
9149 }
9150 len1 = PyUnicode_GET_LENGTH(self);
9151 len2 = PyUnicode_GET_LENGTH(substring);
9152
9153 switch(kind) {
9154 case PyUnicode_1BYTE_KIND:
9155 out = ucs1lib_split(
9156 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9157 break;
9158 case PyUnicode_2BYTE_KIND:
9159 out = ucs2lib_split(
9160 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9161 break;
9162 case PyUnicode_4BYTE_KIND:
9163 out = ucs4lib_split(
9164 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9165 break;
9166 default:
9167 out = NULL;
9168 }
9169 if (kind1 != kind)
9170 PyMem_Free(buf1);
9171 if (kind2 != kind)
9172 PyMem_Free(buf2);
9173 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009174}
9175
Alexander Belopolsky40018472011-02-26 01:02:56 +00009176static PyObject *
9177rsplit(PyUnicodeObject *self,
9178 PyUnicodeObject *substring,
9179 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009180{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009181 int kind1, kind2, kind;
9182 void *buf1, *buf2;
9183 Py_ssize_t len1, len2;
9184 PyObject* out;
9185
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009186 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009187 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009188
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009189 if (PyUnicode_READY(self) == -1)
9190 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009191
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009192 if (substring == NULL)
9193 switch(PyUnicode_KIND(self)) {
9194 case PyUnicode_1BYTE_KIND:
9195 return ucs1lib_rsplit_whitespace(
9196 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9197 PyUnicode_GET_LENGTH(self), maxcount
9198 );
9199 case PyUnicode_2BYTE_KIND:
9200 return ucs2lib_rsplit_whitespace(
9201 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
9202 PyUnicode_GET_LENGTH(self), maxcount
9203 );
9204 case PyUnicode_4BYTE_KIND:
9205 return ucs4lib_rsplit_whitespace(
9206 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
9207 PyUnicode_GET_LENGTH(self), maxcount
9208 );
9209 default:
9210 assert(0);
9211 return NULL;
9212 }
9213
9214 if (PyUnicode_READY(substring) == -1)
9215 return NULL;
9216
9217 kind1 = PyUnicode_KIND(self);
9218 kind2 = PyUnicode_KIND(substring);
9219 kind = kind1 > kind2 ? kind1 : kind2;
9220 buf1 = PyUnicode_DATA(self);
9221 buf2 = PyUnicode_DATA(substring);
9222 if (kind1 != kind)
9223 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9224 if (!buf1)
9225 return NULL;
9226 if (kind2 != kind)
9227 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9228 if (!buf2) {
9229 if (kind1 != kind) PyMem_Free(buf1);
9230 return NULL;
9231 }
9232 len1 = PyUnicode_GET_LENGTH(self);
9233 len2 = PyUnicode_GET_LENGTH(substring);
9234
9235 switch(kind) {
9236 case PyUnicode_1BYTE_KIND:
9237 out = ucs1lib_rsplit(
9238 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9239 break;
9240 case PyUnicode_2BYTE_KIND:
9241 out = ucs2lib_rsplit(
9242 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9243 break;
9244 case PyUnicode_4BYTE_KIND:
9245 out = ucs4lib_rsplit(
9246 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9247 break;
9248 default:
9249 out = NULL;
9250 }
9251 if (kind1 != kind)
9252 PyMem_Free(buf1);
9253 if (kind2 != kind)
9254 PyMem_Free(buf2);
9255 return out;
9256}
9257
9258static Py_ssize_t
9259anylib_find(int kind, void *buf1, Py_ssize_t len1,
9260 void *buf2, Py_ssize_t len2, Py_ssize_t offset)
9261{
9262 switch(kind) {
9263 case PyUnicode_1BYTE_KIND:
9264 return ucs1lib_find(buf1, len1, buf2, len2, offset);
9265 case PyUnicode_2BYTE_KIND:
9266 return ucs2lib_find(buf1, len1, buf2, len2, offset);
9267 case PyUnicode_4BYTE_KIND:
9268 return ucs4lib_find(buf1, len1, buf2, len2, offset);
9269 }
9270 assert(0);
9271 return -1;
9272}
9273
9274static Py_ssize_t
9275anylib_count(int kind, void* sbuf, Py_ssize_t slen,
9276 void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
9277{
9278 switch(kind) {
9279 case PyUnicode_1BYTE_KIND:
9280 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
9281 case PyUnicode_2BYTE_KIND:
9282 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
9283 case PyUnicode_4BYTE_KIND:
9284 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
9285 }
9286 assert(0);
9287 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009288}
9289
Alexander Belopolsky40018472011-02-26 01:02:56 +00009290static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009291replace(PyObject *self, PyObject *str1,
9292 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009293{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009294 PyObject *u;
9295 char *sbuf = PyUnicode_DATA(self);
9296 char *buf1 = PyUnicode_DATA(str1);
9297 char *buf2 = PyUnicode_DATA(str2);
9298 int srelease = 0, release1 = 0, release2 = 0;
9299 int skind = PyUnicode_KIND(self);
9300 int kind1 = PyUnicode_KIND(str1);
9301 int kind2 = PyUnicode_KIND(str2);
9302 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
9303 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
9304 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009305
9306 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009307 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009308 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009309 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009310
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009311 if (skind < kind1)
9312 /* substring too wide to be present */
9313 goto nothing;
9314
9315 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00009316 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009317 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009318 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009319 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009320 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009321 /* replace characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009322 Py_UCS4 u1, u2, maxchar;
9323 int mayshrink, rkind;
9324 u1 = PyUnicode_READ_CHAR(str1, 0);
9325 if (!findchar(sbuf, PyUnicode_KIND(self),
9326 slen, u1, 1))
Thomas Wouters477c8d52006-05-27 19:21:47 +00009327 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009328 u2 = PyUnicode_READ_CHAR(str2, 0);
9329 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9330 /* Replacing u1 with u2 may cause a maxchar reduction in the
9331 result string. */
9332 mayshrink = maxchar > 127;
9333 if (u2 > maxchar) {
9334 maxchar = u2;
9335 mayshrink = 0;
9336 }
9337 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009338 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009339 goto error;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009340 if (PyUnicode_CopyCharacters(u, 0,
9341 (PyObject*)self, 0, slen) < 0)
9342 {
9343 Py_DECREF(u);
9344 return NULL;
9345 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009346 rkind = PyUnicode_KIND(u);
9347 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
9348 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009349 if (--maxcount < 0)
9350 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009351 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009352 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009353 if (mayshrink) {
9354 PyObject *tmp = u;
9355 u = PyUnicode_FromKindAndData(rkind, PyUnicode_DATA(tmp),
9356 PyUnicode_GET_LENGTH(tmp));
9357 Py_DECREF(tmp);
9358 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009359 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009360 int rkind = skind;
9361 char *res;
9362 if (kind1 < rkind) {
9363 /* widen substring */
9364 buf1 = _PyUnicode_AsKind(str1, rkind);
9365 if (!buf1) goto error;
9366 release1 = 1;
9367 }
9368 i = anylib_find(rkind, sbuf, slen, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009369 if (i < 0)
9370 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009371 if (rkind > kind2) {
9372 /* widen replacement */
9373 buf2 = _PyUnicode_AsKind(str2, rkind);
9374 if (!buf2) goto error;
9375 release2 = 1;
9376 }
9377 else if (rkind < kind2) {
9378 /* widen self and buf1 */
9379 rkind = kind2;
9380 if (release1) PyMem_Free(buf1);
9381 sbuf = _PyUnicode_AsKind(self, rkind);
9382 if (!sbuf) goto error;
9383 srelease = 1;
9384 buf1 = _PyUnicode_AsKind(str1, rkind);
9385 if (!buf1) goto error;
9386 release1 = 1;
9387 }
9388 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, slen));
9389 if (!res) {
9390 PyErr_NoMemory();
9391 goto error;
9392 }
9393 memcpy(res, sbuf, PyUnicode_KIND_SIZE(rkind, slen));
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009394 /* change everything in-place, starting with this one */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009395 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
9396 buf2,
9397 PyUnicode_KIND_SIZE(rkind, len2));
9398 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009399
9400 while ( --maxcount > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009401 i = anylib_find(rkind, sbuf+PyUnicode_KIND_SIZE(rkind, i),
9402 slen-i,
9403 buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009404 if (i == -1)
9405 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009406 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
9407 buf2,
9408 PyUnicode_KIND_SIZE(rkind, len2));
9409 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009410 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009411
9412 u = PyUnicode_FromKindAndData(rkind, res, slen);
9413 PyMem_Free(res);
9414 if (!u) goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009415 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009416 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009417
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009418 Py_ssize_t n, i, j, ires;
9419 Py_ssize_t product, new_size;
9420 int rkind = skind;
9421 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009422
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009423 if (kind1 < rkind) {
9424 buf1 = _PyUnicode_AsKind(str1, rkind);
9425 if (!buf1) goto error;
9426 release1 = 1;
9427 }
9428 n = anylib_count(rkind, sbuf, slen, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009429 if (n == 0)
9430 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009431 if (kind2 < rkind) {
9432 buf2 = _PyUnicode_AsKind(str2, rkind);
9433 if (!buf2) goto error;
9434 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009435 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009436 else if (kind2 > rkind) {
9437 rkind = kind2;
9438 sbuf = _PyUnicode_AsKind(self, rkind);
9439 if (!sbuf) goto error;
9440 srelease = 1;
9441 if (release1) PyMem_Free(buf1);
9442 buf1 = _PyUnicode_AsKind(str1, rkind);
9443 if (!buf1) goto error;
9444 release1 = 1;
9445 }
9446 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
9447 PyUnicode_GET_LENGTH(str1))); */
9448 product = n * (len2-len1);
9449 if ((product / (len2-len1)) != n) {
9450 PyErr_SetString(PyExc_OverflowError,
9451 "replace string is too long");
9452 goto error;
9453 }
9454 new_size = slen + product;
9455 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
9456 PyErr_SetString(PyExc_OverflowError,
9457 "replace string is too long");
9458 goto error;
9459 }
9460 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, new_size));
9461 if (!res)
9462 goto error;
9463 ires = i = 0;
9464 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009465 while (n-- > 0) {
9466 /* look for next match */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009467 j = anylib_find(rkind,
9468 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9469 slen-i, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009470 if (j == -1)
9471 break;
9472 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009473 /* copy unchanged part [i:j] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009474 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9475 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9476 PyUnicode_KIND_SIZE(rkind, j-i));
9477 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009478 }
9479 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009480 if (len2 > 0) {
9481 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9482 buf2,
9483 PyUnicode_KIND_SIZE(rkind, len2));
9484 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009485 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009486 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009487 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009488 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009489 /* copy tail [i:] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009490 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9491 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9492 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009493 } else {
9494 /* interleave */
9495 while (n > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009496 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9497 buf2,
9498 PyUnicode_KIND_SIZE(rkind, len2));
9499 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009500 if (--n <= 0)
9501 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009502 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9503 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9504 PyUnicode_KIND_SIZE(rkind, 1));
9505 ires++;
9506 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009507 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009508 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9509 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9510 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009511 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009512 u = PyUnicode_FromKindAndData(rkind, res, new_size);
Martin v. Löwis0b1d3482011-10-01 16:35:40 +02009513 PyMem_Free(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009514 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009515 if (srelease)
9516 PyMem_FREE(sbuf);
9517 if (release1)
9518 PyMem_FREE(buf1);
9519 if (release2)
9520 PyMem_FREE(buf2);
9521 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009522
Benjamin Peterson29060642009-01-31 22:14:21 +00009523 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00009524 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009525 if (srelease)
9526 PyMem_FREE(sbuf);
9527 if (release1)
9528 PyMem_FREE(buf1);
9529 if (release2)
9530 PyMem_FREE(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009531 if (PyUnicode_CheckExact(self)) {
9532 Py_INCREF(self);
9533 return (PyObject *) self;
9534 }
Victor Stinner034f6cf2011-09-30 02:26:44 +02009535 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009536 error:
9537 if (srelease && sbuf)
9538 PyMem_FREE(sbuf);
9539 if (release1 && buf1)
9540 PyMem_FREE(buf1);
9541 if (release2 && buf2)
9542 PyMem_FREE(buf2);
9543 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009544}
9545
9546/* --- Unicode Object Methods --------------------------------------------- */
9547
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009548PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009549 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009550\n\
9551Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009552characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009553
9554static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009555unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009556{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009557 return fixup(self, fixtitle);
9558}
9559
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009560PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009561 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009562\n\
9563Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00009564have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009565
9566static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009567unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009568{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009569 return fixup(self, fixcapitalize);
9570}
9571
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled (compiled out by the surrounding #if 0).
   Splits self on whitespace, capitalizes each word in place in the list,
   then joins the words with the default separator of PyUnicode_Join(). */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *word;
    Py_ssize_t pos;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word */
    for (pos = 0; pos < PyList_GET_SIZE(words); pos++) {
        word = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, pos),
                     fixcapitalize);
        if (word == NULL)
            goto onError;
        /* PyList_SET_ITEM does not release the old entry, so drop it
           explicitly before overwriting the slot. */
        Py_DECREF(PyList_GET_ITEM(words, pos));
        PyList_SET_ITEM(words, pos, word);
    }

    /* Join the words to form a new string */
    word = PyUnicode_Join(NULL, words);

  onError:
    /* On the failure path word is NULL, which becomes the return value. */
    Py_DECREF(words);
    return (PyObject *)word;
}
#endif
9609
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009610/* Argument converter. Coerces to a single unicode character */
9611
9612static int
9613convert_uc(PyObject *obj, void *addr)
9614{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009615 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009616 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009617
Benjamin Peterson14339b62009-01-31 16:36:08 +00009618 uniobj = PyUnicode_FromObject(obj);
9619 if (uniobj == NULL) {
9620 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009621 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009622 return 0;
9623 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009624 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009625 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009626 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009627 Py_DECREF(uniobj);
9628 return 0;
9629 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009630 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009631 Py_DECREF(uniobj);
9632 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009633}
9634
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009635PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009636 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009637\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009638Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009639done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009640
9641static PyObject *
9642unicode_center(PyUnicodeObject *self, PyObject *args)
9643{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009644 Py_ssize_t marg, left;
9645 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009646 Py_UCS4 fillchar = ' ';
9647
Victor Stinnere9a29352011-10-01 02:14:59 +02009648 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009649 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009650
Victor Stinnere9a29352011-10-01 02:14:59 +02009651 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009652 return NULL;
9653
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009654 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009655 Py_INCREF(self);
9656 return (PyObject*) self;
9657 }
9658
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009659 marg = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009660 left = marg / 2 + (marg & width & 1);
9661
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009662 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009663}
9664
Marc-André Lemburge5034372000-08-08 08:04:29 +00009665#if 0
9666
9667/* This code should go into some future Unicode collation support
9668 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00009669 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00009670
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009671/* speedy UTF-16 code point order comparison */
9672/* gleaned from: */
9673/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
9674
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009675static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009676{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009677 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00009678 0, 0, 0, 0, 0, 0, 0, 0,
9679 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009680 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009681};
9682
Guido van Rossumd57fd912000-03-10 22:53:23 +00009683static int
9684unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9685{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009686 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009687
Guido van Rossumd57fd912000-03-10 22:53:23 +00009688 Py_UNICODE *s1 = str1->str;
9689 Py_UNICODE *s2 = str2->str;
9690
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009691 len1 = str1->_base._base.length;
9692 len2 = str2->_base._base.length;
Tim Petersced69f82003-09-16 20:30:58 +00009693
Guido van Rossumd57fd912000-03-10 22:53:23 +00009694 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00009695 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009696
9697 c1 = *s1++;
9698 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00009699
Benjamin Peterson29060642009-01-31 22:14:21 +00009700 if (c1 > (1<<11) * 26)
9701 c1 += utf16Fixup[c1>>11];
9702 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009703 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009704 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00009705
9706 if (c1 != c2)
9707 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00009708
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009709 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009710 }
9711
9712 return (len1 < len2) ? -1 : (len1 != len2);
9713}
9714
Marc-André Lemburge5034372000-08-08 08:04:29 +00009715#else
9716
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009717/* This function assumes that str1 and str2 are readied by the caller. */
9718
Marc-André Lemburge5034372000-08-08 08:04:29 +00009719static int
9720unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9721{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009722 int kind1, kind2;
9723 void *data1, *data2;
9724 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009725
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009726 kind1 = PyUnicode_KIND(str1);
9727 kind2 = PyUnicode_KIND(str2);
9728 data1 = PyUnicode_DATA(str1);
9729 data2 = PyUnicode_DATA(str2);
9730 len1 = PyUnicode_GET_LENGTH(str1);
9731 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +00009732
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009733 for (i = 0; i < len1 && i < len2; ++i) {
9734 Py_UCS4 c1, c2;
9735 c1 = PyUnicode_READ(kind1, data1, i);
9736 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +00009737
9738 if (c1 != c2)
9739 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009740 }
9741
9742 return (len1 < len2) ? -1 : (len1 != len2);
9743}
9744
9745#endif
9746
Alexander Belopolsky40018472011-02-26 01:02:56 +00009747int
9748PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009749{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009750 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9751 if (PyUnicode_READY(left) == -1 ||
9752 PyUnicode_READY(right) == -1)
9753 return -1;
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009754 return unicode_compare((PyUnicodeObject *)left,
9755 (PyUnicodeObject *)right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009756 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009757 PyErr_Format(PyExc_TypeError,
9758 "Can't compare %.100s and %.100s",
9759 left->ob_type->tp_name,
9760 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009761 return -1;
9762}
9763
Martin v. Löwis5b222132007-06-10 09:51:05 +00009764int
9765PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
9766{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009767 Py_ssize_t i;
9768 int kind;
9769 void *data;
9770 Py_UCS4 chr;
9771
Victor Stinner910337b2011-10-03 03:20:16 +02009772 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009773 if (PyUnicode_READY(uni) == -1)
9774 return -1;
9775 kind = PyUnicode_KIND(uni);
9776 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +00009777 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009778 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
9779 if (chr != str[i])
9780 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00009781 /* This check keeps Python strings that end in '\0' from comparing equal
9782 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009783 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +00009784 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009785 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00009786 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009787 return 0;
9788}
9789
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009790
Benjamin Peterson29060642009-01-31 22:14:21 +00009791#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00009792 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009793
Alexander Belopolsky40018472011-02-26 01:02:56 +00009794PyObject *
9795PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009796{
9797 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009798
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009799 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9800 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009801 if (PyUnicode_READY(left) == -1 ||
9802 PyUnicode_READY(right) == -1)
9803 return NULL;
9804 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
9805 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009806 if (op == Py_EQ) {
9807 Py_INCREF(Py_False);
9808 return Py_False;
9809 }
9810 if (op == Py_NE) {
9811 Py_INCREF(Py_True);
9812 return Py_True;
9813 }
9814 }
9815 if (left == right)
9816 result = 0;
9817 else
9818 result = unicode_compare((PyUnicodeObject *)left,
9819 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009820
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009821 /* Convert the return value to a Boolean */
9822 switch (op) {
9823 case Py_EQ:
9824 v = TEST_COND(result == 0);
9825 break;
9826 case Py_NE:
9827 v = TEST_COND(result != 0);
9828 break;
9829 case Py_LE:
9830 v = TEST_COND(result <= 0);
9831 break;
9832 case Py_GE:
9833 v = TEST_COND(result >= 0);
9834 break;
9835 case Py_LT:
9836 v = TEST_COND(result == -1);
9837 break;
9838 case Py_GT:
9839 v = TEST_COND(result == 1);
9840 break;
9841 default:
9842 PyErr_BadArgument();
9843 return NULL;
9844 }
9845 Py_INCREF(v);
9846 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009847 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00009848
Brian Curtindfc80e32011-08-10 20:28:54 -05009849 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009850}
9851
Alexander Belopolsky40018472011-02-26 01:02:56 +00009852int
9853PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00009854{
Thomas Wouters477c8d52006-05-27 19:21:47 +00009855 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009856 int kind1, kind2, kind;
9857 void *buf1, *buf2;
9858 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009859 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009860
9861 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00009862 sub = PyUnicode_FromObject(element);
9863 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009864 PyErr_Format(PyExc_TypeError,
9865 "'in <string>' requires string as left operand, not %s",
9866 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009867 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009868 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009869 if (PyUnicode_READY(sub) == -1)
9870 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009871
Thomas Wouters477c8d52006-05-27 19:21:47 +00009872 str = PyUnicode_FromObject(container);
Victor Stinnere9a29352011-10-01 02:14:59 +02009873 if (!str || PyUnicode_READY(str) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009874 Py_DECREF(sub);
9875 return -1;
9876 }
9877
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009878 kind1 = PyUnicode_KIND(str);
9879 kind2 = PyUnicode_KIND(sub);
9880 kind = kind1 > kind2 ? kind1 : kind2;
9881 buf1 = PyUnicode_DATA(str);
9882 buf2 = PyUnicode_DATA(sub);
9883 if (kind1 != kind)
9884 buf1 = _PyUnicode_AsKind((PyObject*)str, kind);
9885 if (!buf1) {
9886 Py_DECREF(sub);
9887 return -1;
9888 }
9889 if (kind2 != kind)
9890 buf2 = _PyUnicode_AsKind((PyObject*)sub, kind);
9891 if (!buf2) {
9892 Py_DECREF(sub);
9893 if (kind1 != kind) PyMem_Free(buf1);
9894 return -1;
9895 }
9896 len1 = PyUnicode_GET_LENGTH(str);
9897 len2 = PyUnicode_GET_LENGTH(sub);
9898
9899 switch(kind) {
9900 case PyUnicode_1BYTE_KIND:
9901 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
9902 break;
9903 case PyUnicode_2BYTE_KIND:
9904 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
9905 break;
9906 case PyUnicode_4BYTE_KIND:
9907 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
9908 break;
9909 default:
9910 result = -1;
9911 assert(0);
9912 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009913
9914 Py_DECREF(str);
9915 Py_DECREF(sub);
9916
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009917 if (kind1 != kind)
9918 PyMem_Free(buf1);
9919 if (kind2 != kind)
9920 PyMem_Free(buf2);
9921
Guido van Rossum403d68b2000-03-13 15:55:09 +00009922 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009923}
9924
Guido van Rossumd57fd912000-03-10 22:53:23 +00009925/* Concat to string or Unicode object giving a new Unicode object. */
9926
Alexander Belopolsky40018472011-02-26 01:02:56 +00009927PyObject *
9928PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009929{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009930 PyObject *u = NULL, *v = NULL, *w;
9931 Py_UCS4 maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009932
9933 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009934 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009935 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009936 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009937 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009938 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009939 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009940
9941 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +02009942 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009943 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009944 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009945 }
Victor Stinnera464fc12011-10-02 20:39:30 +02009946 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009947 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009948 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009949 }
9950
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009951 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinnerff9e50f2011-09-28 22:17:19 +02009952 maxchar = Py_MAX(maxchar, PyUnicode_MAX_CHAR_VALUE(v));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009953
Guido van Rossumd57fd912000-03-10 22:53:23 +00009954 /* Concat the two Unicode strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009955 w = PyUnicode_New(
9956 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
9957 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009958 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009959 goto onError;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009960 if (PyUnicode_CopyCharacters(w, 0, u, 0, PyUnicode_GET_LENGTH(u)) < 0)
9961 goto onError;
Victor Stinner157f83f2011-09-28 21:41:31 +02009962 if (PyUnicode_CopyCharacters(w, PyUnicode_GET_LENGTH(u),
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009963 v, 0,
9964 PyUnicode_GET_LENGTH(v)) < 0)
9965 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009966 Py_DECREF(u);
9967 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009968 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009969
Benjamin Peterson29060642009-01-31 22:14:21 +00009970 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009971 Py_XDECREF(u);
9972 Py_XDECREF(v);
9973 return NULL;
9974}
9975
Victor Stinnerb0923652011-10-04 01:17:31 +02009976static void
9977unicode_append_inplace(PyObject **p_left, PyObject *right)
9978{
9979 Py_ssize_t left_len, right_len, new_len;
9980#ifdef Py_DEBUG
9981 Py_ssize_t copied;
9982#endif
9983
9984 assert(PyUnicode_IS_READY(*p_left));
9985 assert(PyUnicode_IS_READY(right));
9986
9987 left_len = PyUnicode_GET_LENGTH(*p_left);
9988 right_len = PyUnicode_GET_LENGTH(right);
9989 if (left_len > PY_SSIZE_T_MAX - right_len) {
9990 PyErr_SetString(PyExc_OverflowError,
9991 "strings are too large to concat");
9992 goto error;
9993 }
9994 new_len = left_len + right_len;
9995
9996 /* Now we own the last reference to 'left', so we can resize it
9997 * in-place.
9998 */
9999 if (unicode_resize(p_left, new_len) != 0) {
10000 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10001 * deallocated so it cannot be put back into
10002 * 'variable'. The MemoryError is raised when there
10003 * is no value in 'variable', which might (very
10004 * remotely) be a cause of incompatibilities.
10005 */
10006 goto error;
10007 }
10008 /* copy 'right' into the newly allocated area of 'left' */
10009#ifdef Py_DEBUG
10010 copied = PyUnicode_CopyCharacters(*p_left, left_len,
10011 right, 0,
10012 right_len);
10013 assert(0 <= copied);
10014#else
10015 PyUnicode_CopyCharacters(*p_left, left_len, right, 0, right_len);
10016#endif
10017 return;
10018
10019error:
10020 Py_DECREF(*p_left);
10021 *p_left = NULL;
10022}
10023
Walter Dörwald1ab83302007-05-18 17:15:44 +000010024void
Victor Stinner23e56682011-10-03 03:54:37 +020010025PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010026{
Victor Stinner23e56682011-10-03 03:54:37 +020010027 PyObject *left, *res;
10028
10029 if (p_left == NULL) {
10030 if (!PyErr_Occurred())
10031 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010032 return;
10033 }
Victor Stinner23e56682011-10-03 03:54:37 +020010034 left = *p_left;
10035 if (right == NULL || !PyUnicode_Check(left)) {
10036 if (!PyErr_Occurred())
10037 PyErr_BadInternalCall();
10038 goto error;
10039 }
10040
10041 if (PyUnicode_CheckExact(left) && left != unicode_empty
10042 && PyUnicode_CheckExact(right) && right != unicode_empty
10043 && unicode_resizable(left)
10044 && (_PyUnicode_KIND(right) <= _PyUnicode_KIND(left)
10045 || _PyUnicode_WSTR(left) != NULL))
10046 {
Victor Stinner23e56682011-10-03 03:54:37 +020010047 if (PyUnicode_READY(left))
10048 goto error;
10049 if (PyUnicode_READY(right))
10050 goto error;
10051
Victor Stinnerb0923652011-10-04 01:17:31 +020010052 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10053 to change the structure size, but characters are stored just after
10054 the structure, and so it requires to move all charactres which is
10055 not so different than duplicating the string. */
10056 if (!(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
Victor Stinner23e56682011-10-03 03:54:37 +020010057 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010058 unicode_append_inplace(p_left, right);
Victor Stinner23e56682011-10-03 03:54:37 +020010059 return;
10060 }
10061 }
10062
10063 res = PyUnicode_Concat(left, right);
10064 if (res == NULL)
10065 goto error;
10066 Py_DECREF(left);
10067 *p_left = res;
10068 return;
10069
10070error:
10071 Py_DECREF(*p_left);
10072 *p_left = NULL;
Walter Dörwald1ab83302007-05-18 17:15:44 +000010073}
10074
10075void
10076PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10077{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010078 PyUnicode_Append(pleft, right);
10079 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010080}
10081
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010082PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010083 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010084\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010085Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010086string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010087interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010088
10089static PyObject *
10090unicode_count(PyUnicodeObject *self, PyObject *args)
10091{
10092 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010093 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010094 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010095 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010096 int kind1, kind2, kind;
10097 void *buf1, *buf2;
10098 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010099
Jesus Ceaac451502011-04-20 17:09:23 +020010100 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10101 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010102 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010103
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010104 kind1 = PyUnicode_KIND(self);
10105 kind2 = PyUnicode_KIND(substring);
10106 kind = kind1 > kind2 ? kind1 : kind2;
10107 buf1 = PyUnicode_DATA(self);
10108 buf2 = PyUnicode_DATA(substring);
10109 if (kind1 != kind)
10110 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
10111 if (!buf1) {
10112 Py_DECREF(substring);
10113 return NULL;
10114 }
10115 if (kind2 != kind)
10116 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
10117 if (!buf2) {
10118 Py_DECREF(substring);
10119 if (kind1 != kind) PyMem_Free(buf1);
10120 return NULL;
10121 }
10122 len1 = PyUnicode_GET_LENGTH(self);
10123 len2 = PyUnicode_GET_LENGTH(substring);
10124
10125 ADJUST_INDICES(start, end, len1);
10126 switch(kind) {
10127 case PyUnicode_1BYTE_KIND:
10128 iresult = ucs1lib_count(
10129 ((Py_UCS1*)buf1) + start, end - start,
10130 buf2, len2, PY_SSIZE_T_MAX
10131 );
10132 break;
10133 case PyUnicode_2BYTE_KIND:
10134 iresult = ucs2lib_count(
10135 ((Py_UCS2*)buf1) + start, end - start,
10136 buf2, len2, PY_SSIZE_T_MAX
10137 );
10138 break;
10139 case PyUnicode_4BYTE_KIND:
10140 iresult = ucs4lib_count(
10141 ((Py_UCS4*)buf1) + start, end - start,
10142 buf2, len2, PY_SSIZE_T_MAX
10143 );
10144 break;
10145 default:
10146 assert(0); iresult = 0;
10147 }
10148
10149 result = PyLong_FromSsize_t(iresult);
10150
10151 if (kind1 != kind)
10152 PyMem_Free(buf1);
10153 if (kind2 != kind)
10154 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010155
10156 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010157
Guido van Rossumd57fd912000-03-10 22:53:23 +000010158 return result;
10159}
10160
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010161PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010162 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010163\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010164Encode S using the codec registered for encoding. Default encoding\n\
10165is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010166handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010167a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10168'xmlcharrefreplace' as well as any other name registered with\n\
10169codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010170
10171static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +000010172unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010173{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010174 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010175 char *encoding = NULL;
10176 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010177
Benjamin Peterson308d6372009-09-18 21:42:35 +000010178 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10179 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010180 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +000010181 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010182}
10183
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010184PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010185 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010186\n\
10187Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010188If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010189
10190static PyObject*
10191unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
10192{
Antoine Pitroue71d5742011-10-04 15:55:09 +020010193 Py_ssize_t i, j, line_pos, src_len, incr;
10194 Py_UCS4 ch;
10195 PyObject *u;
10196 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010197 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010198 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010199 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010200
10201 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010202 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010203
Antoine Pitrou22425222011-10-04 19:10:51 +020010204 if (PyUnicode_READY(self) == -1)
10205 return NULL;
10206
Thomas Wouters7e474022000-07-16 12:04:32 +000010207 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010208 src_len = PyUnicode_GET_LENGTH(self);
10209 i = j = line_pos = 0;
10210 kind = PyUnicode_KIND(self);
10211 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020010212 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010213 for (; i < src_len; i++) {
10214 ch = PyUnicode_READ(kind, src_data, i);
10215 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020010216 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000010217 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010218 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000010219 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010220 goto overflow;
10221 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010222 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010223 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010224 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010225 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000010226 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010227 goto overflow;
10228 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010229 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010230 if (ch == '\n' || ch == '\r')
10231 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010232 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010233 }
Antoine Pitroue19aa382011-10-04 16:04:01 +020010234 if (!found && PyUnicode_CheckExact(self)) {
10235 Py_INCREF((PyObject *) self);
10236 return (PyObject *) self;
10237 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +000010238
Guido van Rossumd57fd912000-03-10 22:53:23 +000010239 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010240 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010241 if (!u)
10242 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010243 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010244
Antoine Pitroue71d5742011-10-04 15:55:09 +020010245 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010246
Antoine Pitroue71d5742011-10-04 15:55:09 +020010247 for (; i < src_len; i++) {
10248 ch = PyUnicode_READ(kind, src_data, i);
10249 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010250 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010251 incr = tabsize - (line_pos % tabsize);
10252 line_pos += incr;
10253 while (incr--) {
10254 PyUnicode_WRITE(kind, dest_data, j, ' ');
10255 j++;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010256 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010257 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010258 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010259 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010260 line_pos++;
10261 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010262 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010263 if (ch == '\n' || ch == '\r')
10264 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010265 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010266 }
10267 assert (j == PyUnicode_GET_LENGTH(u));
10268 if (PyUnicode_READY(u)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010269 Py_DECREF(u);
10270 return NULL;
10271 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010272 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010273
Antoine Pitroue71d5742011-10-04 15:55:09 +020010274 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010275 PyErr_SetString(PyExc_OverflowError, "new string is too long");
10276 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010277}
10278
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010279PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010280 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010281\n\
10282Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010283such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010284arguments start and end are interpreted as in slice notation.\n\
10285\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010286Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010287
10288static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010289unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010290{
Jesus Ceaac451502011-04-20 17:09:23 +020010291 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010292 Py_ssize_t start;
10293 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010294 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010295
Jesus Ceaac451502011-04-20 17:09:23 +020010296 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
10297 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010298 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010299
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010300 if (PyUnicode_READY(self) == -1)
10301 return NULL;
10302 if (PyUnicode_READY(substring) == -1)
10303 return NULL;
10304
10305 result = any_find_slice(
10306 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
10307 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010308 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010309
10310 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010311
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010312 if (result == -2)
10313 return NULL;
10314
Christian Heimes217cfd12007-12-02 14:31:20 +000010315 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010316}
10317
10318static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010319unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010320{
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010321 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
10322 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010323 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010324 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010325}
10326
Guido van Rossumc2504932007-09-18 19:42:40 +000010327/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010010328 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000010329static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +000010330unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010331{
Guido van Rossumc2504932007-09-18 19:42:40 +000010332 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010010333 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010334
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010335 if (_PyUnicode_HASH(self) != -1)
10336 return _PyUnicode_HASH(self);
10337 if (PyUnicode_READY(self) == -1)
10338 return -1;
10339 len = PyUnicode_GET_LENGTH(self);
10340
10341 /* The hash function as a macro, gets expanded three times below. */
10342#define HASH(P) \
10343 x = (Py_uhash_t)*P << 7; \
10344 while (--len >= 0) \
10345 x = (1000003*x) ^ (Py_uhash_t)*P++;
10346
10347 switch (PyUnicode_KIND(self)) {
10348 case PyUnicode_1BYTE_KIND: {
10349 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
10350 HASH(c);
10351 break;
10352 }
10353 case PyUnicode_2BYTE_KIND: {
10354 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
10355 HASH(s);
10356 break;
10357 }
10358 default: {
10359 Py_UCS4 *l;
10360 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
10361 "Impossible switch case in unicode_hash");
10362 l = PyUnicode_4BYTE_DATA(self);
10363 HASH(l);
10364 break;
10365 }
10366 }
10367 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
10368
Guido van Rossumc2504932007-09-18 19:42:40 +000010369 if (x == -1)
10370 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010371 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010372 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010373}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010374#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000010375
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010376PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010377 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010378\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010379Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010380
10381static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010382unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010383{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010384 Py_ssize_t result;
Jesus Ceaac451502011-04-20 17:09:23 +020010385 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010386 Py_ssize_t start;
10387 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010388
Jesus Ceaac451502011-04-20 17:09:23 +020010389 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
10390 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010391 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010392
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010393 if (PyUnicode_READY(self) == -1)
10394 return NULL;
10395 if (PyUnicode_READY(substring) == -1)
10396 return NULL;
10397
10398 result = any_find_slice(
10399 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
10400 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010401 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010402
10403 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010404
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010405 if (result == -2)
10406 return NULL;
10407
Guido van Rossumd57fd912000-03-10 22:53:23 +000010408 if (result < 0) {
10409 PyErr_SetString(PyExc_ValueError, "substring not found");
10410 return NULL;
10411 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010412
Christian Heimes217cfd12007-12-02 14:31:20 +000010413 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010414}
10415
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010416PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010417 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010418\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010419Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010420at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010421
10422static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010423unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010424{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010425 Py_ssize_t i, length;
10426 int kind;
10427 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010428 int cased;
10429
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010430 if (PyUnicode_READY(self) == -1)
10431 return NULL;
10432 length = PyUnicode_GET_LENGTH(self);
10433 kind = PyUnicode_KIND(self);
10434 data = PyUnicode_DATA(self);
10435
Guido van Rossumd57fd912000-03-10 22:53:23 +000010436 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010437 if (length == 1)
10438 return PyBool_FromLong(
10439 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010440
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010441 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010442 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010443 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010444
Guido van Rossumd57fd912000-03-10 22:53:23 +000010445 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010446 for (i = 0; i < length; i++) {
10447 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010448
Benjamin Peterson29060642009-01-31 22:14:21 +000010449 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
10450 return PyBool_FromLong(0);
10451 else if (!cased && Py_UNICODE_ISLOWER(ch))
10452 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010453 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010454 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010455}
10456
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010457PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010458 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010459\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010460Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010461at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010462
10463static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010464unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010465{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010466 Py_ssize_t i, length;
10467 int kind;
10468 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010469 int cased;
10470
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010471 if (PyUnicode_READY(self) == -1)
10472 return NULL;
10473 length = PyUnicode_GET_LENGTH(self);
10474 kind = PyUnicode_KIND(self);
10475 data = PyUnicode_DATA(self);
10476
Guido van Rossumd57fd912000-03-10 22:53:23 +000010477 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010478 if (length == 1)
10479 return PyBool_FromLong(
10480 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010481
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010482 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010483 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010484 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010485
Guido van Rossumd57fd912000-03-10 22:53:23 +000010486 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010487 for (i = 0; i < length; i++) {
10488 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010489
Benjamin Peterson29060642009-01-31 22:14:21 +000010490 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
10491 return PyBool_FromLong(0);
10492 else if (!cased && Py_UNICODE_ISUPPER(ch))
10493 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010494 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010495 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010496}
10497
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010498PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010499 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010500\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010501Return True if S is a titlecased string and there is at least one\n\
10502character in S, i.e. upper- and titlecase characters may only\n\
10503follow uncased characters and lowercase characters only cased ones.\n\
10504Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010505
10506static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010507unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010508{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010509 Py_ssize_t i, length;
10510 int kind;
10511 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010512 int cased, previous_is_cased;
10513
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010514 if (PyUnicode_READY(self) == -1)
10515 return NULL;
10516 length = PyUnicode_GET_LENGTH(self);
10517 kind = PyUnicode_KIND(self);
10518 data = PyUnicode_DATA(self);
10519
Guido van Rossumd57fd912000-03-10 22:53:23 +000010520 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010521 if (length == 1) {
10522 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10523 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
10524 (Py_UNICODE_ISUPPER(ch) != 0));
10525 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010526
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010527 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010528 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010529 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010530
Guido van Rossumd57fd912000-03-10 22:53:23 +000010531 cased = 0;
10532 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010533 for (i = 0; i < length; i++) {
10534 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010535
Benjamin Peterson29060642009-01-31 22:14:21 +000010536 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
10537 if (previous_is_cased)
10538 return PyBool_FromLong(0);
10539 previous_is_cased = 1;
10540 cased = 1;
10541 }
10542 else if (Py_UNICODE_ISLOWER(ch)) {
10543 if (!previous_is_cased)
10544 return PyBool_FromLong(0);
10545 previous_is_cased = 1;
10546 cased = 1;
10547 }
10548 else
10549 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010550 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010551 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010552}
10553
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010554PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010555 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010556\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010557Return True if all characters in S are whitespace\n\
10558and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010559
10560static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010561unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010562{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010563 Py_ssize_t i, length;
10564 int kind;
10565 void *data;
10566
10567 if (PyUnicode_READY(self) == -1)
10568 return NULL;
10569 length = PyUnicode_GET_LENGTH(self);
10570 kind = PyUnicode_KIND(self);
10571 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010572
Guido van Rossumd57fd912000-03-10 22:53:23 +000010573 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010574 if (length == 1)
10575 return PyBool_FromLong(
10576 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010577
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010578 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010579 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010580 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010581
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010582 for (i = 0; i < length; i++) {
10583 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010584 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010585 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010586 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010587 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010588}
10589
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010590PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010591 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010592\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010593Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010594and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010595
10596static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010597unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010598{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010599 Py_ssize_t i, length;
10600 int kind;
10601 void *data;
10602
10603 if (PyUnicode_READY(self) == -1)
10604 return NULL;
10605 length = PyUnicode_GET_LENGTH(self);
10606 kind = PyUnicode_KIND(self);
10607 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010608
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010609 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010610 if (length == 1)
10611 return PyBool_FromLong(
10612 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010613
10614 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010615 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010616 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010617
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010618 for (i = 0; i < length; i++) {
10619 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010620 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010621 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010622 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010623}
10624
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010625PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010626 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010627\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010628Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010629and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010630
10631static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010632unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010633{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010634 int kind;
10635 void *data;
10636 Py_ssize_t len, i;
10637
10638 if (PyUnicode_READY(self) == -1)
10639 return NULL;
10640
10641 kind = PyUnicode_KIND(self);
10642 data = PyUnicode_DATA(self);
10643 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010644
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010645 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010646 if (len == 1) {
10647 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10648 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
10649 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010650
10651 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010652 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010653 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010654
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010655 for (i = 0; i < len; i++) {
10656 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010657 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010658 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010659 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010660 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010661}
10662
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010663PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010664 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010665\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010666Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010667False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010668
10669static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010670unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010671{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010672 Py_ssize_t i, length;
10673 int kind;
10674 void *data;
10675
10676 if (PyUnicode_READY(self) == -1)
10677 return NULL;
10678 length = PyUnicode_GET_LENGTH(self);
10679 kind = PyUnicode_KIND(self);
10680 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010681
Guido van Rossumd57fd912000-03-10 22:53:23 +000010682 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010683 if (length == 1)
10684 return PyBool_FromLong(
10685 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010686
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010687 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010688 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010689 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010690
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010691 for (i = 0; i < length; i++) {
10692 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010693 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010694 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010695 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010696}
10697
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010698PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010699 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010700\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010701Return True if all characters in S are digits\n\
10702and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010703
10704static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010705unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010706{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010707 Py_ssize_t i, length;
10708 int kind;
10709 void *data;
10710
10711 if (PyUnicode_READY(self) == -1)
10712 return NULL;
10713 length = PyUnicode_GET_LENGTH(self);
10714 kind = PyUnicode_KIND(self);
10715 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010716
Guido van Rossumd57fd912000-03-10 22:53:23 +000010717 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010718 if (length == 1) {
10719 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10720 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
10721 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010722
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010723 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010724 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010725 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010726
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010727 for (i = 0; i < length; i++) {
10728 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010729 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010730 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010731 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010732}
10733
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010734PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010735 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010736\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010737Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010738False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010739
10740static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010741unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010742{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010743 Py_ssize_t i, length;
10744 int kind;
10745 void *data;
10746
10747 if (PyUnicode_READY(self) == -1)
10748 return NULL;
10749 length = PyUnicode_GET_LENGTH(self);
10750 kind = PyUnicode_KIND(self);
10751 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010752
Guido van Rossumd57fd912000-03-10 22:53:23 +000010753 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010754 if (length == 1)
10755 return PyBool_FromLong(
10756 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010757
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010758 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010759 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010760 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010761
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010762 for (i = 0; i < length; i++) {
10763 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010764 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010765 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010766 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010767}
10768
Martin v. Löwis47383402007-08-15 07:32:56 +000010769int
10770PyUnicode_IsIdentifier(PyObject *self)
10771{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010772 int kind;
10773 void *data;
10774 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010775 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000010776
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010777 if (PyUnicode_READY(self) == -1) {
10778 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000010779 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010780 }
10781
10782 /* Special case for empty strings */
10783 if (PyUnicode_GET_LENGTH(self) == 0)
10784 return 0;
10785 kind = PyUnicode_KIND(self);
10786 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000010787
10788 /* PEP 3131 says that the first character must be in
10789 XID_Start and subsequent characters in XID_Continue,
10790 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000010791 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000010792 letters, digits, underscore). However, given the current
10793 definition of XID_Start and XID_Continue, it is sufficient
10794 to check just for these, except that _ must be allowed
10795 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010796 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050010797 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000010798 return 0;
10799
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040010800 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010801 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010802 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000010803 return 1;
10804}
10805
10806PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010807 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000010808\n\
10809Return True if S is a valid identifier according\n\
10810to the language definition.");
10811
10812static PyObject*
10813unicode_isidentifier(PyObject *self)
10814{
10815 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
10816}
10817
Georg Brandl559e5d72008-06-11 18:37:52 +000010818PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010819 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000010820\n\
10821Return True if all characters in S are considered\n\
10822printable in repr() or S is empty, False otherwise.");
10823
10824static PyObject*
10825unicode_isprintable(PyObject *self)
10826{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010827 Py_ssize_t i, length;
10828 int kind;
10829 void *data;
10830
10831 if (PyUnicode_READY(self) == -1)
10832 return NULL;
10833 length = PyUnicode_GET_LENGTH(self);
10834 kind = PyUnicode_KIND(self);
10835 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000010836
10837 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010838 if (length == 1)
10839 return PyBool_FromLong(
10840 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000010841
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010842 for (i = 0; i < length; i++) {
10843 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000010844 Py_RETURN_FALSE;
10845 }
10846 }
10847 Py_RETURN_TRUE;
10848}
10849
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010850PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000010851 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010852\n\
10853Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000010854iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010855
10856static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010857unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010858{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010859 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010860}
10861
Martin v. Löwis18e16552006-02-15 17:27:45 +000010862static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +000010863unicode_length(PyUnicodeObject *self)
10864{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010865 if (PyUnicode_READY(self) == -1)
10866 return -1;
10867 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010868}
10869
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010870PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010871 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010872\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000010873Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010874done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010875
10876static PyObject *
10877unicode_ljust(PyUnicodeObject *self, PyObject *args)
10878{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010879 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010880 Py_UCS4 fillchar = ' ';
10881
10882 if (PyUnicode_READY(self) == -1)
10883 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010884
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010885 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010886 return NULL;
10887
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010888 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010889 Py_INCREF(self);
10890 return (PyObject*) self;
10891 }
10892
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010893 return (PyObject*) pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010894}
10895
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010896PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010897 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010898\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010899Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010900
10901static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010902unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010903{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010904 return fixup(self, fixlower);
10905}
10906
/* Strip modes; the values double as indexes into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Format strings, one per strip mode and indexed by it. */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for strip mode i: the format string without its leading
   three characters ("|O:"). */
#define STRIPNAME(i) (stripformat[(i)] + 3)
10915
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010916/* externally visible for str.strip(unicode) */
10917PyObject *
10918_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
10919{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010920 void *data;
10921 int kind;
10922 Py_ssize_t i, j, len;
10923 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010924
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010925 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
10926 return NULL;
10927
10928 kind = PyUnicode_KIND(self);
10929 data = PyUnicode_DATA(self);
10930 len = PyUnicode_GET_LENGTH(self);
10931 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
10932 PyUnicode_DATA(sepobj),
10933 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010934
Benjamin Peterson14339b62009-01-31 16:36:08 +000010935 i = 0;
10936 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010937 while (i < len &&
10938 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010939 i++;
10940 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010941 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010942
Benjamin Peterson14339b62009-01-31 16:36:08 +000010943 j = len;
10944 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010945 do {
10946 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010947 } while (j >= i &&
10948 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000010949 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010950 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010951
Victor Stinner12bab6d2011-10-01 01:53:49 +020010952 return PyUnicode_Substring((PyObject*)self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010953}
10954
10955PyObject*
10956PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
10957{
10958 unsigned char *data;
10959 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020010960 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010961
Victor Stinnerde636f32011-10-01 03:55:54 +020010962 if (PyUnicode_READY(self) == -1)
10963 return NULL;
10964
10965 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
10966
Victor Stinner12bab6d2011-10-01 01:53:49 +020010967 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010968 {
Victor Stinner12bab6d2011-10-01 01:53:49 +020010969 if (PyUnicode_CheckExact(self)) {
10970 Py_INCREF(self);
10971 return self;
10972 }
10973 else
10974 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010975 }
10976
Victor Stinner12bab6d2011-10-01 01:53:49 +020010977 length = end - start;
10978 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010979 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010980
Victor Stinnerde636f32011-10-01 03:55:54 +020010981 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020010982 PyErr_SetString(PyExc_IndexError, "string index out of range");
10983 return NULL;
10984 }
10985
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010986 kind = PyUnicode_KIND(self);
10987 data = PyUnicode_1BYTE_DATA(self);
Victor Stinner034f6cf2011-09-30 02:26:44 +020010988 return PyUnicode_FromKindAndData(kind,
10989 data + PyUnicode_KIND_SIZE(kind, start),
Victor Stinner12bab6d2011-10-01 01:53:49 +020010990 length);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010991}
Guido van Rossumd57fd912000-03-10 22:53:23 +000010992
10993static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010994do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010995{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010996 int kind;
10997 void *data;
10998 Py_ssize_t len, i, j;
10999
11000 if (PyUnicode_READY(self) == -1)
11001 return NULL;
11002
11003 kind = PyUnicode_KIND(self);
11004 data = PyUnicode_DATA(self);
11005 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011006
Benjamin Peterson14339b62009-01-31 16:36:08 +000011007 i = 0;
11008 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011009 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011010 i++;
11011 }
11012 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011013
Benjamin Peterson14339b62009-01-31 16:36:08 +000011014 j = len;
11015 if (striptype != LEFTSTRIP) {
11016 do {
11017 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011018 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011019 j++;
11020 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011021
Victor Stinner12bab6d2011-10-01 01:53:49 +020011022 return PyUnicode_Substring((PyObject*)self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011023}
11024
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011025
11026static PyObject *
11027do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
11028{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011029 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011030
Benjamin Peterson14339b62009-01-31 16:36:08 +000011031 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11032 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011033
Benjamin Peterson14339b62009-01-31 16:36:08 +000011034 if (sep != NULL && sep != Py_None) {
11035 if (PyUnicode_Check(sep))
11036 return _PyUnicode_XStrip(self, striptype, sep);
11037 else {
11038 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011039 "%s arg must be None or str",
11040 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011041 return NULL;
11042 }
11043 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011044
Benjamin Peterson14339b62009-01-31 16:36:08 +000011045 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011046}
11047
11048
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011049PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011050 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011051\n\
11052Return a copy of the string S with leading and trailing\n\
11053whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011054If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011055
11056static PyObject *
11057unicode_strip(PyUnicodeObject *self, PyObject *args)
11058{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011059 if (PyTuple_GET_SIZE(args) == 0)
11060 return do_strip(self, BOTHSTRIP); /* Common case */
11061 else
11062 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011063}
11064
11065
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011066PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011067 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011068\n\
11069Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011070If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011071
11072static PyObject *
11073unicode_lstrip(PyUnicodeObject *self, PyObject *args)
11074{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011075 if (PyTuple_GET_SIZE(args) == 0)
11076 return do_strip(self, LEFTSTRIP); /* Common case */
11077 else
11078 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011079}
11080
11081
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011082PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011083 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011084\n\
11085Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011086If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011087
11088static PyObject *
11089unicode_rstrip(PyUnicodeObject *self, PyObject *args)
11090{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011091 if (PyTuple_GET_SIZE(args) == 0)
11092 return do_strip(self, RIGHTSTRIP); /* Common case */
11093 else
11094 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011095}
11096
11097
Guido van Rossumd57fd912000-03-10 22:53:23 +000011098static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +000011099unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011100{
11101 PyUnicodeObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011102 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011103
Georg Brandl222de0f2009-04-12 12:01:50 +000011104 if (len < 1) {
11105 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020011106 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000011107 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011108
Tim Peters7a29bd52001-09-12 03:03:31 +000011109 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011110 /* no repeat, return original string */
11111 Py_INCREF(str);
11112 return (PyObject*) str;
11113 }
Tim Peters8f422462000-09-09 06:13:41 +000011114
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011115 if (PyUnicode_READY(str) == -1)
11116 return NULL;
11117
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011118 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011119 PyErr_SetString(PyExc_OverflowError,
11120 "repeated string is too long");
11121 return NULL;
11122 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011123 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011124
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011125 u = (PyUnicodeObject *)PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011126 if (!u)
11127 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011128 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011129
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011130 if (PyUnicode_GET_LENGTH(str) == 1) {
11131 const int kind = PyUnicode_KIND(str);
11132 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
11133 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011134 if (kind == PyUnicode_1BYTE_KIND)
11135 memset(to, (unsigned char)fill_char, len);
11136 else {
11137 for (n = 0; n < len; ++n)
11138 PyUnicode_WRITE(kind, to, n, fill_char);
11139 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011140 }
11141 else {
11142 /* number of characters copied this far */
11143 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
11144 const Py_ssize_t char_size = PyUnicode_CHARACTER_SIZE(str);
11145 char *to = (char *) PyUnicode_DATA(u);
11146 Py_MEMCPY(to, PyUnicode_DATA(str),
11147 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011148 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011149 n = (done <= nchars-done) ? done : nchars-done;
11150 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011151 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011152 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011153 }
11154
11155 return (PyObject*) u;
11156}
11157
Alexander Belopolsky40018472011-02-26 01:02:56 +000011158PyObject *
11159PyUnicode_Replace(PyObject *obj,
11160 PyObject *subobj,
11161 PyObject *replobj,
11162 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011163{
11164 PyObject *self;
11165 PyObject *str1;
11166 PyObject *str2;
11167 PyObject *result;
11168
11169 self = PyUnicode_FromObject(obj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011170 if (self == NULL || PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011171 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011172 str1 = PyUnicode_FromObject(subobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011173 if (str1 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011174 Py_DECREF(self);
11175 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011176 }
11177 str2 = PyUnicode_FromObject(replobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011178 if (str2 == NULL || PyUnicode_READY(str2)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011179 Py_DECREF(self);
11180 Py_DECREF(str1);
11181 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011182 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011183 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011184 Py_DECREF(self);
11185 Py_DECREF(str1);
11186 Py_DECREF(str2);
11187 return result;
11188}
11189
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011190PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011191 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011192\n\
11193Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011194old replaced by new. If the optional argument count is\n\
11195given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011196
11197static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011198unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011199{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011200 PyObject *str1;
11201 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011202 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011203 PyObject *result;
11204
Martin v. Löwis18e16552006-02-15 17:27:45 +000011205 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011206 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011207 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011208 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011209 str1 = PyUnicode_FromObject(str1);
11210 if (str1 == NULL || PyUnicode_READY(str1) == -1)
11211 return NULL;
11212 str2 = PyUnicode_FromObject(str2);
Victor Stinnere9a29352011-10-01 02:14:59 +020011213 if (str2 == NULL || PyUnicode_READY(str2) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011214 Py_DECREF(str1);
11215 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011216 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011217
11218 result = replace(self, str1, str2, maxcount);
11219
11220 Py_DECREF(str1);
11221 Py_DECREF(str2);
11222 return result;
11223}
11224
Alexander Belopolsky40018472011-02-26 01:02:56 +000011225static PyObject *
11226unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011227{
Walter Dörwald79e913e2007-05-12 11:08:06 +000011228 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011229 Py_ssize_t isize;
11230 Py_ssize_t osize, squote, dquote, i, o;
11231 Py_UCS4 max, quote;
11232 int ikind, okind;
11233 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000011234
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011235 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000011236 return NULL;
11237
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011238 isize = PyUnicode_GET_LENGTH(unicode);
11239 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011240
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011241 /* Compute length of output, quote characters, and
11242 maximum character */
11243 osize = 2; /* quotes */
11244 max = 127;
11245 squote = dquote = 0;
11246 ikind = PyUnicode_KIND(unicode);
11247 for (i = 0; i < isize; i++) {
11248 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11249 switch (ch) {
11250 case '\'': squote++; osize++; break;
11251 case '"': dquote++; osize++; break;
11252 case '\\': case '\t': case '\r': case '\n':
11253 osize += 2; break;
11254 default:
11255 /* Fast-path ASCII */
11256 if (ch < ' ' || ch == 0x7f)
11257 osize += 4; /* \xHH */
11258 else if (ch < 0x7f)
11259 osize++;
11260 else if (Py_UNICODE_ISPRINTABLE(ch)) {
11261 osize++;
11262 max = ch > max ? ch : max;
11263 }
11264 else if (ch < 0x100)
11265 osize += 4; /* \xHH */
11266 else if (ch < 0x10000)
11267 osize += 6; /* \uHHHH */
11268 else
11269 osize += 10; /* \uHHHHHHHH */
11270 }
11271 }
11272
11273 quote = '\'';
11274 if (squote) {
11275 if (dquote)
11276 /* Both squote and dquote present. Use squote,
11277 and escape them */
11278 osize += squote;
11279 else
11280 quote = '"';
11281 }
11282
11283 repr = PyUnicode_New(osize, max);
11284 if (repr == NULL)
11285 return NULL;
11286 okind = PyUnicode_KIND(repr);
11287 odata = PyUnicode_DATA(repr);
11288
11289 PyUnicode_WRITE(okind, odata, 0, quote);
11290 PyUnicode_WRITE(okind, odata, osize-1, quote);
11291
11292 for (i = 0, o = 1; i < isize; i++) {
11293 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011294
11295 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011296 if ((ch == quote) || (ch == '\\')) {
11297 PyUnicode_WRITE(okind, odata, o++, '\\');
11298 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011299 continue;
11300 }
11301
Benjamin Peterson29060642009-01-31 22:14:21 +000011302 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011303 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011304 PyUnicode_WRITE(okind, odata, o++, '\\');
11305 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011306 }
11307 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011308 PyUnicode_WRITE(okind, odata, o++, '\\');
11309 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011310 }
11311 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011312 PyUnicode_WRITE(okind, odata, o++, '\\');
11313 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011314 }
11315
11316 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011317 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011318 PyUnicode_WRITE(okind, odata, o++, '\\');
11319 PyUnicode_WRITE(okind, odata, o++, 'x');
11320 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
11321 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011322 }
11323
Georg Brandl559e5d72008-06-11 18:37:52 +000011324 /* Copy ASCII characters as-is */
11325 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011326 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011327 }
11328
Benjamin Peterson29060642009-01-31 22:14:21 +000011329 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000011330 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011331 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000011332 (categories Z* and C* except ASCII space)
11333 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011334 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011335 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011336 if (ch <= 0xff) {
11337 PyUnicode_WRITE(okind, odata, o++, '\\');
11338 PyUnicode_WRITE(okind, odata, o++, 'x');
11339 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
11340 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011341 }
11342 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011343 else if (ch >= 0x10000) {
11344 PyUnicode_WRITE(okind, odata, o++, '\\');
11345 PyUnicode_WRITE(okind, odata, o++, 'U');
11346 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 28) & 0xF]);
11347 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 24) & 0xF]);
11348 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 20) & 0xF]);
11349 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 16) & 0xF]);
11350 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
11351 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
11352 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
11353 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011354 }
11355 /* Map 16-bit characters to '\uxxxx' */
11356 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011357 PyUnicode_WRITE(okind, odata, o++, '\\');
11358 PyUnicode_WRITE(okind, odata, o++, 'u');
11359 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
11360 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
11361 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
11362 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011363 }
11364 }
11365 /* Copy characters as-is */
11366 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011367 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011368 }
11369 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000011370 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011371 /* Closing quote already added at the beginning */
Walter Dörwald79e913e2007-05-12 11:08:06 +000011372 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011373}
11374
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011375PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011376 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011377\n\
11378Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011379such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011380arguments start and end are interpreted as in slice notation.\n\
11381\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011382Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011383
11384static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011385unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011386{
Jesus Ceaac451502011-04-20 17:09:23 +020011387 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011388 Py_ssize_t start;
11389 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011390 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011391
Jesus Ceaac451502011-04-20 17:09:23 +020011392 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
11393 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011394 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011395
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011396 if (PyUnicode_READY(self) == -1)
11397 return NULL;
11398 if (PyUnicode_READY(substring) == -1)
11399 return NULL;
11400
11401 result = any_find_slice(
11402 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
11403 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011404 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011405
11406 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011407
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011408 if (result == -2)
11409 return NULL;
11410
Christian Heimes217cfd12007-12-02 14:31:20 +000011411 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011412}
11413
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011414PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011415 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011416\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011417Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011418
11419static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011420unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011421{
Jesus Ceaac451502011-04-20 17:09:23 +020011422 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011423 Py_ssize_t start;
11424 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011425 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011426
Jesus Ceaac451502011-04-20 17:09:23 +020011427 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
11428 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011429 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011430
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011431 if (PyUnicode_READY(self) == -1)
11432 return NULL;
11433 if (PyUnicode_READY(substring) == -1)
11434 return NULL;
11435
11436 result = any_find_slice(
11437 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
11438 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011439 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011440
11441 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011442
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011443 if (result == -2)
11444 return NULL;
11445
Guido van Rossumd57fd912000-03-10 22:53:23 +000011446 if (result < 0) {
11447 PyErr_SetString(PyExc_ValueError, "substring not found");
11448 return NULL;
11449 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011450
Christian Heimes217cfd12007-12-02 14:31:20 +000011451 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011452}
11453
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011454PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011455 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011456\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011457Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011458done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011459
11460static PyObject *
11461unicode_rjust(PyUnicodeObject *self, PyObject *args)
11462{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011463 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011464 Py_UCS4 fillchar = ' ';
11465
Victor Stinnere9a29352011-10-01 02:14:59 +020011466 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011467 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011468
Victor Stinnere9a29352011-10-01 02:14:59 +020011469 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011470 return NULL;
11471
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011472 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011473 Py_INCREF(self);
11474 return (PyObject*) self;
11475 }
11476
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011477 return (PyObject*) pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011478}
11479
Alexander Belopolsky40018472011-02-26 01:02:56 +000011480PyObject *
11481PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011482{
11483 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000011484
Guido van Rossumd57fd912000-03-10 22:53:23 +000011485 s = PyUnicode_FromObject(s);
11486 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011487 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011488 if (sep != NULL) {
11489 sep = PyUnicode_FromObject(sep);
11490 if (sep == NULL) {
11491 Py_DECREF(s);
11492 return NULL;
11493 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011494 }
11495
11496 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
11497
11498 Py_DECREF(s);
11499 Py_XDECREF(sep);
11500 return result;
11501}
11502
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011503PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011504 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011505\n\
11506Return a list of the words in S, using sep as the\n\
11507delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000011508splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000011509whitespace string is a separator and empty strings are\n\
11510removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011511
11512static PyObject*
11513unicode_split(PyUnicodeObject *self, PyObject *args)
11514{
11515 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011516 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011517
Martin v. Löwis18e16552006-02-15 17:27:45 +000011518 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011519 return NULL;
11520
11521 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011522 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011523 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000011524 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011525 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011526 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011527}
11528
Thomas Wouters477c8d52006-05-27 19:21:47 +000011529PyObject *
11530PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
11531{
11532 PyObject* str_obj;
11533 PyObject* sep_obj;
11534 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011535 int kind1, kind2, kind;
11536 void *buf1 = NULL, *buf2 = NULL;
11537 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011538
11539 str_obj = PyUnicode_FromObject(str_in);
Victor Stinnere9a29352011-10-01 02:14:59 +020011540 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011541 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011542 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011543 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000011544 Py_DECREF(str_obj);
11545 return NULL;
11546 }
11547
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011548 kind1 = PyUnicode_KIND(str_in);
11549 kind2 = PyUnicode_KIND(sep_obj);
11550 kind = kind1 > kind2 ? kind1 : kind2;
11551 buf1 = PyUnicode_DATA(str_in);
11552 if (kind1 != kind)
11553 buf1 = _PyUnicode_AsKind(str_in, kind);
11554 if (!buf1)
11555 goto onError;
11556 buf2 = PyUnicode_DATA(sep_obj);
11557 if (kind2 != kind)
11558 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11559 if (!buf2)
11560 goto onError;
11561 len1 = PyUnicode_GET_LENGTH(str_obj);
11562 len2 = PyUnicode_GET_LENGTH(sep_obj);
11563
11564 switch(PyUnicode_KIND(str_in)) {
11565 case PyUnicode_1BYTE_KIND:
11566 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11567 break;
11568 case PyUnicode_2BYTE_KIND:
11569 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11570 break;
11571 case PyUnicode_4BYTE_KIND:
11572 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11573 break;
11574 default:
11575 assert(0);
11576 out = 0;
11577 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011578
11579 Py_DECREF(sep_obj);
11580 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011581 if (kind1 != kind)
11582 PyMem_Free(buf1);
11583 if (kind2 != kind)
11584 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011585
11586 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011587 onError:
11588 Py_DECREF(sep_obj);
11589 Py_DECREF(str_obj);
11590 if (kind1 != kind && buf1)
11591 PyMem_Free(buf1);
11592 if (kind2 != kind && buf2)
11593 PyMem_Free(buf2);
11594 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011595}
11596
11597
11598PyObject *
11599PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
11600{
11601 PyObject* str_obj;
11602 PyObject* sep_obj;
11603 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011604 int kind1, kind2, kind;
11605 void *buf1 = NULL, *buf2 = NULL;
11606 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011607
11608 str_obj = PyUnicode_FromObject(str_in);
11609 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000011610 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011611 sep_obj = PyUnicode_FromObject(sep_in);
11612 if (!sep_obj) {
11613 Py_DECREF(str_obj);
11614 return NULL;
11615 }
11616
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011617 kind1 = PyUnicode_KIND(str_in);
11618 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020011619 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011620 buf1 = PyUnicode_DATA(str_in);
11621 if (kind1 != kind)
11622 buf1 = _PyUnicode_AsKind(str_in, kind);
11623 if (!buf1)
11624 goto onError;
11625 buf2 = PyUnicode_DATA(sep_obj);
11626 if (kind2 != kind)
11627 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11628 if (!buf2)
11629 goto onError;
11630 len1 = PyUnicode_GET_LENGTH(str_obj);
11631 len2 = PyUnicode_GET_LENGTH(sep_obj);
11632
11633 switch(PyUnicode_KIND(str_in)) {
11634 case PyUnicode_1BYTE_KIND:
11635 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11636 break;
11637 case PyUnicode_2BYTE_KIND:
11638 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11639 break;
11640 case PyUnicode_4BYTE_KIND:
11641 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11642 break;
11643 default:
11644 assert(0);
11645 out = 0;
11646 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011647
11648 Py_DECREF(sep_obj);
11649 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011650 if (kind1 != kind)
11651 PyMem_Free(buf1);
11652 if (kind2 != kind)
11653 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011654
11655 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011656 onError:
11657 Py_DECREF(sep_obj);
11658 Py_DECREF(str_obj);
11659 if (kind1 != kind && buf1)
11660 PyMem_Free(buf1);
11661 if (kind2 != kind && buf2)
11662 PyMem_Free(buf2);
11663 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011664}
11665
11666PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011667 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011668\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011669Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011670the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011671found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011672
11673static PyObject*
11674unicode_partition(PyUnicodeObject *self, PyObject *separator)
11675{
11676 return PyUnicode_Partition((PyObject *)self, separator);
11677}
11678
11679PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000011680 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011681\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011682Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011683the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011684separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011685
11686static PyObject*
11687unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
11688{
11689 return PyUnicode_RPartition((PyObject *)self, separator);
11690}
11691
Alexander Belopolsky40018472011-02-26 01:02:56 +000011692PyObject *
11693PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011694{
11695 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011696
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011697 s = PyUnicode_FromObject(s);
11698 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011699 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011700 if (sep != NULL) {
11701 sep = PyUnicode_FromObject(sep);
11702 if (sep == NULL) {
11703 Py_DECREF(s);
11704 return NULL;
11705 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011706 }
11707
11708 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
11709
11710 Py_DECREF(s);
11711 Py_XDECREF(sep);
11712 return result;
11713}
11714
11715PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011716 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011717\n\
11718Return a list of the words in S, using sep as the\n\
11719delimiter string, starting at the end of the string and\n\
11720working to the front. If maxsplit is given, at most maxsplit\n\
11721splits are done. If sep is not specified, any whitespace string\n\
11722is a separator.");
11723
11724static PyObject*
11725unicode_rsplit(PyUnicodeObject *self, PyObject *args)
11726{
11727 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011728 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011729
Martin v. Löwis18e16552006-02-15 17:27:45 +000011730 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011731 return NULL;
11732
11733 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011734 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011735 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000011736 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011737 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011738 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011739}
11740
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011741PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011742 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011743\n\
11744Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000011745Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011746is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011747
11748static PyObject*
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011749unicode_splitlines(PyUnicodeObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011750{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011751 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000011752 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011753
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011754 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
11755 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011756 return NULL;
11757
Guido van Rossum86662912000-04-11 15:38:46 +000011758 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011759}
11760
11761static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000011762PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011763{
Walter Dörwald346737f2007-05-31 10:44:43 +000011764 if (PyUnicode_CheckExact(self)) {
11765 Py_INCREF(self);
11766 return self;
11767 } else
11768 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinner034f6cf2011-09-30 02:26:44 +020011769 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011770}
11771
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011772PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011773 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011774\n\
11775Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011776and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011777
11778static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011779unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011780{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011781 return fixup(self, fixswapcase);
11782}
11783
Georg Brandlceee0772007-11-27 23:48:05 +000011784PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011785 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011786\n\
11787Return a translation table usable for str.translate().\n\
11788If there is only one argument, it must be a dictionary mapping Unicode\n\
11789ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011790Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011791If there are two arguments, they must be strings of equal length, and\n\
11792in the resulting dictionary, each character in x will be mapped to the\n\
11793character at the same position in y. If there is a third argument, it\n\
11794must be a string, whose characters will be mapped to None in the result.");
11795
11796static PyObject*
11797unicode_maketrans(PyUnicodeObject *null, PyObject *args)
11798{
11799 PyObject *x, *y = NULL, *z = NULL;
11800 PyObject *new = NULL, *key, *value;
11801 Py_ssize_t i = 0;
11802 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011803
Georg Brandlceee0772007-11-27 23:48:05 +000011804 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
11805 return NULL;
11806 new = PyDict_New();
11807 if (!new)
11808 return NULL;
11809 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011810 int x_kind, y_kind, z_kind;
11811 void *x_data, *y_data, *z_data;
11812
Georg Brandlceee0772007-11-27 23:48:05 +000011813 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000011814 if (!PyUnicode_Check(x)) {
11815 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
11816 "be a string if there is a second argument");
11817 goto err;
11818 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011819 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011820 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
11821 "arguments must have equal length");
11822 goto err;
11823 }
11824 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011825 x_kind = PyUnicode_KIND(x);
11826 y_kind = PyUnicode_KIND(y);
11827 x_data = PyUnicode_DATA(x);
11828 y_data = PyUnicode_DATA(y);
11829 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
11830 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
11831 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011832 if (!key || !value)
11833 goto err;
11834 res = PyDict_SetItem(new, key, value);
11835 Py_DECREF(key);
11836 Py_DECREF(value);
11837 if (res < 0)
11838 goto err;
11839 }
11840 /* create entries for deleting chars in z */
11841 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011842 z_kind = PyUnicode_KIND(z);
11843 z_data = PyUnicode_DATA(z);
Georg Brandlceee0772007-11-27 23:48:05 +000011844 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011845 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011846 if (!key)
11847 goto err;
11848 res = PyDict_SetItem(new, key, Py_None);
11849 Py_DECREF(key);
11850 if (res < 0)
11851 goto err;
11852 }
11853 }
11854 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011855 int kind;
11856 void *data;
11857
Georg Brandlceee0772007-11-27 23:48:05 +000011858 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000011859 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011860 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
11861 "to maketrans it must be a dict");
11862 goto err;
11863 }
11864 /* copy entries into the new dict, converting string keys to int keys */
11865 while (PyDict_Next(x, &i, &key, &value)) {
11866 if (PyUnicode_Check(key)) {
11867 /* convert string keys to integer keys */
11868 PyObject *newkey;
11869 if (PyUnicode_GET_SIZE(key) != 1) {
11870 PyErr_SetString(PyExc_ValueError, "string keys in translate "
11871 "table must be of length 1");
11872 goto err;
11873 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011874 kind = PyUnicode_KIND(key);
11875 data = PyUnicode_DATA(key);
11876 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000011877 if (!newkey)
11878 goto err;
11879 res = PyDict_SetItem(new, newkey, value);
11880 Py_DECREF(newkey);
11881 if (res < 0)
11882 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000011883 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011884 /* just keep integer keys */
11885 if (PyDict_SetItem(new, key, value) < 0)
11886 goto err;
11887 } else {
11888 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
11889 "be strings or integers");
11890 goto err;
11891 }
11892 }
11893 }
11894 return new;
11895 err:
11896 Py_DECREF(new);
11897 return NULL;
11898}
11899
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011900PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011901 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011902\n\
11903Return a copy of the string S, where all characters have been mapped\n\
11904through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011905Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000011906Unmapped characters are left untouched. Characters mapped to None\n\
11907are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011908
11909static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011910unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011911{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011912 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011913}
11914
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011915PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011916 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011917\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011918Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011919
11920static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011921unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011922{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011923 return fixup(self, fixupper);
11924}
11925
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011926PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011927 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011928\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000011929Pad a numeric string S with zeros on the left, to fill a field\n\
11930of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011931
11932static PyObject *
11933unicode_zfill(PyUnicodeObject *self, PyObject *args)
11934{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011935 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011936 PyUnicodeObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011937 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011938 int kind;
11939 void *data;
11940 Py_UCS4 chr;
11941
11942 if (PyUnicode_READY(self) == -1)
11943 return NULL;
11944
Martin v. Löwis18e16552006-02-15 17:27:45 +000011945 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011946 return NULL;
11947
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011948 if (PyUnicode_GET_LENGTH(self) >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +000011949 if (PyUnicode_CheckExact(self)) {
11950 Py_INCREF(self);
11951 return (PyObject*) self;
11952 }
11953 else
Victor Stinner2219e0a2011-10-01 01:16:59 +020011954 return PyUnicode_Copy((PyObject*)self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011955 }
11956
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011957 fill = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011958
11959 u = pad(self, fill, 0, '0');
11960
Walter Dörwald068325e2002-04-15 13:36:47 +000011961 if (u == NULL)
11962 return NULL;
11963
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011964 kind = PyUnicode_KIND(u);
11965 data = PyUnicode_DATA(u);
11966 chr = PyUnicode_READ(kind, data, fill);
11967
11968 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011969 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011970 PyUnicode_WRITE(kind, data, 0, chr);
11971 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000011972 }
11973
11974 return (PyObject*) u;
11975}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011976
#if 0
/* Compiled out.  Thin wrapper around
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *ascii = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return ascii;
}
#endif
11984
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011985PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011986 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011987\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000011988Return True if S starts with the specified prefix, False otherwise.\n\
11989With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011990With optional end, stop comparing S at that position.\n\
11991prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011992
11993static PyObject *
11994unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000011995 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011996{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011997 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011998 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011999 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012000 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012001 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012002
Jesus Ceaac451502011-04-20 17:09:23 +020012003 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012004 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012005 if (PyTuple_Check(subobj)) {
12006 Py_ssize_t i;
12007 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12008 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012009 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012010 if (substring == NULL)
12011 return NULL;
12012 result = tailmatch(self, substring, start, end, -1);
12013 Py_DECREF(substring);
12014 if (result) {
12015 Py_RETURN_TRUE;
12016 }
12017 }
12018 /* nothing matched */
12019 Py_RETURN_FALSE;
12020 }
12021 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012022 if (substring == NULL) {
12023 if (PyErr_ExceptionMatches(PyExc_TypeError))
12024 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12025 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012026 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012027 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012028 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012029 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012030 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012031}
12032
12033
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012034PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012035 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012036\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012037Return True if S ends with the specified suffix, False otherwise.\n\
12038With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012039With optional end, stop comparing S at that position.\n\
12040suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012041
12042static PyObject *
12043unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012044 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012045{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012046 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012047 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012048 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012049 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012050 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012051
Jesus Ceaac451502011-04-20 17:09:23 +020012052 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012053 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012054 if (PyTuple_Check(subobj)) {
12055 Py_ssize_t i;
12056 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12057 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012058 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012059 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012060 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012061 result = tailmatch(self, substring, start, end, +1);
12062 Py_DECREF(substring);
12063 if (result) {
12064 Py_RETURN_TRUE;
12065 }
12066 }
12067 Py_RETURN_FALSE;
12068 }
12069 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012070 if (substring == NULL) {
12071 if (PyErr_ExceptionMatches(PyExc_TypeError))
12072 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12073 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012074 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012075 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012076 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012077 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012078 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012079}
12080
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012081#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000012082
12083PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012084 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012085\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012086Return a formatted version of S, using substitutions from args and kwargs.\n\
12087The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000012088
Eric Smith27bbca62010-11-04 17:06:58 +000012089PyDoc_STRVAR(format_map__doc__,
12090 "S.format_map(mapping) -> str\n\
12091\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012092Return a formatted version of S, using substitutions from mapping.\n\
12093The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000012094
Eric Smith4a7d76d2008-05-30 18:10:19 +000012095static PyObject *
12096unicode__format__(PyObject* self, PyObject* args)
12097{
12098 PyObject *format_spec;
12099
12100 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12101 return NULL;
12102
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012103 return _PyUnicode_FormatAdvanced(self, format_spec, 0,
12104 PyUnicode_GET_LENGTH(format_spec));
Eric Smith4a7d76d2008-05-30 18:10:19 +000012105}
12106
Eric Smith8c663262007-08-25 02:26:07 +000012107PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012108 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012109\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012110Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000012111
12112static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012113unicode__sizeof__(PyUnicodeObject *v)
12114{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012115 Py_ssize_t size;
12116
12117 /* If it's a compact object, account for base structure +
12118 character data. */
12119 if (PyUnicode_IS_COMPACT_ASCII(v))
12120 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12121 else if (PyUnicode_IS_COMPACT(v))
12122 size = sizeof(PyCompactUnicodeObject) +
12123 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_CHARACTER_SIZE(v);
12124 else {
12125 /* If it is a two-block object, account for base object, and
12126 for character block if present. */
12127 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020012128 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012129 size += (PyUnicode_GET_LENGTH(v) + 1) *
12130 PyUnicode_CHARACTER_SIZE(v);
12131 }
12132 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020012133 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020012134 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012135 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020012136 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020012137 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012138
12139 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012140}
12141
12142PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012143 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012144
12145static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020012146unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012147{
Victor Stinner034f6cf2011-09-30 02:26:44 +020012148 PyObject *copy = PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012149 if (!copy)
12150 return NULL;
12151 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012152}
12153
Guido van Rossumd57fd912000-03-10 22:53:23 +000012154static PyMethodDef unicode_methods[] = {
12155
12156 /* Order is according to common usage: often used methods should
12157 appear first, since lookup is done sequentially. */
12158
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000012159 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012160 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
12161 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012162 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012163 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
12164 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
12165 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
12166 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
12167 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
12168 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
12169 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012170 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012171 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
12172 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
12173 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012174 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012175 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
12176 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
12177 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012178 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012179 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012180 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012181 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012182 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
12183 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
12184 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
12185 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
12186 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
12187 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
12188 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
12189 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
12190 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
12191 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
12192 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
12193 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
12194 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
12195 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000012196 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000012197 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012198 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000012199 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000012200 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000012201 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000012202 {"maketrans", (PyCFunction) unicode_maketrans,
12203 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012204 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000012205#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012206 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012207#endif
12208
12209#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012210 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012211 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012212#endif
12213
Benjamin Peterson14339b62009-01-31 16:36:08 +000012214 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012215 {NULL, NULL}
12216};
12217
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012218static PyObject *
12219unicode_mod(PyObject *v, PyObject *w)
12220{
Brian Curtindfc80e32011-08-10 20:28:54 -050012221 if (!PyUnicode_Check(v))
12222 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000012223 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012224}
12225
12226static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012227 0, /*nb_add*/
12228 0, /*nb_subtract*/
12229 0, /*nb_multiply*/
12230 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012231};
12232
Guido van Rossumd57fd912000-03-10 22:53:23 +000012233static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012234 (lenfunc) unicode_length, /* sq_length */
12235 PyUnicode_Concat, /* sq_concat */
12236 (ssizeargfunc) unicode_repeat, /* sq_repeat */
12237 (ssizeargfunc) unicode_getitem, /* sq_item */
12238 0, /* sq_slice */
12239 0, /* sq_ass_item */
12240 0, /* sq_ass_slice */
12241 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012242};
12243
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012244static PyObject*
12245unicode_subscript(PyUnicodeObject* self, PyObject* item)
12246{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012247 if (PyUnicode_READY(self) == -1)
12248 return NULL;
12249
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000012250 if (PyIndex_Check(item)) {
12251 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012252 if (i == -1 && PyErr_Occurred())
12253 return NULL;
12254 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012255 i += PyUnicode_GET_LENGTH(self);
Victor Stinner2fe5ced2011-10-02 00:25:40 +020012256 return unicode_getitem((PyObject*)self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012257 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000012258 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012259 PyObject *result;
12260 void *src_data, *dest_data;
12261 int kind;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012262
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012263 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000012264 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012265 return NULL;
12266 }
12267
12268 if (slicelength <= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012269 return PyUnicode_New(0, 0);
12270 } else if (start == 0 && step == 1 &&
12271 slicelength == PyUnicode_GET_LENGTH(self) &&
Thomas Woutersed03b412007-08-28 21:37:11 +000012272 PyUnicode_CheckExact(self)) {
12273 Py_INCREF(self);
12274 return (PyObject *)self;
12275 } else if (step == 1) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012276 return PyUnicode_Substring((PyObject*)self,
12277 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012278 }
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012279 /* General (less optimized) case */
12280 result = PyUnicode_New(slicelength, PyUnicode_MAX_CHAR_VALUE(self));
12281 if (result == NULL)
12282 return NULL;
12283 kind = PyUnicode_KIND(self);
12284 src_data = PyUnicode_DATA(self);
12285 dest_data = PyUnicode_DATA(result);
12286
12287 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
12288 Py_UCS4 ch = PyUnicode_READ(kind, src_data, cur);
12289 PyUnicode_WRITE(kind, dest_data, i, ch);
12290 }
12291 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012292 } else {
12293 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
12294 return NULL;
12295 }
12296}
12297
12298static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012299 (lenfunc)unicode_length, /* mp_length */
12300 (binaryfunc)unicode_subscript, /* mp_subscript */
12301 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012302};
12303
Guido van Rossumd57fd912000-03-10 22:53:23 +000012304
Guido van Rossumd57fd912000-03-10 22:53:23 +000012305/* Helpers for PyUnicode_Format() */
12306
12307static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000012308getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012309{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012310 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012311 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012312 (*p_argidx)++;
12313 if (arglen < 0)
12314 return args;
12315 else
12316 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012317 }
12318 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012319 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012320 return NULL;
12321}
12322
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012323/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012324
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012325static PyObject *
12326formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012327{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012328 char *p;
12329 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012330 double x;
Tim Petersced69f82003-09-16 20:30:58 +000012331
Guido van Rossumd57fd912000-03-10 22:53:23 +000012332 x = PyFloat_AsDouble(v);
12333 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012334 return NULL;
12335
Guido van Rossumd57fd912000-03-10 22:53:23 +000012336 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012337 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000012338
Eric Smith0923d1d2009-04-16 20:16:10 +000012339 p = PyOS_double_to_string(x, type, prec,
12340 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012341 if (p == NULL)
12342 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012343 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000012344 PyMem_Free(p);
12345 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012346}
12347
Tim Peters38fd5b62000-09-21 05:43:11 +000012348static PyObject*
12349formatlong(PyObject *val, int flags, int prec, int type)
12350{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012351 char *buf;
12352 int len;
12353 PyObject *str; /* temporary string object. */
12354 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000012355
Benjamin Peterson14339b62009-01-31 16:36:08 +000012356 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
12357 if (!str)
12358 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012359 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012360 Py_DECREF(str);
12361 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000012362}
12363
Guido van Rossumd57fd912000-03-10 22:53:23 +000012364static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012365formatchar(Py_UCS4 *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012366 size_t buflen,
12367 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012368{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000012369 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012370 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012371 if (PyUnicode_GET_LENGTH(v) == 1) {
12372 buf[0] = PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000012373 buf[1] = '\0';
12374 return 1;
12375 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012376 goto onError;
12377 }
12378 else {
12379 /* Integer input truncated to a character */
12380 long x;
12381 x = PyLong_AsLong(v);
12382 if (x == -1 && PyErr_Occurred())
12383 goto onError;
12384
12385 if (x < 0 || x > 0x10ffff) {
12386 PyErr_SetString(PyExc_OverflowError,
12387 "%c arg not in range(0x110000)");
12388 return -1;
12389 }
12390
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012391 buf[0] = (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012392 buf[1] = '\0';
12393 return 1;
12394 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000012395
Benjamin Peterson29060642009-01-31 22:14:21 +000012396 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012397 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012398 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012399 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012400}
12401
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012402/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012403 FORMATBUFLEN is the length of the buffer in which chars are formatted.
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012404*/
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012405#define FORMATBUFLEN (size_t)10
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012406
Alexander Belopolsky40018472011-02-26 01:02:56 +000012407PyObject *
12408PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012409{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012410 void *fmt;
12411 int fmtkind;
12412 PyObject *result;
12413 Py_UCS4 *res, *res0;
12414 Py_UCS4 max;
12415 int kind;
12416 Py_ssize_t fmtcnt, fmtpos, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012417 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012418 PyObject *dict = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012419 PyUnicodeObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +000012420
Guido van Rossumd57fd912000-03-10 22:53:23 +000012421 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012422 PyErr_BadInternalCall();
12423 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012424 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012425 uformat = (PyUnicodeObject*)PyUnicode_FromObject(format);
12426 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012427 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012428 fmt = PyUnicode_DATA(uformat);
12429 fmtkind = PyUnicode_KIND(uformat);
12430 fmtcnt = PyUnicode_GET_LENGTH(uformat);
12431 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012432
12433 reslen = rescnt = fmtcnt + 100;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012434 res = res0 = PyMem_Malloc(reslen * sizeof(Py_UCS4));
12435 if (res0 == NULL) {
12436 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012437 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012438 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012439
12440 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012441 arglen = PyTuple_Size(args);
12442 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012443 }
12444 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012445 arglen = -1;
12446 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012447 }
Christian Heimes90aa7642007-12-19 02:45:37 +000012448 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000012449 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000012450 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012451
12452 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012453 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012454 if (--rescnt < 0) {
12455 rescnt = fmtcnt + 100;
12456 reslen += rescnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012457 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12458 if (res0 == NULL){
12459 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012460 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012461 }
12462 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000012463 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012464 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012465 *res++ = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012466 }
12467 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012468 /* Got a format specifier */
12469 int flags = 0;
12470 Py_ssize_t width = -1;
12471 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012472 Py_UCS4 c = '\0';
12473 Py_UCS4 fill;
Benjamin Peterson29060642009-01-31 22:14:21 +000012474 int isnumok;
12475 PyObject *v = NULL;
12476 PyObject *temp = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012477 void *pbuf;
12478 Py_ssize_t pindex;
Benjamin Peterson29060642009-01-31 22:14:21 +000012479 Py_UNICODE sign;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012480 Py_ssize_t len, len1;
12481 Py_UCS4 formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012482
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012483 fmtpos++;
12484 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
12485 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000012486 Py_ssize_t keylen;
12487 PyObject *key;
12488 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000012489
Benjamin Peterson29060642009-01-31 22:14:21 +000012490 if (dict == NULL) {
12491 PyErr_SetString(PyExc_TypeError,
12492 "format requires a mapping");
12493 goto onError;
12494 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012495 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012496 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012497 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012498 /* Skip over balanced parentheses */
12499 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012500 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000012501 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012502 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000012503 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012504 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000012505 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012506 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000012507 if (fmtcnt < 0 || pcount > 0) {
12508 PyErr_SetString(PyExc_ValueError,
12509 "incomplete format key");
12510 goto onError;
12511 }
Victor Stinner12bab6d2011-10-01 01:53:49 +020012512 key = PyUnicode_Substring((PyObject*)uformat,
12513 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000012514 if (key == NULL)
12515 goto onError;
12516 if (args_owned) {
12517 Py_DECREF(args);
12518 args_owned = 0;
12519 }
12520 args = PyObject_GetItem(dict, key);
12521 Py_DECREF(key);
12522 if (args == NULL) {
12523 goto onError;
12524 }
12525 args_owned = 1;
12526 arglen = -1;
12527 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012528 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012529 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012530 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012531 case '-': flags |= F_LJUST; continue;
12532 case '+': flags |= F_SIGN; continue;
12533 case ' ': flags |= F_BLANK; continue;
12534 case '#': flags |= F_ALT; continue;
12535 case '0': flags |= F_ZERO; continue;
12536 }
12537 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012538 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012539 if (c == '*') {
12540 v = getnextarg(args, arglen, &argidx);
12541 if (v == NULL)
12542 goto onError;
12543 if (!PyLong_Check(v)) {
12544 PyErr_SetString(PyExc_TypeError,
12545 "* wants int");
12546 goto onError;
12547 }
12548 width = PyLong_AsLong(v);
12549 if (width == -1 && PyErr_Occurred())
12550 goto onError;
12551 if (width < 0) {
12552 flags |= F_LJUST;
12553 width = -width;
12554 }
12555 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012556 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012557 }
12558 else if (c >= '0' && c <= '9') {
12559 width = c - '0';
12560 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012561 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012562 if (c < '0' || c > '9')
12563 break;
12564 if ((width*10) / 10 != width) {
12565 PyErr_SetString(PyExc_ValueError,
12566 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000012567 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000012568 }
12569 width = width*10 + (c - '0');
12570 }
12571 }
12572 if (c == '.') {
12573 prec = 0;
12574 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012575 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012576 if (c == '*') {
12577 v = getnextarg(args, arglen, &argidx);
12578 if (v == NULL)
12579 goto onError;
12580 if (!PyLong_Check(v)) {
12581 PyErr_SetString(PyExc_TypeError,
12582 "* wants int");
12583 goto onError;
12584 }
12585 prec = PyLong_AsLong(v);
12586 if (prec == -1 && PyErr_Occurred())
12587 goto onError;
12588 if (prec < 0)
12589 prec = 0;
12590 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012591 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012592 }
12593 else if (c >= '0' && c <= '9') {
12594 prec = c - '0';
12595 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012596 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012597 if (c < '0' || c > '9')
12598 break;
12599 if ((prec*10) / 10 != prec) {
12600 PyErr_SetString(PyExc_ValueError,
12601 "prec too big");
12602 goto onError;
12603 }
12604 prec = prec*10 + (c - '0');
12605 }
12606 }
12607 } /* prec */
12608 if (fmtcnt >= 0) {
12609 if (c == 'h' || c == 'l' || c == 'L') {
12610 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012611 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012612 }
12613 }
12614 if (fmtcnt < 0) {
12615 PyErr_SetString(PyExc_ValueError,
12616 "incomplete format");
12617 goto onError;
12618 }
12619 if (c != '%') {
12620 v = getnextarg(args, arglen, &argidx);
12621 if (v == NULL)
12622 goto onError;
12623 }
12624 sign = 0;
12625 fill = ' ';
12626 switch (c) {
12627
12628 case '%':
12629 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012630 kind = PyUnicode_4BYTE_KIND;
Benjamin Peterson29060642009-01-31 22:14:21 +000012631 /* presume that buffer length is at least 1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012632 PyUnicode_WRITE(kind, pbuf, 0, '%');
Benjamin Peterson29060642009-01-31 22:14:21 +000012633 len = 1;
12634 break;
12635
12636 case 's':
12637 case 'r':
12638 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000012639 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012640 temp = v;
12641 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012642 }
12643 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012644 if (c == 's')
12645 temp = PyObject_Str(v);
12646 else if (c == 'r')
12647 temp = PyObject_Repr(v);
12648 else
12649 temp = PyObject_ASCII(v);
12650 if (temp == NULL)
12651 goto onError;
12652 if (PyUnicode_Check(temp))
12653 /* nothing to do */;
12654 else {
12655 Py_DECREF(temp);
12656 PyErr_SetString(PyExc_TypeError,
12657 "%s argument has non-string str()");
12658 goto onError;
12659 }
12660 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012661 if (PyUnicode_READY(temp) == -1) {
12662 Py_CLEAR(temp);
12663 goto onError;
12664 }
12665 pbuf = PyUnicode_DATA(temp);
12666 kind = PyUnicode_KIND(temp);
12667 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012668 if (prec >= 0 && len > prec)
12669 len = prec;
12670 break;
12671
12672 case 'i':
12673 case 'd':
12674 case 'u':
12675 case 'o':
12676 case 'x':
12677 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000012678 isnumok = 0;
12679 if (PyNumber_Check(v)) {
12680 PyObject *iobj=NULL;
12681
12682 if (PyLong_Check(v)) {
12683 iobj = v;
12684 Py_INCREF(iobj);
12685 }
12686 else {
12687 iobj = PyNumber_Long(v);
12688 }
12689 if (iobj!=NULL) {
12690 if (PyLong_Check(iobj)) {
12691 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070012692 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000012693 Py_DECREF(iobj);
12694 if (!temp)
12695 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012696 if (PyUnicode_READY(temp) == -1) {
12697 Py_CLEAR(temp);
12698 goto onError;
12699 }
12700 pbuf = PyUnicode_DATA(temp);
12701 kind = PyUnicode_KIND(temp);
12702 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012703 sign = 1;
12704 }
12705 else {
12706 Py_DECREF(iobj);
12707 }
12708 }
12709 }
12710 if (!isnumok) {
12711 PyErr_Format(PyExc_TypeError,
12712 "%%%c format: a number is required, "
12713 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
12714 goto onError;
12715 }
12716 if (flags & F_ZERO)
12717 fill = '0';
12718 break;
12719
12720 case 'e':
12721 case 'E':
12722 case 'f':
12723 case 'F':
12724 case 'g':
12725 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012726 temp = formatfloat(v, flags, prec, c);
12727 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000012728 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012729 if (PyUnicode_READY(temp) == -1) {
12730 Py_CLEAR(temp);
12731 goto onError;
12732 }
12733 pbuf = PyUnicode_DATA(temp);
12734 kind = PyUnicode_KIND(temp);
12735 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012736 sign = 1;
12737 if (flags & F_ZERO)
12738 fill = '0';
12739 break;
12740
12741 case 'c':
12742 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012743 kind = PyUnicode_4BYTE_KIND;
Victor Stinnerb9dcffb2011-09-29 00:39:24 +020012744 len = formatchar(pbuf, Py_ARRAY_LENGTH(formatbuf), v);
Benjamin Peterson29060642009-01-31 22:14:21 +000012745 if (len < 0)
12746 goto onError;
12747 break;
12748
12749 default:
12750 PyErr_Format(PyExc_ValueError,
12751 "unsupported format character '%c' (0x%x) "
12752 "at index %zd",
12753 (31<=c && c<=126) ? (char)c : '?',
12754 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012755 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000012756 goto onError;
12757 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012758 /* pbuf is initialized here. */
12759 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000012760 if (sign) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012761 if (PyUnicode_READ(kind, pbuf, pindex) == '-' ||
12762 PyUnicode_READ(kind, pbuf, pindex) == '+') {
12763 sign = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012764 len--;
12765 }
12766 else if (flags & F_SIGN)
12767 sign = '+';
12768 else if (flags & F_BLANK)
12769 sign = ' ';
12770 else
12771 sign = 0;
12772 }
12773 if (width < len)
12774 width = len;
12775 if (rescnt - (sign != 0) < width) {
12776 reslen -= rescnt;
12777 rescnt = width + fmtcnt + 100;
12778 reslen += rescnt;
12779 if (reslen < 0) {
12780 Py_XDECREF(temp);
12781 PyErr_NoMemory();
12782 goto onError;
12783 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012784 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12785 if (res0 == 0) {
12786 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012787 Py_XDECREF(temp);
12788 goto onError;
12789 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012790 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000012791 }
12792 if (sign) {
12793 if (fill != ' ')
12794 *res++ = sign;
12795 rescnt--;
12796 if (width > len)
12797 width--;
12798 }
12799 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012800 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12801 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000012802 if (fill != ' ') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012803 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12804 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012805 }
12806 rescnt -= 2;
12807 width -= 2;
12808 if (width < 0)
12809 width = 0;
12810 len -= 2;
12811 }
12812 if (width > len && !(flags & F_LJUST)) {
12813 do {
12814 --rescnt;
12815 *res++ = fill;
12816 } while (--width > len);
12817 }
12818 if (fill == ' ') {
12819 if (sign)
12820 *res++ = sign;
12821 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012822 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12823 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
12824 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12825 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012826 }
12827 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012828 /* Copy all characters, preserving len */
12829 len1 = len;
12830 while (len1--) {
12831 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12832 rescnt--;
12833 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012834 while (--width >= len) {
12835 --rescnt;
12836 *res++ = ' ';
12837 }
12838 if (dict && (argidx < arglen) && c != '%') {
12839 PyErr_SetString(PyExc_TypeError,
12840 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +000012841 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012842 goto onError;
12843 }
12844 Py_XDECREF(temp);
12845 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012846 } /* until end */
12847 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012848 PyErr_SetString(PyExc_TypeError,
12849 "not all arguments converted during string formatting");
12850 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012851 }
12852
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012853
12854 for (max=0, res = res0; res < res0+reslen-rescnt; res++)
12855 if (*res > max)
12856 max = *res;
12857 result = PyUnicode_New(reslen - rescnt, max);
12858 if (!result)
Benjamin Peterson29060642009-01-31 22:14:21 +000012859 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012860 kind = PyUnicode_KIND(result);
12861 for (res = res0; res < res0+reslen-rescnt; res++)
12862 PyUnicode_WRITE(kind, PyUnicode_DATA(result), res-res0, *res);
12863 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012864 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012865 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012866 }
12867 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012868 return (PyObject *)result;
12869
Benjamin Peterson29060642009-01-31 22:14:21 +000012870 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012871 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012872 Py_DECREF(uformat);
12873 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012874 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012875 }
12876 return NULL;
12877}
12878
Jeremy Hylton938ace62002-07-17 16:30:39 +000012879static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000012880unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
12881
Tim Peters6d6c1a32001-08-02 04:15:00 +000012882static PyObject *
12883unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12884{
Benjamin Peterson29060642009-01-31 22:14:21 +000012885 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012886 static char *kwlist[] = {"object", "encoding", "errors", 0};
12887 char *encoding = NULL;
12888 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000012889
Benjamin Peterson14339b62009-01-31 16:36:08 +000012890 if (type != &PyUnicode_Type)
12891 return unicode_subtype_new(type, args, kwds);
12892 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000012893 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012894 return NULL;
12895 if (x == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012896 return (PyObject *)PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012897 if (encoding == NULL && errors == NULL)
12898 return PyObject_Str(x);
12899 else
Benjamin Peterson29060642009-01-31 22:14:21 +000012900 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000012901}
12902
Guido van Rossume023fe02001-08-30 03:12:59 +000012903static PyObject *
12904unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12905{
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012906 PyUnicodeObject *unicode, *self;
12907 Py_ssize_t length, char_size;
12908 int share_wstr, share_utf8;
12909 unsigned int kind;
12910 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000012911
Benjamin Peterson14339b62009-01-31 16:36:08 +000012912 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012913
12914 unicode = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
12915 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012916 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020012917 assert(_PyUnicode_CHECK(unicode));
Victor Stinner1b4f9ce2011-10-03 13:28:14 +020012918 if (_PyUnicode_READY_REPLACE(&unicode))
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012919 return NULL;
12920
12921 self = (PyUnicodeObject *) type->tp_alloc(type, 0);
12922 if (self == NULL) {
12923 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012924 return NULL;
12925 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012926 kind = PyUnicode_KIND(unicode);
12927 length = PyUnicode_GET_LENGTH(unicode);
12928
12929 _PyUnicode_LENGTH(self) = length;
12930 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
12931 _PyUnicode_STATE(self).interned = 0;
12932 _PyUnicode_STATE(self).kind = kind;
12933 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020012934 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012935 _PyUnicode_STATE(self).ready = 1;
12936 _PyUnicode_WSTR(self) = NULL;
12937 _PyUnicode_UTF8_LENGTH(self) = 0;
12938 _PyUnicode_UTF8(self) = NULL;
12939 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020012940 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012941
12942 share_utf8 = 0;
12943 share_wstr = 0;
12944 if (kind == PyUnicode_1BYTE_KIND) {
12945 char_size = 1;
12946 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
12947 share_utf8 = 1;
12948 }
12949 else if (kind == PyUnicode_2BYTE_KIND) {
12950 char_size = 2;
12951 if (sizeof(wchar_t) == 2)
12952 share_wstr = 1;
12953 }
12954 else {
12955 assert(kind == PyUnicode_4BYTE_KIND);
12956 char_size = 4;
12957 if (sizeof(wchar_t) == 4)
12958 share_wstr = 1;
12959 }
12960
12961 /* Ensure we won't overflow the length. */
12962 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
12963 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012964 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012965 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012966 data = PyObject_MALLOC((length + 1) * char_size);
12967 if (data == NULL) {
12968 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012969 goto onError;
12970 }
12971
Victor Stinnerc3c74152011-10-02 20:39:55 +020012972 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012973 if (share_utf8) {
12974 _PyUnicode_UTF8_LENGTH(self) = length;
12975 _PyUnicode_UTF8(self) = data;
12976 }
12977 if (share_wstr) {
12978 _PyUnicode_WSTR_LENGTH(self) = length;
12979 _PyUnicode_WSTR(self) = (wchar_t *)data;
12980 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012981
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012982 Py_MEMCPY(data, PyUnicode_DATA(unicode),
12983 PyUnicode_KIND_SIZE(kind, length + 1));
12984 Py_DECREF(unicode);
12985 return (PyObject *)self;
12986
12987onError:
12988 Py_DECREF(unicode);
12989 Py_DECREF(self);
12990 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000012991}
12992
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012993PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000012994 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000012995\n\
Collin Winterd474ce82007-08-07 19:42:11 +000012996Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000012997encoding defaults to the current default string encoding.\n\
12998errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000012999
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013000static PyObject *unicode_iter(PyObject *seq);
13001
Guido van Rossumd57fd912000-03-10 22:53:23 +000013002PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000013003 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013004 "str", /* tp_name */
13005 sizeof(PyUnicodeObject), /* tp_size */
13006 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013007 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013008 (destructor)unicode_dealloc, /* tp_dealloc */
13009 0, /* tp_print */
13010 0, /* tp_getattr */
13011 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000013012 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013013 unicode_repr, /* tp_repr */
13014 &unicode_as_number, /* tp_as_number */
13015 &unicode_as_sequence, /* tp_as_sequence */
13016 &unicode_as_mapping, /* tp_as_mapping */
13017 (hashfunc) unicode_hash, /* tp_hash*/
13018 0, /* tp_call*/
13019 (reprfunc) unicode_str, /* tp_str */
13020 PyObject_GenericGetAttr, /* tp_getattro */
13021 0, /* tp_setattro */
13022 0, /* tp_as_buffer */
13023 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000013024 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013025 unicode_doc, /* tp_doc */
13026 0, /* tp_traverse */
13027 0, /* tp_clear */
13028 PyUnicode_RichCompare, /* tp_richcompare */
13029 0, /* tp_weaklistoffset */
13030 unicode_iter, /* tp_iter */
13031 0, /* tp_iternext */
13032 unicode_methods, /* tp_methods */
13033 0, /* tp_members */
13034 0, /* tp_getset */
13035 &PyBaseObject_Type, /* tp_base */
13036 0, /* tp_dict */
13037 0, /* tp_descr_get */
13038 0, /* tp_descr_set */
13039 0, /* tp_dictoffset */
13040 0, /* tp_init */
13041 0, /* tp_alloc */
13042 unicode_new, /* tp_new */
13043 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013044};
13045
13046/* Initialize the Unicode implementation */
13047
Thomas Wouters78890102000-07-22 19:25:51 +000013048void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013049{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013050 int i;
13051
Thomas Wouters477c8d52006-05-27 19:21:47 +000013052 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013053 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000013054 0x000A, /* LINE FEED */
13055 0x000D, /* CARRIAGE RETURN */
13056 0x001C, /* FILE SEPARATOR */
13057 0x001D, /* GROUP SEPARATOR */
13058 0x001E, /* RECORD SEPARATOR */
13059 0x0085, /* NEXT LINE */
13060 0x2028, /* LINE SEPARATOR */
13061 0x2029, /* PARAGRAPH SEPARATOR */
13062 };
13063
Fred Drakee4315f52000-05-09 19:53:39 +000013064 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020013065 unicode_empty = PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013066 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013067 Py_FatalError("Can't create empty string");
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013068
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013069 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000013070 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000013071 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013072 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000013073
13074 /* initialize the linebreak bloom filter */
13075 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013076 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020013077 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013078
13079 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013080}
13081
13082/* Finalize the Unicode implementation */
13083
/* Nothing is released by this implementation; the function always
   reports 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int released = 0;

    return released;
}
13089
Guido van Rossumd57fd912000-03-10 22:53:23 +000013090void
Thomas Wouters78890102000-07-22 19:25:51 +000013091_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013092{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013093 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013094
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000013095 Py_XDECREF(unicode_empty);
13096 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000013097
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013098 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013099 if (unicode_latin1[i]) {
13100 Py_DECREF(unicode_latin1[i]);
13101 unicode_latin1[i] = NULL;
13102 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013103 }
Christian Heimesa156e092008-02-16 07:38:31 +000013104 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000013105}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000013106
Walter Dörwald16807132007-05-25 13:52:07 +000013107void
13108PyUnicode_InternInPlace(PyObject **p)
13109{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013110 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
13111 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020013112#ifdef Py_DEBUG
13113 assert(s != NULL);
13114 assert(_PyUnicode_CHECK(s));
13115#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000013116 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020013117 return;
13118#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000013119 /* If it's a subclass, we don't really know what putting
13120 it in the interned dict might do. */
13121 if (!PyUnicode_CheckExact(s))
13122 return;
13123 if (PyUnicode_CHECK_INTERNED(s))
13124 return;
Victor Stinner1b4f9ce2011-10-03 13:28:14 +020013125 if (_PyUnicode_READY_REPLACE(p)) {
Victor Stinner4fae54c2011-10-03 02:01:52 +020013126 assert(0 && "PyUnicode_READY fail in PyUnicode_InternInPlace");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013127 return;
13128 }
Victor Stinner1b4f9ce2011-10-03 13:28:14 +020013129 s = (PyUnicodeObject *)(*p);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013130 if (interned == NULL) {
13131 interned = PyDict_New();
13132 if (interned == NULL) {
13133 PyErr_Clear(); /* Don't leave an exception */
13134 return;
13135 }
13136 }
13137 /* It might be that the GetItem call fails even
13138 though the key is present in the dictionary,
13139 namely when this happens during a stack overflow. */
13140 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +000013141 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013142 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000013143
Benjamin Peterson29060642009-01-31 22:14:21 +000013144 if (t) {
13145 Py_INCREF(t);
13146 Py_DECREF(*p);
13147 *p = t;
13148 return;
13149 }
Walter Dörwald16807132007-05-25 13:52:07 +000013150
Benjamin Peterson14339b62009-01-31 16:36:08 +000013151 PyThreadState_GET()->recursion_critical = 1;
13152 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
13153 PyErr_Clear();
13154 PyThreadState_GET()->recursion_critical = 0;
13155 return;
13156 }
13157 PyThreadState_GET()->recursion_critical = 0;
13158 /* The two references in interned are not counted by refcnt.
13159 The deallocator will take care of this */
13160 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013161 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000013162}
13163
13164void
13165PyUnicode_InternImmortal(PyObject **p)
13166{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013167 PyUnicodeObject *u = (PyUnicodeObject *)*p;
13168
Benjamin Peterson14339b62009-01-31 16:36:08 +000013169 PyUnicode_InternInPlace(p);
13170 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013171 _PyUnicode_STATE(u).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013172 Py_INCREF(*p);
13173 }
Walter Dörwald16807132007-05-25 13:52:07 +000013174}
13175
13176PyObject *
13177PyUnicode_InternFromString(const char *cp)
13178{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013179 PyObject *s = PyUnicode_FromString(cp);
13180 if (s == NULL)
13181 return NULL;
13182 PyUnicode_InternInPlace(&s);
13183 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000013184}
13185
Alexander Belopolsky40018472011-02-26 01:02:56 +000013186void
13187_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000013188{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013189 PyObject *keys;
13190 PyUnicodeObject *s;
13191 Py_ssize_t i, n;
13192 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000013193
Benjamin Peterson14339b62009-01-31 16:36:08 +000013194 if (interned == NULL || !PyDict_Check(interned))
13195 return;
13196 keys = PyDict_Keys(interned);
13197 if (keys == NULL || !PyList_Check(keys)) {
13198 PyErr_Clear();
13199 return;
13200 }
Walter Dörwald16807132007-05-25 13:52:07 +000013201
Benjamin Peterson14339b62009-01-31 16:36:08 +000013202 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
13203 detector, interned unicode strings are not forcibly deallocated;
13204 rather, we give them their stolen references back, and then clear
13205 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000013206
Benjamin Peterson14339b62009-01-31 16:36:08 +000013207 n = PyList_GET_SIZE(keys);
13208 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000013209 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013210 for (i = 0; i < n; i++) {
13211 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013212 if (PyUnicode_READY(s) == -1)
13213 fprintf(stderr, "could not ready string\n");
13214 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013215 case SSTATE_NOT_INTERNED:
13216 /* XXX Shouldn't happen */
13217 break;
13218 case SSTATE_INTERNED_IMMORTAL:
13219 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013220 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013221 break;
13222 case SSTATE_INTERNED_MORTAL:
13223 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013224 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013225 break;
13226 default:
13227 Py_FatalError("Inconsistent interned string state.");
13228 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013229 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013230 }
13231 fprintf(stderr, "total size of all interned strings: "
13232 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
13233 "mortal/immortal\n", mortal_size, immortal_size);
13234 Py_DECREF(keys);
13235 PyDict_Clear(interned);
13236 Py_DECREF(interned);
13237 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000013238}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013239
13240
13241/********************* Unicode Iterator **************************/
13242
13243typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013244 PyObject_HEAD
13245 Py_ssize_t it_index;
13246 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013247} unicodeiterobject;
13248
13249static void
13250unicodeiter_dealloc(unicodeiterobject *it)
13251{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013252 _PyObject_GC_UNTRACK(it);
13253 Py_XDECREF(it->it_seq);
13254 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013255}
13256
13257static int
13258unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
13259{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013260 Py_VISIT(it->it_seq);
13261 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013262}
13263
13264static PyObject *
13265unicodeiter_next(unicodeiterobject *it)
13266{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013267 PyUnicodeObject *seq;
13268 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013269
Benjamin Peterson14339b62009-01-31 16:36:08 +000013270 assert(it != NULL);
13271 seq = it->it_seq;
13272 if (seq == NULL)
13273 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013274 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013275
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013276 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
13277 int kind = PyUnicode_KIND(seq);
13278 void *data = PyUnicode_DATA(seq);
13279 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
13280 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013281 if (item != NULL)
13282 ++it->it_index;
13283 return item;
13284 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013285
Benjamin Peterson14339b62009-01-31 16:36:08 +000013286 Py_DECREF(seq);
13287 it->it_seq = NULL;
13288 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013289}
13290
13291static PyObject *
13292unicodeiter_len(unicodeiterobject *it)
13293{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013294 Py_ssize_t len = 0;
13295 if (it->it_seq)
13296 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
13297 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013298}
13299
13300PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
13301
13302static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013303 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000013304 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000013305 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013306};
13307
13308PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013309 PyVarObject_HEAD_INIT(&PyType_Type, 0)
13310 "str_iterator", /* tp_name */
13311 sizeof(unicodeiterobject), /* tp_basicsize */
13312 0, /* tp_itemsize */
13313 /* methods */
13314 (destructor)unicodeiter_dealloc, /* tp_dealloc */
13315 0, /* tp_print */
13316 0, /* tp_getattr */
13317 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000013318 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013319 0, /* tp_repr */
13320 0, /* tp_as_number */
13321 0, /* tp_as_sequence */
13322 0, /* tp_as_mapping */
13323 0, /* tp_hash */
13324 0, /* tp_call */
13325 0, /* tp_str */
13326 PyObject_GenericGetAttr, /* tp_getattro */
13327 0, /* tp_setattro */
13328 0, /* tp_as_buffer */
13329 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
13330 0, /* tp_doc */
13331 (traverseproc)unicodeiter_traverse, /* tp_traverse */
13332 0, /* tp_clear */
13333 0, /* tp_richcompare */
13334 0, /* tp_weaklistoffset */
13335 PyObject_SelfIter, /* tp_iter */
13336 (iternextfunc)unicodeiter_next, /* tp_iternext */
13337 unicodeiter_methods, /* tp_methods */
13338 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013339};
13340
13341static PyObject *
13342unicode_iter(PyObject *seq)
13343{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013344 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013345
Benjamin Peterson14339b62009-01-31 16:36:08 +000013346 if (!PyUnicode_Check(seq)) {
13347 PyErr_BadInternalCall();
13348 return NULL;
13349 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013350 if (PyUnicode_READY(seq) == -1)
13351 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013352 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
13353 if (it == NULL)
13354 return NULL;
13355 it->it_index = 0;
13356 Py_INCREF(seq);
13357 it->it_seq = (PyUnicodeObject *)seq;
13358 _PyObject_GC_TRACK(it);
13359 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013360}
13361
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013362#define UNIOP(x) Py_UNICODE_##x
13363#define UNIOP_t Py_UNICODE
13364#include "uniops.h"
13365#undef UNIOP
13366#undef UNIOP_t
13367#define UNIOP(x) Py_UCS4_##x
13368#define UNIOP_t Py_UCS4
13369#include "uniops.h"
13370#undef UNIOP
13371#undef UNIOP_t
Victor Stinner331ea922010-08-10 16:37:20 +000013372
Victor Stinner71133ff2010-09-01 23:43:53 +000013373Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000013374PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000013375{
13376 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
13377 Py_UNICODE *copy;
13378 Py_ssize_t size;
13379
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013380 if (!PyUnicode_Check(unicode)) {
13381 PyErr_BadArgument();
13382 return NULL;
13383 }
Victor Stinner71133ff2010-09-01 23:43:53 +000013384 /* Ensure we won't overflow the size. */
13385 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
13386 PyErr_NoMemory();
13387 return NULL;
13388 }
13389 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
13390 size *= sizeof(Py_UNICODE);
13391 copy = PyMem_Malloc(size);
13392 if (copy == NULL) {
13393 PyErr_NoMemory();
13394 return NULL;
13395 }
13396 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
13397 return copy;
13398}
Martin v. Löwis5b222132007-06-10 09:51:05 +000013399
Georg Brandl66c221e2010-10-14 07:04:07 +000013400/* A _string module, to export formatter_parser and formatter_field_name_split
13401 to the string.Formatter class implemented in Python. */
13402
13403static PyMethodDef _string_methods[] = {
13404 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
13405 METH_O, PyDoc_STR("split the argument as a field name")},
13406 {"formatter_parser", (PyCFunction) formatter_parser,
13407 METH_O, PyDoc_STR("parse the argument as a format string")},
13408 {NULL, NULL}
13409};
13410
13411static struct PyModuleDef _string_module = {
13412 PyModuleDef_HEAD_INIT,
13413 "_string",
13414 PyDoc_STR("string helper module"),
13415 0,
13416 _string_methods,
13417 NULL,
13418 NULL,
13419 NULL,
13420 NULL
13421};
13422
13423PyMODINIT_FUNC
13424PyInit__string(void)
13425{
13426 return PyModule_Create(&_string_module);
13427}
13428
13429
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013430#ifdef __cplusplus
13431}
13432#endif