blob: f3a5dd9b578c562c4b0409fba2674bd3df9bd960 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000044
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000045#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000046#include <windows.h>
47#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000048
Guido van Rossumd57fd912000-03-10 22:53:23 +000049/* Limit for the Unicode object free list */
50
Christian Heimes2202f872008-02-06 14:31:34 +000051#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000052
53/* Limit for the Unicode object free list stay alive optimization.
54
55 The implementation will keep allocated Unicode memory intact for
56 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000057 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000058
Christian Heimes2202f872008-02-06 14:31:34 +000059 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000060 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000061 malloc()-overhead) bytes of unused garbage.
62
63 Setting the limit to 0 effectively turns the feature off.
64
Guido van Rossumfd4b9572000-04-10 13:51:10 +000065 Note: This is an experimental feature ! If you get core dumps when
66 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000067
68*/
69
Guido van Rossumfd4b9572000-04-10 13:51:10 +000070#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000071
72/* Endianness switches; defaults to little endian */
73
74#ifdef WORDS_BIGENDIAN
75# define BYTEORDER_IS_BIG_ENDIAN
76#else
77# define BYTEORDER_IS_LITTLE_ENDIAN
78#endif
79
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000080/* --- Globals ------------------------------------------------------------
81
82 The globals are initialized by the _PyUnicode_Init() API and should
83 not be used before calling that API.
84
85*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000086
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000087
88#ifdef __cplusplus
89extern "C" {
90#endif
91
/* Check performed by the accessor macros below before touching an object:
   debug builds run the full consistency check, all other builds only
   verify that the object is a Unicode object. */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020097
/* Field accessors.  The variants starting with an underscore are raw
   lvalue accessors (no checks unless noted); the others assert that the
   object is a ready Unicode object first. */

/* Raw UTF-8 cache pointer of the compact header. */
#define _PyUnicode_UTF8(op)                                     \
    (((PyCompactUnicodeObject*)(op))->utf8)

/* UTF-8 data: for compact ASCII strings this is the memory located right
   after the PyASCIIObject header, otherwise the utf8 field. */
#define PyUnicode_UTF8(op)                                      \
    (assert(_PyUnicode_CHECK(op)),                              \
     assert(PyUnicode_IS_READY(op)),                            \
     PyUnicode_IS_COMPACT_ASCII(op) ?                           \
         ((char*)((PyASCIIObject*)(op) + 1)) :                  \
         _PyUnicode_UTF8(op))

/* Raw utf8_length field of the compact header. */
#define _PyUnicode_UTF8_LENGTH(op)                              \
    (((PyCompactUnicodeObject*)(op))->utf8_length)

/* UTF-8 length: for compact ASCII strings this is the character length,
   otherwise the utf8_length field. */
#define PyUnicode_UTF8_LENGTH(op)                               \
    (assert(_PyUnicode_CHECK(op)),                              \
     assert(PyUnicode_IS_READY(op)),                            \
     PyUnicode_IS_COMPACT_ASCII(op) ?                           \
         ((PyASCIIObject*)(op))->length :                       \
         _PyUnicode_UTF8_LENGTH(op))

/* Raw wstr pointer and wstr_length field. */
#define _PyUnicode_WSTR(op)                                     \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                              \
    (((PyCompactUnicodeObject*)(op))->wstr_length)

/* Raw length, state and hash fields of the PyASCIIObject header. */
#define _PyUnicode_LENGTH(op)                                   \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                                    \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                                     \
    (((PyASCIIObject *)(op))->hash)

/* Checked access to state.kind and length. */
#define _PyUnicode_KIND(op)                                     \
    (assert(_PyUnicode_CHECK(op)),                              \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                               \
    (assert(_PyUnicode_CHECK(op)),                              \
     ((PyASCIIObject *)(op))->length)

/* Raw data pointer of a PyUnicodeObject (data.any). */
#define _PyUnicode_DATA_ANY(op)                                 \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200132
/* Local override of PyUnicode_READY(): evaluates to 0 when the object is
   already ready, otherwise to the result of _PyUnicode_Ready().  The
   object is checked with _PyUnicode_CHECK() first. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                                     \
    (assert(_PyUnicode_CHECK(op)),                              \
     (PyUnicode_IS_READY(op) ?                                  \
      0 :                                                       \
      _PyUnicode_Ready((PyObject *)(op))))

/* Same as PyUnicode_READY(), but takes a pointer to the object pointer and
   calls _PyUnicode_ReadyReplace() when the object is not ready.
   p_obj is parenthesized before being dereferenced so that any pointer
   expression (not just a plain identifier or &x) can be passed safely. */
#define _PyUnicode_READY_REPLACE(p_obj)                         \
    (assert(_PyUnicode_CHECK(*(p_obj))),                        \
     (PyUnicode_IS_READY(*(p_obj)) ?                            \
      0 : _PyUnicode_ReadyReplace((PyObject **)(p_obj))))
144
/* True if the UTF-8 pointer of the object is the same pointer as its
   character data.  Must not be used on compact ASCII strings (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                               \
    (assert(_PyUnicode_CHECK(op)),                              \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),                   \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer of the object is the same pointer as its
   character data.  Only the macro argument is referenced: the macro must
   not depend on the caller having a variable named "unicode". */
#define _PyUnicode_SHARE_WSTR(op)                               \
    (assert(_PyUnicode_CHECK(op)),                              \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
152
/* True if the object has its own UTF-8 memory block: it is not a compact
   ASCII string, its utf8 pointer is set, and that pointer is not the
   character data itself. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                          \
    (assert(_PyUnicode_CHECK(op)),                              \
     (!PyUnicode_IS_COMPACT_ASCII(op)                           \
      && _PyUnicode_UTF8(op)                                    \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* True if the object has its own wstr memory block: its wstr pointer is
   set and either the object is not ready yet or wstr is not the character
   data itself. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                          \
    (assert(_PyUnicode_CHECK(op)),                              \
     (_PyUnicode_WSTR(op) &&                                    \
      (!PyUnicode_IS_READY(op) ||                               \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
168
/* Copy a run of characters while converting the element type.

   from_type and to_type must be valid type names.  begin and end delimit
   the source characters and should have type "from_type *".  to points
   to the destination buffer; it is cast to "to_type *".  Every source
   element in [begin, end) is cast to to_type and stored in order.
   Note that end is evaluated once per iteration. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                        \
        const from_type *iter_; to_type *to_;                   \
        iter_ = (begin);                                        \
        to_ = (to_type *)(to);                                  \
        while (iter_ < (end)) {                                 \
            *to_ = (to_type)*iter_;                             \
            ++iter_;                                            \
            ++to_;                                              \
        }                                                       \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200183
/* The string contents have been modified: invalidate the cached hash by
   resetting it to -1. */
#define _PyUnicode_DIRTY(op)                                    \
    do {                                                        \
        _PyUnicode_HASH(op) = -1;                               \
    } while (0)
186
Walter Dörwald16807132007-05-25 13:52:07 +0000187/* This dictionary holds all interned unicode strings. Note that references
188 to strings in this dictionary are *not* counted in the string's ob_refcnt.
189 When the interned string reaches a refcnt of 0 the string deallocation
190 function will delete the reference from this dictionary.
191
   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000194*/
195static PyObject *interned;
196
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000197/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200198static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000199
200/* Single character Unicode strings in the Latin-1 range are being
201 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200202static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000203
/* Lookup table for fast detection of the most frequent whitespace
   characters.  Indexed by code point 0x00..0x7F; an entry is 1 for:
     0x09 CHARACTER TABULATION, 0x0A LINE FEED, 0x0B LINE TABULATION,
     0x0C FORM FEED, 0x0D CARRIAGE RETURN,
     0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR, 0x1E RECORD SEPARATOR,
     0x1F UNIT SEPARATOR, 0x20 SPACE
   and 0 for everything else. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x0F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x2F */
    1, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x4F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 - 0x6F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
234
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200235/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200236static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200237static PyObject* get_latin1_char(unsigned char ch);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200238
Alexander Belopolsky40018472011-02-26 01:02:56 +0000239static PyObject *
240unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000241 PyObject **errorHandler,const char *encoding, const char *reason,
242 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
243 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
244
Alexander Belopolsky40018472011-02-26 01:02:56 +0000245static void
246raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300247 const char *encoding,
248 const Py_UNICODE *unicode, Py_ssize_t size,
249 Py_ssize_t startpos, Py_ssize_t endpos,
250 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000251
/* Lookup table for fast detection of line breaks.  Indexed by code point
   0x00..0x7F; an entry is 1 for:
     0x0A LINE FEED, 0x0B LINE TABULATION, 0x0C FORM FEED,
     0x0D CARRIAGE RETURN,
     0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR, 0x1E RECORD SEPARATOR
   and 0 for everything else.
   The table is only ever read, so it is declared const (like
   _Py_ascii_whitespace). */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
279
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300280/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
281 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000282Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000283PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000284{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000285#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000286 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000287#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000288 /* This is actually an illegal character, so it should
289 not be passed to unichr. */
290 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000291#endif
292}
293
#ifdef Py_DEBUG
/* Debug-build sanity check of a Unicode object.

   Asserts that the state flags (kind, compact, ascii, ready, interned)
   agree with each other and with the data, wstr and utf8 pointers and
   their lengths.  All verification is done with assert(); the function
   itself always returns 1 so that it can be used inside assert(). */
static int
_PyUnicode_CheckConsistency(void *op)
{
    PyASCIIObject *ascii;
    PyCompactUnicodeObject *compact;
    unsigned int kind;
    void *data;
    int wstr_is_data;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    /* Compact ASCII: only the header flags can be checked. */
    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
        return 1;
    }

    compact = (PyCompactUnicodeObject *)op;

    if (ascii->state.compact == 1) {
        /* Compact, non-ASCII: characters follow the compact header. */
        data = compact + 1;
        assert(kind == PyUnicode_1BYTE_KIND
               || kind == PyUnicode_2BYTE_KIND
               || kind == PyUnicode_4BYTE_KIND);
        assert(ascii->state.ascii == 0);
        assert(ascii->state.ready == 1);
        assert (compact->utf8 != data);
    }
    else {
        /* Not compact: characters are referenced by data.any. */
        data = ((PyUnicodeObject *)op)->data.any;
        if (kind == PyUnicode_WCHAR_KIND) {
            /* Not ready yet: only the wstr representation exists. */
            assert(ascii->state.compact == 0);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 0);
            assert(ascii->wstr != NULL);
            assert(data == NULL);
            assert(compact->utf8 == NULL);
            assert(ascii->state.interned == SSTATE_NOT_INTERNED);
        }
        else {
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.compact == 0);
            assert(ascii->state.ready == 1);
            assert(data != NULL);
            if (ascii->state.ascii) {
                /* ASCII data doubles as the UTF-8 representation. */
                assert (compact->utf8 == data);
                assert (compact->utf8_length == ascii->length);
            }
            else {
                assert (compact->utf8 != data);
            }
        }
    }

    if (kind != PyUnicode_WCHAR_KIND) {
        /* wstr must be the character data exactly when the kind has the
           same width as wchar_t. */
#if SIZEOF_WCHAR_T == 2
        wstr_is_data = (kind == PyUnicode_2BYTE_KIND);
#else
        wstr_is_data = (kind == PyUnicode_4BYTE_KIND);
#endif
        if (wstr_is_data) {
            assert(ascii->wstr == data);
            assert(compact->wstr_length == ascii->length);
        }
        else {
            assert(ascii->wstr != data);
        }
    }

    /* A missing representation must have a zero length. */
    if (compact->utf8 == NULL) {
        assert(compact->utf8_length == 0);
    }
    if (ascii->wstr == NULL) {
        assert(compact->wstr_length == 0);
    }
    return 1;
}
#else
/* Non-debug builds: no checks are performed, always returns 1. */
static int
_PyUnicode_CheckConsistency(void *op)
{
    return 1;
}
#endif
379
Thomas Wouters477c8d52006-05-27 19:21:47 +0000380/* --- Bloom Filters ----------------------------------------------------- */
381
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the low bits
   of each Unicode character (ch & (BLOOM_WIDTH - 1)) as the bit index. */
385
386/* the linebreak mask is set up by Unicode_Init below */
387
Antoine Pitrouf068f942010-01-13 14:19:12 +0000388#if LONG_BIT >= 128
389#define BLOOM_WIDTH 128
390#elif LONG_BIT >= 64
391#define BLOOM_WIDTH 64
392#elif LONG_BIT >= 32
393#define BLOOM_WIDTH 32
394#else
395#error "LONG_BIT is smaller than 32"
396#endif
397
Thomas Wouters477c8d52006-05-27 19:21:47 +0000398#define BLOOM_MASK unsigned long
399
400static BLOOM_MASK bloom_linebreak;
401
/* BLOOM_ADD(mask, ch): set the bit selected by the low bits of ch
   (ch & (BLOOM_WIDTH - 1)) in the lvalue mask.
   BLOOM(mask, ch): non-zero if that bit is set in mask.
   Both arguments are fully parenthesized so that compound expressions
   such as BLOOM(a | b, ch) are evaluated as intended. */
#define BLOOM_ADD(mask, ch) (((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
#define BLOOM(mask, ch)     (((mask) &  (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000404
/* Non-zero if ch is a line break.  Code points below 128 are looked up
   in ascii_linebreak; for the others the bloom_linebreak mask is tested
   first and Py_UNICODE_ISLINEBREAK() is only called when the mask
   matches.  Note that ch is evaluated more than once. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U                                                        \
        ? ascii_linebreak[(ch)]                                         \
        : (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000408
Alexander Belopolsky40018472011-02-26 01:02:56 +0000409Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200410make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000411{
412 /* calculate simple bloom-style bitmask for a given unicode string */
413
Antoine Pitrouf068f942010-01-13 14:19:12 +0000414 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000415 Py_ssize_t i;
416
417 mask = 0;
418 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200419 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000420
421 return mask;
422}
423
/* True if chr occurs in str.  The bloom mask is tested first, so the
   PyUnicode_FindChar() search over the whole string only runs when the
   mask says chr may be present. */
#define BLOOM_MEMBER(mask, chr, str)                                    \
    (BLOOM(mask, chr)                                                   \
     && (PyUnicode_FindChar(str, chr,                                   \
                            0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000427
Guido van Rossumd57fd912000-03-10 22:53:23 +0000428/* --- Unicode Object ----------------------------------------------------- */
429
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200430static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200431fixup(PyUnicodeObject *self, Py_UCS4 (*fixfct)(PyUnicodeObject *s));
432
433Py_LOCAL_INLINE(char *) findchar(void *s, int kind,
434 Py_ssize_t size, Py_UCS4 ch,
435 int direction)
436{
437 /* like wcschr, but doesn't stop at NULL characters */
438 Py_ssize_t i;
439 if (direction == 1) {
440 for(i = 0; i < size; i++)
441 if (PyUnicode_READ(kind, s, i) == ch)
442 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
443 }
444 else {
445 for(i = size-1; i >= 0; i--)
446 if (PyUnicode_READ(kind, s, i) == ch)
447 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
448 }
449 return NULL;
450}
451
Victor Stinnerfe226c02011-10-03 03:52:20 +0200452static PyObject*
453resize_compact(PyObject *unicode, Py_ssize_t length)
454{
455 Py_ssize_t char_size;
456 Py_ssize_t struct_size;
457 Py_ssize_t new_size;
458 int share_wstr;
459
460 assert(PyUnicode_IS_READY(unicode));
461 char_size = PyUnicode_CHARACTER_SIZE(unicode);
462 if (PyUnicode_IS_COMPACT_ASCII(unicode))
463 struct_size = sizeof(PyASCIIObject);
464 else
465 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200466 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200467
468 _Py_DEC_REFTOTAL;
469 _Py_ForgetReference(unicode);
470
471 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
472 PyErr_NoMemory();
473 return NULL;
474 }
475 new_size = (struct_size + (length + 1) * char_size);
476
477 unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
478 if (unicode == NULL) {
479 PyObject_Del(unicode);
480 PyErr_NoMemory();
481 return NULL;
482 }
483 _Py_NewReference(unicode);
484 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200485 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200486 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200487 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
488 _PyUnicode_WSTR_LENGTH(unicode) = length;
489 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200490 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
491 length, 0);
492 return unicode;
493}
494
Alexander Belopolsky40018472011-02-26 01:02:56 +0000495static int
Victor Stinner95663112011-10-04 01:03:50 +0200496resize_inplace(PyUnicodeObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000497{
Victor Stinner95663112011-10-04 01:03:50 +0200498 wchar_t *wstr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200499 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200500 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000501
Victor Stinner95663112011-10-04 01:03:50 +0200502 _PyUnicode_DIRTY(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200503
504 if (PyUnicode_IS_READY(unicode)) {
505 Py_ssize_t char_size;
506 Py_ssize_t new_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200507 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200508 void *data;
509
510 data = _PyUnicode_DATA_ANY(unicode);
511 assert(data != NULL);
512 char_size = PyUnicode_CHARACTER_SIZE(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200513 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
514 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinner95663112011-10-04 01:03:50 +0200515 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
516 {
517 PyObject_DEL(_PyUnicode_UTF8(unicode));
518 _PyUnicode_UTF8(unicode) = NULL;
519 _PyUnicode_UTF8_LENGTH(unicode) = 0;
520 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200521
522 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
523 PyErr_NoMemory();
524 return -1;
525 }
526 new_size = (length + 1) * char_size;
527
528 data = (PyObject *)PyObject_REALLOC(data, new_size);
529 if (data == NULL) {
530 PyErr_NoMemory();
531 return -1;
532 }
533 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200534 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200535 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200536 _PyUnicode_WSTR_LENGTH(unicode) = length;
537 }
538 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200539 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200540 _PyUnicode_UTF8_LENGTH(unicode) = length;
541 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200542 _PyUnicode_LENGTH(unicode) = length;
543 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinner95663112011-10-04 01:03:50 +0200544 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400545 _PyUnicode_CheckConsistency(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200546 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200547 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200548 }
Victor Stinner95663112011-10-04 01:03:50 +0200549 assert(_PyUnicode_WSTR(unicode) != NULL);
550
551 /* check for integer overflow */
552 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
553 PyErr_NoMemory();
554 return -1;
555 }
556 wstr = _PyUnicode_WSTR(unicode);
557 wstr = PyObject_REALLOC(wstr, sizeof(wchar_t) * (length + 1));
558 if (!wstr) {
559 PyErr_NoMemory();
560 return -1;
561 }
562 _PyUnicode_WSTR(unicode) = wstr;
563 _PyUnicode_WSTR(unicode)[length] = 0;
564 _PyUnicode_WSTR_LENGTH(unicode) = length;
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400565 _PyUnicode_CheckConsistency(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000566 return 0;
567}
568
Victor Stinnerfe226c02011-10-03 03:52:20 +0200569static PyObject*
570resize_copy(PyObject *unicode, Py_ssize_t length)
571{
572 Py_ssize_t copy_length;
573 if (PyUnicode_IS_COMPACT(unicode)) {
574 PyObject *copy;
575 assert(PyUnicode_IS_READY(unicode));
576
577 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
578 if (copy == NULL)
579 return NULL;
580
581 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
582 if (PyUnicode_CopyCharacters(copy, 0,
583 unicode, 0,
584 copy_length) < 0)
585 {
586 Py_DECREF(copy);
587 return NULL;
588 }
589 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200590 }
591 else {
Victor Stinner2fd82272011-10-03 04:06:05 +0200592 PyUnicodeObject *w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200593 assert(_PyUnicode_WSTR(unicode) != NULL);
594 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinner2fd82272011-10-03 04:06:05 +0200595 w = _PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200596 if (w == NULL)
597 return NULL;
598 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
599 copy_length = Py_MIN(copy_length, length);
600 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
601 copy_length);
602 return (PyObject*)w;
603 }
604}
605
/* We allocate one more character to make sure the string is
   U+0000 terminated; some code (e.g. new_identifier)
   relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000609
610 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000611 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000612
613*/
614
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200615#ifdef Py_DEBUG
616int unicode_old_new_calls = 0;
617#endif
618
Alexander Belopolsky40018472011-02-26 01:02:56 +0000619static PyUnicodeObject *
620_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000621{
622 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200623 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000624
Thomas Wouters477c8d52006-05-27 19:21:47 +0000625 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000626 if (length == 0 && unicode_empty != NULL) {
627 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200628 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000629 }
630
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000631 /* Ensure we won't overflow the size. */
632 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
633 return (PyUnicodeObject *)PyErr_NoMemory();
634 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200635 if (length < 0) {
636 PyErr_SetString(PyExc_SystemError,
637 "Negative size passed to _PyUnicode_New");
638 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000639 }
640
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200641#ifdef Py_DEBUG
642 ++unicode_old_new_calls;
643#endif
644
645 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
646 if (unicode == NULL)
647 return NULL;
648 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
649 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
650 if (!_PyUnicode_WSTR(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000651 PyErr_NoMemory();
652 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000653 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200654
Jeremy Hyltond8082792003-09-16 19:41:39 +0000655 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000656 * the caller fails before initializing str -- unicode_resize()
657 * reads str[0], and the Keep-Alive optimization can keep memory
658 * allocated for str alive across a call to unicode_dealloc(unicode).
659 * We don't want unicode_resize to read uninitialized memory in
660 * that case.
661 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200662 _PyUnicode_WSTR(unicode)[0] = 0;
663 _PyUnicode_WSTR(unicode)[length] = 0;
664 _PyUnicode_WSTR_LENGTH(unicode) = length;
665 _PyUnicode_HASH(unicode) = -1;
666 _PyUnicode_STATE(unicode).interned = 0;
667 _PyUnicode_STATE(unicode).kind = 0;
668 _PyUnicode_STATE(unicode).compact = 0;
669 _PyUnicode_STATE(unicode).ready = 0;
670 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200671 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200672 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200673 _PyUnicode_UTF8(unicode) = NULL;
674 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000675 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000676
Benjamin Peterson29060642009-01-31 22:14:21 +0000677 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000678 /* XXX UNREF/NEWREF interface should be more symmetrical */
679 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000680 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000681 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000682 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000683}
684
Victor Stinnerf42dc442011-10-02 23:33:16 +0200685static const char*
686unicode_kind_name(PyObject *unicode)
687{
Victor Stinner42dfd712011-10-03 14:41:45 +0200688 /* don't check consistency: unicode_kind_name() is called from
689 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200690 if (!PyUnicode_IS_COMPACT(unicode))
691 {
692 if (!PyUnicode_IS_READY(unicode))
693 return "wstr";
694 switch(PyUnicode_KIND(unicode))
695 {
696 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200697 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200698 return "legacy ascii";
699 else
700 return "legacy latin1";
701 case PyUnicode_2BYTE_KIND:
702 return "legacy UCS2";
703 case PyUnicode_4BYTE_KIND:
704 return "legacy UCS4";
705 default:
706 return "<legacy invalid kind>";
707 }
708 }
709 assert(PyUnicode_IS_READY(unicode));
710 switch(PyUnicode_KIND(unicode))
711 {
712 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200713 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200714 return "ascii";
715 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200716 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200717 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200718 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200719 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200720 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200721 default:
722 return "<invalid compact kind>";
723 }
724}
725
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200726#ifdef Py_DEBUG
/* Debug-build counter; incremented by PyUnicode_New() each time it goes past
   its empty-string shortcut. */
int unicode_new_new_calls = 0;
728
729/* Functions wrapping macros for use in debugger */
/* Debugger helper: callable function form of the PyUnicode_UTF8() macro. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
733
/* Debugger helper: callable function form of the _PyUnicode_COMPACT_DATA()
   macro. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
737void *_PyUnicode_data(void *unicode){
738 printf("obj %p\n", unicode);
739 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
740 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
741 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
742 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
743 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
744 return PyUnicode_DATA(unicode);
745}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200746
747void
748_PyUnicode_Dump(PyObject *op)
749{
750 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200751 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
752 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
753 void *data;
754 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
755 if (ascii->state.compact)
756 data = (compact + 1);
757 else
758 data = unicode->data.any;
759 if (ascii->wstr == data)
760 printf("shared ");
761 printf("wstr=%p", ascii->wstr);
Victor Stinnera3b334d2011-10-03 13:53:37 +0200762 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +0200763 printf(" (%zu), ", compact->wstr_length);
764 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
765 printf("shared ");
766 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200767 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200768 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200769}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200770#endif
771
772PyObject *
773PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
774{
775 PyObject *obj;
776 PyCompactUnicodeObject *unicode;
777 void *data;
778 int kind_state;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200779 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200780 Py_ssize_t char_size;
781 Py_ssize_t struct_size;
782
783 /* Optimization for empty strings */
784 if (size == 0 && unicode_empty != NULL) {
785 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200786 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200787 }
788
789#ifdef Py_DEBUG
790 ++unicode_new_new_calls;
791#endif
792
Victor Stinner9e9d6892011-10-04 01:02:02 +0200793 is_ascii = 0;
794 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200795 struct_size = sizeof(PyCompactUnicodeObject);
796 if (maxchar < 128) {
797 kind_state = PyUnicode_1BYTE_KIND;
798 char_size = 1;
799 is_ascii = 1;
800 struct_size = sizeof(PyASCIIObject);
801 }
802 else if (maxchar < 256) {
803 kind_state = PyUnicode_1BYTE_KIND;
804 char_size = 1;
805 }
806 else if (maxchar < 65536) {
807 kind_state = PyUnicode_2BYTE_KIND;
808 char_size = 2;
809 if (sizeof(wchar_t) == 2)
810 is_sharing = 1;
811 }
812 else {
813 kind_state = PyUnicode_4BYTE_KIND;
814 char_size = 4;
815 if (sizeof(wchar_t) == 4)
816 is_sharing = 1;
817 }
818
819 /* Ensure we won't overflow the size. */
820 if (size < 0) {
821 PyErr_SetString(PyExc_SystemError,
822 "Negative size passed to PyUnicode_New");
823 return NULL;
824 }
825 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
826 return PyErr_NoMemory();
827
828 /* Duplicated allocation code from _PyObject_New() instead of a call to
829 * PyObject_New() so we are able to allocate space for the object and
830 * it's data buffer.
831 */
832 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
833 if (obj == NULL)
834 return PyErr_NoMemory();
835 obj = PyObject_INIT(obj, &PyUnicode_Type);
836 if (obj == NULL)
837 return NULL;
838
839 unicode = (PyCompactUnicodeObject *)obj;
840 if (is_ascii)
841 data = ((PyASCIIObject*)obj) + 1;
842 else
843 data = unicode + 1;
844 _PyUnicode_LENGTH(unicode) = size;
845 _PyUnicode_HASH(unicode) = -1;
846 _PyUnicode_STATE(unicode).interned = 0;
847 _PyUnicode_STATE(unicode).kind = kind_state;
848 _PyUnicode_STATE(unicode).compact = 1;
849 _PyUnicode_STATE(unicode).ready = 1;
850 _PyUnicode_STATE(unicode).ascii = is_ascii;
851 if (is_ascii) {
852 ((char*)data)[size] = 0;
853 _PyUnicode_WSTR(unicode) = NULL;
854 }
855 else if (kind_state == PyUnicode_1BYTE_KIND) {
856 ((char*)data)[size] = 0;
857 _PyUnicode_WSTR(unicode) = NULL;
858 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200859 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200860 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200861 }
862 else {
863 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200864 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200865 if (kind_state == PyUnicode_2BYTE_KIND)
866 ((Py_UCS2*)data)[size] = 0;
867 else /* kind_state == PyUnicode_4BYTE_KIND */
868 ((Py_UCS4*)data)[size] = 0;
869 if (is_sharing) {
870 _PyUnicode_WSTR_LENGTH(unicode) = size;
871 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
872 }
873 else {
874 _PyUnicode_WSTR_LENGTH(unicode) = 0;
875 _PyUnicode_WSTR(unicode) = NULL;
876 }
877 }
878 return obj;
879}
880
881#if SIZEOF_WCHAR_T == 2
882/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
883 will decode surrogate pairs, the other conversions are implemented as macros
884 for efficency.
885
886 This function assumes that unicode can hold one more code point than wstr
887 characters for a terminating null character. */
Victor Stinnerc53be962011-10-02 21:33:54 +0200888static void
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200889unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
890 PyUnicodeObject *unicode)
891{
892 const wchar_t *iter;
893 Py_UCS4 *ucs4_out;
894
Victor Stinner910337b2011-10-03 03:20:16 +0200895 assert(unicode != NULL);
896 assert(_PyUnicode_CHECK(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200897 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
898 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
899
900 for (iter = begin; iter < end; ) {
901 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
902 _PyUnicode_GET_LENGTH(unicode)));
903 if (*iter >= 0xD800 && *iter <= 0xDBFF
904 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
905 {
906 *ucs4_out++ = (((iter[0] & 0x3FF)<<10) | (iter[1] & 0x3FF)) + 0x10000;
907 iter += 2;
908 }
909 else {
910 *ucs4_out++ = *iter;
911 iter++;
912 }
913 }
914 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
915 _PyUnicode_GET_LENGTH(unicode)));
916
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200917}
918#endif
919
Victor Stinnercd9950f2011-10-02 00:34:53 +0200920static int
921_PyUnicode_Dirty(PyObject *unicode)
922{
Victor Stinner910337b2011-10-03 03:20:16 +0200923 assert(_PyUnicode_CHECK(unicode));
Victor Stinnercd9950f2011-10-02 00:34:53 +0200924 if (Py_REFCNT(unicode) != 1) {
Victor Stinner01698042011-10-04 00:04:26 +0200925 PyErr_SetString(PyExc_SystemError,
Victor Stinnercd9950f2011-10-02 00:34:53 +0200926 "Cannot modify a string having more than 1 reference");
927 return -1;
928 }
929 _PyUnicode_DIRTY(unicode);
930 return 0;
931}
932
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200933Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200934PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
935 PyObject *from, Py_ssize_t from_start,
936 Py_ssize_t how_many)
937{
Victor Stinnera0702ab2011-09-29 14:14:38 +0200938 unsigned int from_kind, to_kind;
939 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200940
Victor Stinnerb1536152011-09-30 02:26:10 +0200941 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
942 PyErr_BadInternalCall();
943 return -1;
944 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200945
946 if (PyUnicode_READY(from))
947 return -1;
948 if (PyUnicode_READY(to))
949 return -1;
950
Victor Stinnerff9e50f2011-09-28 22:17:19 +0200951 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200952 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
Victor Stinner01698042011-10-04 00:04:26 +0200953 PyErr_Format(PyExc_SystemError,
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200954 "Cannot write %zi characters at %zi "
955 "in a string of %zi characters",
956 how_many, to_start, PyUnicode_GET_LENGTH(to));
957 return -1;
958 }
Victor Stinnerf5ca1a22011-09-28 23:54:59 +0200959 if (how_many == 0)
960 return 0;
961
Victor Stinnercd9950f2011-10-02 00:34:53 +0200962 if (_PyUnicode_Dirty(to))
Victor Stinnerf5ca1a22011-09-28 23:54:59 +0200963 return -1;
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200964
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200965 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200966 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200967 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200968 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200969
Victor Stinnerf42dc442011-10-02 23:33:16 +0200970 if (from_kind == to_kind
971 /* deny latin1 => ascii */
972 && PyUnicode_MAX_CHAR_VALUE(to) >= PyUnicode_MAX_CHAR_VALUE(from))
973 {
Victor Stinnera0702ab2011-09-29 14:14:38 +0200974 Py_MEMCPY((char*)to_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200975 + PyUnicode_KIND_SIZE(to_kind, to_start),
Victor Stinnera0702ab2011-09-29 14:14:38 +0200976 (char*)from_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200977 + PyUnicode_KIND_SIZE(from_kind, from_start),
978 PyUnicode_KIND_SIZE(to_kind, how_many));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200979 }
Victor Stinnera0702ab2011-09-29 14:14:38 +0200980 else if (from_kind == PyUnicode_1BYTE_KIND
981 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200982 {
983 _PyUnicode_CONVERT_BYTES(
984 Py_UCS1, Py_UCS2,
985 PyUnicode_1BYTE_DATA(from) + from_start,
986 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
987 PyUnicode_2BYTE_DATA(to) + to_start
988 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200989 }
Victor Stinner157f83f2011-09-28 21:41:31 +0200990 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200991 && to_kind == PyUnicode_4BYTE_KIND)
992 {
993 _PyUnicode_CONVERT_BYTES(
994 Py_UCS1, Py_UCS4,
995 PyUnicode_1BYTE_DATA(from) + from_start,
996 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
997 PyUnicode_4BYTE_DATA(to) + to_start
998 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200999 }
1000 else if (from_kind == PyUnicode_2BYTE_KIND
1001 && to_kind == PyUnicode_4BYTE_KIND)
1002 {
1003 _PyUnicode_CONVERT_BYTES(
1004 Py_UCS2, Py_UCS4,
1005 PyUnicode_2BYTE_DATA(from) + from_start,
1006 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1007 PyUnicode_4BYTE_DATA(to) + to_start
1008 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001009 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001010 else {
1011 int invalid_kinds;
Victor Stinnerf42dc442011-10-02 23:33:16 +02001012
1013 /* check if max_char(from substring) <= max_char(to) */
1014 if (from_kind > to_kind
1015 /* latin1 => ascii */
Victor Stinnera3b334d2011-10-03 13:53:37 +02001016 || (PyUnicode_IS_ASCII(to)
Victor Stinnerf42dc442011-10-02 23:33:16 +02001017 && to_kind == PyUnicode_1BYTE_KIND
Victor Stinnera3b334d2011-10-03 13:53:37 +02001018 && !PyUnicode_IS_ASCII(from)))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001019 {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001020 /* slow path to check for character overflow */
1021 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1022 Py_UCS4 ch, maxchar;
1023 Py_ssize_t i;
1024
1025 maxchar = 0;
1026 invalid_kinds = 0;
1027 for (i=0; i < how_many; i++) {
1028 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1029 if (ch > maxchar) {
1030 maxchar = ch;
1031 if (maxchar > to_maxchar) {
1032 invalid_kinds = 1;
1033 break;
1034 }
1035 }
1036 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1037 }
1038 }
1039 else
1040 invalid_kinds = 1;
1041 if (invalid_kinds) {
Victor Stinner01698042011-10-04 00:04:26 +02001042 PyErr_Format(PyExc_SystemError,
Victor Stinnerf42dc442011-10-02 23:33:16 +02001043 "Cannot copy %s characters "
1044 "into a string of %s characters",
1045 unicode_kind_name(from),
1046 unicode_kind_name(to));
Victor Stinnera0702ab2011-09-29 14:14:38 +02001047 return -1;
1048 }
1049 }
1050 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001051}
1052
Victor Stinner17222162011-09-28 22:15:37 +02001053/* Find the maximum code point and count the number of surrogate pairs so a
1054 correct string length can be computed before converting a string to UCS4.
1055 This function counts single surrogates as a character and not as a pair.
1056
1057 Return 0 on success, or -1 on error. */
1058static int
1059find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1060 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001061{
1062 const wchar_t *iter;
1063
Victor Stinnerc53be962011-10-02 21:33:54 +02001064 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001065 if (num_surrogates == NULL || maxchar == NULL) {
1066 PyErr_SetString(PyExc_SystemError,
1067 "unexpected NULL arguments to "
1068 "PyUnicode_FindMaxCharAndNumSurrogatePairs");
1069 return -1;
1070 }
1071
1072 *num_surrogates = 0;
1073 *maxchar = 0;
1074
1075 for (iter = begin; iter < end; ) {
1076 if (*iter > *maxchar)
1077 *maxchar = *iter;
1078#if SIZEOF_WCHAR_T == 2
1079 if (*iter >= 0xD800 && *iter <= 0xDBFF
1080 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
1081 {
1082 Py_UCS4 surrogate_val;
1083 surrogate_val = (((iter[0] & 0x3FF)<<10)
1084 | (iter[1] & 0x3FF)) + 0x10000;
1085 ++(*num_surrogates);
1086 if (surrogate_val > *maxchar)
1087 *maxchar = surrogate_val;
1088 iter += 2;
1089 }
1090 else
1091 iter++;
1092#else
1093 iter++;
1094#endif
1095 }
1096 return 0;
1097}
1098
#ifdef Py_DEBUG
/* Debug-build counter; incremented on every call to unicode_ready(). */
int unicode_ready_calls = 0;
#endif
1102
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001103static int
1104unicode_ready(PyObject **p_obj, int replace)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001105{
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001106 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001107 wchar_t *end;
1108 Py_UCS4 maxchar = 0;
1109 Py_ssize_t num_surrogates;
1110#if SIZEOF_WCHAR_T == 2
1111 Py_ssize_t length_wo_surrogates;
1112#endif
1113
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001114 assert(p_obj != NULL);
1115 unicode = (PyUnicodeObject *)*p_obj;
1116
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001117 /* _PyUnicode_Ready() is only intented for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001118 strings were created using _PyObject_New() and where no canonical
1119 representation (the str field) has been set yet aka strings
1120 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001121 assert(_PyUnicode_CHECK(unicode));
1122 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001123 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001124 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001125 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001126 /* Actually, it should neither be interned nor be anything else: */
1127 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001128
1129#ifdef Py_DEBUG
1130 ++unicode_ready_calls;
1131#endif
1132
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001133#ifdef Py_DEBUG
1134 assert(!replace || Py_REFCNT(unicode) == 1);
1135#else
1136 if (replace && Py_REFCNT(unicode) != 1)
1137 replace = 0;
1138#endif
1139 if (replace) {
1140 Py_ssize_t len = _PyUnicode_WSTR_LENGTH(unicode);
1141 wchar_t *wstr = _PyUnicode_WSTR(unicode);
1142 /* Optimization for empty strings */
1143 if (len == 0) {
1144 Py_INCREF(unicode_empty);
1145 Py_DECREF(*p_obj);
1146 *p_obj = unicode_empty;
1147 return 0;
1148 }
1149 if (len == 1 && wstr[0] < 256) {
1150 PyObject *latin1_char = get_latin1_char((unsigned char)wstr[0]);
1151 if (latin1_char == NULL)
1152 return -1;
1153 Py_DECREF(*p_obj);
1154 *p_obj = latin1_char;
1155 return 0;
1156 }
1157 }
1158
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001159 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001160 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001161 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001162 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001163
1164 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001165 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1166 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001167 PyErr_NoMemory();
1168 return -1;
1169 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001170 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001171 _PyUnicode_WSTR(unicode), end,
1172 PyUnicode_1BYTE_DATA(unicode));
1173 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1174 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1175 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1176 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001177 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001178 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001179 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001180 }
1181 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001182 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001183 _PyUnicode_UTF8(unicode) = NULL;
1184 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001185 }
1186 PyObject_FREE(_PyUnicode_WSTR(unicode));
1187 _PyUnicode_WSTR(unicode) = NULL;
1188 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1189 }
1190 /* In this case we might have to convert down from 4-byte native
1191 wchar_t to 2-byte unicode. */
1192 else if (maxchar < 65536) {
1193 assert(num_surrogates == 0 &&
1194 "FindMaxCharAndNumSurrogatePairs() messed up");
1195
Victor Stinner506f5922011-09-28 22:34:18 +02001196#if SIZEOF_WCHAR_T == 2
1197 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001198 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001199 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1200 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1201 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001202 _PyUnicode_UTF8(unicode) = NULL;
1203 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001204#else
1205 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001206 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001207 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001208 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001209 PyErr_NoMemory();
1210 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001211 }
Victor Stinner506f5922011-09-28 22:34:18 +02001212 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1213 _PyUnicode_WSTR(unicode), end,
1214 PyUnicode_2BYTE_DATA(unicode));
1215 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1216 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1217 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001218 _PyUnicode_UTF8(unicode) = NULL;
1219 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001220 PyObject_FREE(_PyUnicode_WSTR(unicode));
1221 _PyUnicode_WSTR(unicode) = NULL;
1222 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1223#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001224 }
1225 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1226 else {
1227#if SIZEOF_WCHAR_T == 2
1228 /* in case the native representation is 2-bytes, we need to allocate a
1229 new normalized 4-byte version. */
1230 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001231 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1232 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001233 PyErr_NoMemory();
1234 return -1;
1235 }
1236 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1237 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001238 _PyUnicode_UTF8(unicode) = NULL;
1239 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001240 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1241 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001242 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001243 PyObject_FREE(_PyUnicode_WSTR(unicode));
1244 _PyUnicode_WSTR(unicode) = NULL;
1245 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1246#else
1247 assert(num_surrogates == 0);
1248
Victor Stinnerc3c74152011-10-02 20:39:55 +02001249 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001250 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001251 _PyUnicode_UTF8(unicode) = NULL;
1252 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001253 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1254#endif
1255 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1256 }
1257 _PyUnicode_STATE(unicode).ready = 1;
1258 return 0;
1259}
1260
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001261int
1262_PyUnicode_ReadyReplace(PyObject **op)
1263{
1264 return unicode_ready(op, 1);
1265}
1266
1267int
1268_PyUnicode_Ready(PyObject *op)
1269{
1270 return unicode_ready(&op, 0);
1271}
1272
Alexander Belopolsky40018472011-02-26 01:02:56 +00001273static void
1274unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001275{
Walter Dörwald16807132007-05-25 13:52:07 +00001276 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001277 case SSTATE_NOT_INTERNED:
1278 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001279
Benjamin Peterson29060642009-01-31 22:14:21 +00001280 case SSTATE_INTERNED_MORTAL:
1281 /* revive dead object temporarily for DelItem */
1282 Py_REFCNT(unicode) = 3;
1283 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
1284 Py_FatalError(
1285 "deletion of interned string failed");
1286 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001287
Benjamin Peterson29060642009-01-31 22:14:21 +00001288 case SSTATE_INTERNED_IMMORTAL:
1289 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001290
Benjamin Peterson29060642009-01-31 22:14:21 +00001291 default:
1292 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001293 }
1294
Victor Stinner03490912011-10-03 23:45:12 +02001295 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001296 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001297 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001298 PyObject_DEL(_PyUnicode_UTF8(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001299
1300 if (PyUnicode_IS_COMPACT(unicode)) {
1301 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001302 }
1303 else {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001304 if (_PyUnicode_DATA_ANY(unicode))
1305 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Benjamin Peterson29060642009-01-31 22:14:21 +00001306 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001307 }
1308}
1309
Alexander Belopolsky40018472011-02-26 01:02:56 +00001310static int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001311unicode_resizable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001312{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001313 if (Py_REFCNT(unicode) != 1)
1314 return 0;
1315 if (PyUnicode_CHECK_INTERNED(unicode))
1316 return 0;
Benjamin Peterson7f3140e2011-10-03 19:37:29 -04001317 assert(unicode != unicode_empty);
Victor Stinner77bb47b2011-10-03 20:06:05 +02001318#ifdef Py_DEBUG
1319 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND
1320 && PyUnicode_GET_LENGTH(unicode) == 1)
1321 {
1322 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001323 if (ch < 256 && unicode_latin1[ch] == unicode)
1324 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001325 }
Victor Stinner77bb47b2011-10-03 20:06:05 +02001326#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001327 return 1;
1328}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001329
Victor Stinnerfe226c02011-10-03 03:52:20 +02001330static int
1331unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1332{
1333 PyObject *unicode;
1334 Py_ssize_t old_length;
1335
1336 assert(p_unicode != NULL);
1337 unicode = *p_unicode;
1338
1339 assert(unicode != NULL);
1340 assert(PyUnicode_Check(unicode));
1341 assert(0 <= length);
1342
Victor Stinner910337b2011-10-03 03:20:16 +02001343 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001344 old_length = PyUnicode_WSTR_LENGTH(unicode);
1345 else
1346 old_length = PyUnicode_GET_LENGTH(unicode);
1347 if (old_length == length)
1348 return 0;
1349
Victor Stinnerfe226c02011-10-03 03:52:20 +02001350 if (!unicode_resizable(unicode)) {
1351 PyObject *copy = resize_copy(unicode, length);
1352 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001353 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001354 Py_DECREF(*p_unicode);
1355 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001356 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001357 }
1358
Victor Stinnerfe226c02011-10-03 03:52:20 +02001359 if (PyUnicode_IS_COMPACT(unicode)) {
1360 *p_unicode = resize_compact(unicode, length);
1361 if (*p_unicode == NULL)
1362 return -1;
Benjamin Petersonccc51c12011-10-03 19:34:12 -04001363 _PyUnicode_CheckConsistency(*p_unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001364 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001365 }
1366 return resize_inplace((PyUnicodeObject*)unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001367}
1368
Alexander Belopolsky40018472011-02-26 01:02:56 +00001369int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001370PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001371{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001372 PyObject *unicode;
1373 if (p_unicode == NULL) {
1374 PyErr_BadInternalCall();
1375 return -1;
1376 }
1377 unicode = *p_unicode;
1378 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0
1379 || _PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND)
1380 {
1381 PyErr_BadInternalCall();
1382 return -1;
1383 }
1384 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001385}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001386
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001387static PyObject*
1388get_latin1_char(unsigned char ch)
1389{
Victor Stinnera464fc12011-10-02 20:39:30 +02001390 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001391 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001392 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001393 if (!unicode)
1394 return NULL;
1395 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
1396 unicode_latin1[ch] = unicode;
1397 }
1398 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001399 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001400}
1401
Alexander Belopolsky40018472011-02-26 01:02:56 +00001402PyObject *
1403PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001404{
1405 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001406 Py_UCS4 maxchar = 0;
1407 Py_ssize_t num_surrogates;
1408
1409 if (u == NULL)
1410 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001411
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001412 /* If the Unicode data is known at construction time, we can apply
1413 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001414
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001415 /* Optimization for empty strings */
1416 if (size == 0 && unicode_empty != NULL) {
1417 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001418 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001419 }
Tim Petersced69f82003-09-16 20:30:58 +00001420
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001421 /* Single character Unicode objects in the Latin-1 range are
1422 shared when using this constructor */
1423 if (size == 1 && *u < 256)
1424 return get_latin1_char((unsigned char)*u);
1425
1426 /* If not empty and not single character, copy the Unicode data
1427 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001428 if (find_maxchar_surrogates(u, u + size,
1429 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001430 return NULL;
1431
1432 unicode = (PyUnicodeObject *) PyUnicode_New(size - num_surrogates,
1433 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001434 if (!unicode)
1435 return NULL;
1436
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001437 switch (PyUnicode_KIND(unicode)) {
1438 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001439 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001440 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1441 break;
1442 case PyUnicode_2BYTE_KIND:
1443#if Py_UNICODE_SIZE == 2
1444 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1445#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001446 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001447 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1448#endif
1449 break;
1450 case PyUnicode_4BYTE_KIND:
1451#if SIZEOF_WCHAR_T == 2
1452 /* This is the only case which has to process surrogates, thus
1453 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001454 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001455#else
1456 assert(num_surrogates == 0);
1457 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1458#endif
1459 break;
1460 default:
1461 assert(0 && "Impossible state");
1462 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001463
1464 return (PyObject *)unicode;
1465}
1466
Alexander Belopolsky40018472011-02-26 01:02:56 +00001467PyObject *
1468PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001469{
1470 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +00001471
Benjamin Peterson14339b62009-01-31 16:36:08 +00001472 if (size < 0) {
1473 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001474 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001475 return NULL;
1476 }
Christian Heimes33fe8092008-04-13 13:53:33 +00001477
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001478 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +00001479 some optimizations which share commonly used objects.
1480 Also, this means the input must be UTF-8, so fall back to the
1481 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001482 if (u != NULL) {
1483
Benjamin Peterson29060642009-01-31 22:14:21 +00001484 /* Optimization for empty strings */
1485 if (size == 0 && unicode_empty != NULL) {
1486 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001487 return unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001488 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001489
1490 /* Single characters are shared when using this constructor.
1491 Restrict to ASCII, since the input must be UTF-8. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001492 if (size == 1 && Py_CHARMASK(*u) < 128)
1493 return get_latin1_char(Py_CHARMASK(*u));
Martin v. Löwis9c121062007-08-05 20:26:11 +00001494
1495 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001496 }
1497
Walter Dörwald55507312007-05-18 13:12:10 +00001498 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001499 if (!unicode)
1500 return NULL;
1501
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001502 return (PyObject *)unicode;
1503}
1504
Alexander Belopolsky40018472011-02-26 01:02:56 +00001505PyObject *
1506PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001507{
1508 size_t size = strlen(u);
1509 if (size > PY_SSIZE_T_MAX) {
1510 PyErr_SetString(PyExc_OverflowError, "input too long");
1511 return NULL;
1512 }
1513
1514 return PyUnicode_FromStringAndSize(u, size);
1515}
1516
Victor Stinnere57b1c02011-09-28 22:20:48 +02001517static PyObject*
1518_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001519{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001520 PyObject *res;
1521 unsigned char max = 127;
1522 Py_ssize_t i;
1523 for (i = 0; i < size; i++) {
1524 if (u[i] & 0x80) {
1525 max = 255;
1526 break;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001527 }
1528 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001529 res = PyUnicode_New(size, max);
1530 if (!res)
1531 return NULL;
1532 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
1533 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001534}
1535
Victor Stinnere57b1c02011-09-28 22:20:48 +02001536static PyObject*
1537_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001538{
1539 PyObject *res;
1540 Py_UCS2 max = 0;
1541 Py_ssize_t i;
1542 for (i = 0; i < size; i++)
1543 if (u[i] > max)
1544 max = u[i];
1545 res = PyUnicode_New(size, max);
1546 if (!res)
1547 return NULL;
1548 if (max >= 256)
1549 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
1550 else
1551 for (i = 0; i < size; i++)
1552 PyUnicode_1BYTE_DATA(res)[i] = (Py_UCS1)u[i];
1553 return res;
1554}
1555
Victor Stinnere57b1c02011-09-28 22:20:48 +02001556static PyObject*
1557_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001558{
1559 PyObject *res;
1560 Py_UCS4 max = 0;
1561 Py_ssize_t i;
1562 for (i = 0; i < size; i++)
1563 if (u[i] > max)
1564 max = u[i];
1565 res = PyUnicode_New(size, max);
1566 if (!res)
1567 return NULL;
1568 if (max >= 0x10000)
1569 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
1570 else {
1571 int kind = PyUnicode_KIND(res);
1572 void *data = PyUnicode_DATA(res);
1573 for (i = 0; i < size; i++)
1574 PyUnicode_WRITE(kind, data, i, u[i]);
1575 }
1576 return res;
1577}
1578
1579PyObject*
1580PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1581{
1582 switch(kind) {
1583 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001584 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001585 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001586 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001587 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001588 return _PyUnicode_FromUCS4(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001589 }
Victor Stinner01698042011-10-04 00:04:26 +02001590 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001591 return NULL;
1592}
1593
Victor Stinner034f6cf2011-09-30 02:26:44 +02001594PyObject*
1595PyUnicode_Copy(PyObject *unicode)
1596{
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001597 Py_ssize_t size;
1598 PyObject *copy;
1599 void *data;
1600
Victor Stinner034f6cf2011-09-30 02:26:44 +02001601 if (!PyUnicode_Check(unicode)) {
1602 PyErr_BadInternalCall();
1603 return NULL;
1604 }
1605 if (PyUnicode_READY(unicode))
1606 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001607
1608 size = PyUnicode_GET_LENGTH(unicode);
1609 copy = PyUnicode_New(size, PyUnicode_MAX_CHAR_VALUE(unicode));
1610 if (!copy)
1611 return NULL;
1612 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1613
1614 data = PyUnicode_DATA(unicode);
1615 switch (PyUnicode_KIND(unicode))
1616 {
1617 case PyUnicode_1BYTE_KIND:
1618 memcpy(PyUnicode_1BYTE_DATA(copy), data, size);
1619 break;
1620 case PyUnicode_2BYTE_KIND:
1621 memcpy(PyUnicode_2BYTE_DATA(copy), data, sizeof(Py_UCS2) * size);
1622 break;
1623 case PyUnicode_4BYTE_KIND:
1624 memcpy(PyUnicode_4BYTE_DATA(copy), data, sizeof(Py_UCS4) * size);
1625 break;
1626 default:
1627 assert(0);
1628 break;
1629 }
1630 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02001631}
1632
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001633
Victor Stinnerbc603d12011-10-02 01:00:40 +02001634/* Widen Unicode objects to larger buffers. Don't write terminating null
1635 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001636
1637void*
1638_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1639{
Victor Stinnerbc603d12011-10-02 01:00:40 +02001640 Py_ssize_t len;
1641 void *result;
1642 unsigned int skind;
1643
1644 if (PyUnicode_READY(s))
1645 return NULL;
1646
1647 len = PyUnicode_GET_LENGTH(s);
1648 skind = PyUnicode_KIND(s);
1649 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02001650 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001651 return NULL;
1652 }
1653 switch(kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02001654 case PyUnicode_2BYTE_KIND:
1655 result = PyMem_Malloc(len * sizeof(Py_UCS2));
1656 if (!result)
1657 return PyErr_NoMemory();
1658 assert(skind == PyUnicode_1BYTE_KIND);
1659 _PyUnicode_CONVERT_BYTES(
1660 Py_UCS1, Py_UCS2,
1661 PyUnicode_1BYTE_DATA(s),
1662 PyUnicode_1BYTE_DATA(s) + len,
1663 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001664 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001665 case PyUnicode_4BYTE_KIND:
1666 result = PyMem_Malloc(len * sizeof(Py_UCS4));
1667 if (!result)
1668 return PyErr_NoMemory();
1669 if (skind == PyUnicode_2BYTE_KIND) {
1670 _PyUnicode_CONVERT_BYTES(
1671 Py_UCS2, Py_UCS4,
1672 PyUnicode_2BYTE_DATA(s),
1673 PyUnicode_2BYTE_DATA(s) + len,
1674 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001675 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02001676 else {
1677 assert(skind == PyUnicode_1BYTE_KIND);
1678 _PyUnicode_CONVERT_BYTES(
1679 Py_UCS1, Py_UCS4,
1680 PyUnicode_1BYTE_DATA(s),
1681 PyUnicode_1BYTE_DATA(s) + len,
1682 result);
1683 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001684 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001685 default:
1686 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001687 }
Victor Stinner01698042011-10-04 00:04:26 +02001688 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001689 return NULL;
1690}
1691
1692static Py_UCS4*
1693as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1694 int copy_null)
1695{
1696 int kind;
1697 void *data;
1698 Py_ssize_t len, targetlen;
1699 if (PyUnicode_READY(string) == -1)
1700 return NULL;
1701 kind = PyUnicode_KIND(string);
1702 data = PyUnicode_DATA(string);
1703 len = PyUnicode_GET_LENGTH(string);
1704 targetlen = len;
1705 if (copy_null)
1706 targetlen++;
1707 if (!target) {
1708 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
1709 PyErr_NoMemory();
1710 return NULL;
1711 }
1712 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
1713 if (!target) {
1714 PyErr_NoMemory();
1715 return NULL;
1716 }
1717 }
1718 else {
1719 if (targetsize < targetlen) {
1720 PyErr_Format(PyExc_SystemError,
1721 "string is longer than the buffer");
1722 if (copy_null && 0 < targetsize)
1723 target[0] = 0;
1724 return NULL;
1725 }
1726 }
1727 if (kind != PyUnicode_4BYTE_KIND) {
1728 Py_ssize_t i;
1729 for (i = 0; i < len; i++)
1730 target[i] = PyUnicode_READ(kind, data, i);
1731 }
1732 else
1733 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
1734 if (copy_null)
1735 target[len] = 0;
1736 return target;
1737}
1738
1739Py_UCS4*
1740PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1741 int copy_null)
1742{
1743 if (target == NULL || targetsize < 1) {
1744 PyErr_BadInternalCall();
1745 return NULL;
1746 }
1747 return as_ucs4(string, target, targetsize, copy_null);
1748}
1749
1750Py_UCS4*
1751PyUnicode_AsUCS4Copy(PyObject *string)
1752{
1753 return as_ucs4(string, NULL, 0, 1);
1754}
1755
1756#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00001757
Alexander Belopolsky40018472011-02-26 01:02:56 +00001758PyObject *
1759PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001760{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001761 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +00001762 if (size == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001763 return PyUnicode_New(0, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00001764 PyErr_BadInternalCall();
1765 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001766 }
1767
Martin v. Löwis790465f2008-04-05 20:41:37 +00001768 if (size == -1) {
1769 size = wcslen(w);
1770 }
1771
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001772 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001773}
1774
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001775#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00001776
Walter Dörwald346737f2007-05-31 10:44:43 +00001777static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001778makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
1779 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00001780{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001781 *fmt++ = '%';
1782 if (width) {
1783 if (zeropad)
1784 *fmt++ = '0';
1785 fmt += sprintf(fmt, "%d", width);
1786 }
1787 if (precision)
1788 fmt += sprintf(fmt, ".%d", precision);
1789 if (longflag)
1790 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001791 else if (longlongflag) {
1792 /* longlongflag should only ever be nonzero on machines with
1793 HAVE_LONG_LONG defined */
1794#ifdef HAVE_LONG_LONG
1795 char *f = PY_FORMAT_LONG_LONG;
1796 while (*f)
1797 *fmt++ = *f++;
1798#else
1799 /* we shouldn't ever get here */
1800 assert(0);
1801 *fmt++ = 'l';
1802#endif
1803 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001804 else if (size_tflag) {
1805 char *f = PY_FORMAT_SIZE_T;
1806 while (*f)
1807 *fmt++ = *f++;
1808 }
1809 *fmt++ = c;
1810 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00001811}
1812
/* helper for PyUnicode_FromFormatV()

   `f` points at the '%' that opens a format specification.  The optional
   "width" and ".precision" digits and an optional length modifier ("l",
   "ll" when HAVE_LONG_LONG is defined, or "z") are consumed; a modifier
   is only recognised when directly followed by 'd', 'u' or 'i'.

   Each output pointer may be NULL, in which case that value is not
   reported.  The returned pointer designates the conversion character
   (or a position stepped back by one for the bogus inputs noted below). */

static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int width = 0;
    int precision = 0;
    int longflag = 0;
    int longlongflag = 0;
    int size_tflag = 0;

    /* step over the '%' */
    f++;

    /* width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    for (; Py_ISDIGIT((unsigned)*f); f++) {
        width = (width*10) + *f - '0';
    }
    if (*f == '.') {
        for (f++; Py_ISDIGIT((unsigned)*f); f++) {
            precision = (precision*10) + *f - '0';
        }
        if (*f == '%') {
            /* "%.3%s" => step back so that f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format such as "%.1" => step back so that f points to "1"
           rather than to the terminator */
        f--;
    }

    /* Length modifiers: %ld, %lu, %li, %lld, %llu, %lli and the size_t
       variants %zd, %zu, %zi. */
    switch (*f) {
    case 'l':
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            f += 1;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
#endif
        break;
    case 'z':
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            size_tflag = 1;
            f += 1;
        }
        break;
    default:
        break;
    }

    /* Report only what the caller asked for. */
    if (p_width != NULL) {
        *p_width = width;
    }
    if (p_precision != NULL) {
        *p_precision = precision;
    }
    if (p_longflag != NULL) {
        *p_longflag = longflag;
    }
    if (p_longlongflag != NULL) {
        *p_longlongflag = longlongflag;
    }
    if (p_size_tflag != NULL) {
        *p_size_tflag = size_tflag;
    }
    return f;
}
1877
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001878/* maximum number of characters required for output of %ld. 21 characters
1879 allows for 64-bit integers (in decimal) and an optional sign. */
1880#define MAX_LONG_CHARS 21
1881/* maximum number of characters required for output of %lld.
1882 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
1883 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
1884#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
1885
Walter Dörwaldd2034312007-05-18 16:29:38 +00001886PyObject *
1887PyUnicode_FromFormatV(const char *format, va_list vargs)
1888{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001889 va_list count;
1890 Py_ssize_t callcount = 0;
1891 PyObject **callresults = NULL;
1892 PyObject **callresult = NULL;
1893 Py_ssize_t n = 0;
1894 int width = 0;
1895 int precision = 0;
1896 int zeropad;
1897 const char* f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001898 PyUnicodeObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001899 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001900 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001901 Py_UCS4 maxchar = 127; /* result is ASCII by default */
1902 Py_UCS4 argmaxchar;
1903 Py_ssize_t numbersize = 0;
1904 char *numberresults = NULL;
1905 char *numberresult = NULL;
1906 Py_ssize_t i;
1907 int kind;
1908 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001909
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001910 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001911 /* step 1: count the number of %S/%R/%A/%s format specifications
1912 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
1913 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001914 * result in an array)
1915 * also esimate a upper bound for all the number formats in the string,
1916 * numbers will be formated in step 3 and be keept in a '\0'-separated
1917 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00001918 for (f = format; *f; f++) {
1919 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00001920 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001921 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
1922 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
1923 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
1924 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001925
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001926 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001927#ifdef HAVE_LONG_LONG
1928 if (longlongflag) {
1929 if (width < MAX_LONG_LONG_CHARS)
1930 width = MAX_LONG_LONG_CHARS;
1931 }
1932 else
1933#endif
1934 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
1935 including sign. Decimal takes the most space. This
1936 isn't enough for octal. If a width is specified we
1937 need more (which we allocate later). */
1938 if (width < MAX_LONG_CHARS)
1939 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001940
1941 /* account for the size + '\0' to separate numbers
1942 inside of the numberresults buffer */
1943 numbersize += (width + 1);
1944 }
1945 }
1946 else if ((unsigned char)*f > 127) {
1947 PyErr_Format(PyExc_ValueError,
1948 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
1949 "string, got a non-ASCII byte: 0x%02x",
1950 (unsigned char)*f);
1951 return NULL;
1952 }
1953 }
1954 /* step 2: allocate memory for the results of
1955 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
1956 if (callcount) {
1957 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
1958 if (!callresults) {
1959 PyErr_NoMemory();
1960 return NULL;
1961 }
1962 callresult = callresults;
1963 }
1964 /* step 2.5: allocate memory for the results of formating numbers */
1965 if (numbersize) {
1966 numberresults = PyObject_Malloc(numbersize);
1967 if (!numberresults) {
1968 PyErr_NoMemory();
1969 goto fail;
1970 }
1971 numberresult = numberresults;
1972 }
1973
1974 /* step 3: format numbers and figure out how large a buffer we need */
1975 for (f = format; *f; f++) {
1976 if (*f == '%') {
1977 const char* p;
1978 int longflag;
1979 int longlongflag;
1980 int size_tflag;
1981 int numprinted;
1982
1983 p = f;
1984 zeropad = (f[1] == '0');
1985 f = parse_format_flags(f, &width, &precision,
1986 &longflag, &longlongflag, &size_tflag);
1987 switch (*f) {
1988 case 'c':
1989 {
1990 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001991 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001992 n++;
1993 break;
1994 }
1995 case '%':
1996 n++;
1997 break;
1998 case 'i':
1999 case 'd':
2000 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2001 width, precision, *f);
2002 if (longflag)
2003 numprinted = sprintf(numberresult, fmt,
2004 va_arg(count, long));
2005#ifdef HAVE_LONG_LONG
2006 else if (longlongflag)
2007 numprinted = sprintf(numberresult, fmt,
2008 va_arg(count, PY_LONG_LONG));
2009#endif
2010 else if (size_tflag)
2011 numprinted = sprintf(numberresult, fmt,
2012 va_arg(count, Py_ssize_t));
2013 else
2014 numprinted = sprintf(numberresult, fmt,
2015 va_arg(count, int));
2016 n += numprinted;
2017 /* advance by +1 to skip over the '\0' */
2018 numberresult += (numprinted + 1);
2019 assert(*(numberresult - 1) == '\0');
2020 assert(*(numberresult - 2) != '\0');
2021 assert(numprinted >= 0);
2022 assert(numberresult <= numberresults + numbersize);
2023 break;
2024 case 'u':
2025 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2026 width, precision, 'u');
2027 if (longflag)
2028 numprinted = sprintf(numberresult, fmt,
2029 va_arg(count, unsigned long));
2030#ifdef HAVE_LONG_LONG
2031 else if (longlongflag)
2032 numprinted = sprintf(numberresult, fmt,
2033 va_arg(count, unsigned PY_LONG_LONG));
2034#endif
2035 else if (size_tflag)
2036 numprinted = sprintf(numberresult, fmt,
2037 va_arg(count, size_t));
2038 else
2039 numprinted = sprintf(numberresult, fmt,
2040 va_arg(count, unsigned int));
2041 n += numprinted;
2042 numberresult += (numprinted + 1);
2043 assert(*(numberresult - 1) == '\0');
2044 assert(*(numberresult - 2) != '\0');
2045 assert(numprinted >= 0);
2046 assert(numberresult <= numberresults + numbersize);
2047 break;
2048 case 'x':
2049 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2050 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2051 n += numprinted;
2052 numberresult += (numprinted + 1);
2053 assert(*(numberresult - 1) == '\0');
2054 assert(*(numberresult - 2) != '\0');
2055 assert(numprinted >= 0);
2056 assert(numberresult <= numberresults + numbersize);
2057 break;
2058 case 'p':
2059 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2060 /* %p is ill-defined: ensure leading 0x. */
2061 if (numberresult[1] == 'X')
2062 numberresult[1] = 'x';
2063 else if (numberresult[1] != 'x') {
2064 memmove(numberresult + 2, numberresult,
2065 strlen(numberresult) + 1);
2066 numberresult[0] = '0';
2067 numberresult[1] = 'x';
2068 numprinted += 2;
2069 }
2070 n += numprinted;
2071 numberresult += (numprinted + 1);
2072 assert(*(numberresult - 1) == '\0');
2073 assert(*(numberresult - 2) != '\0');
2074 assert(numprinted >= 0);
2075 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002076 break;
2077 case 's':
2078 {
2079 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002080 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002081 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
2082 if (!str)
2083 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002084 /* since PyUnicode_DecodeUTF8 returns already flexible
2085 unicode objects, there is no need to call ready on them */
2086 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002087 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002088 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002089 /* Remember the str and switch to the next slot */
2090 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002091 break;
2092 }
2093 case 'U':
2094 {
2095 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002096 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002097 if (PyUnicode_READY(obj) == -1)
2098 goto fail;
2099 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002100 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002101 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002102 break;
2103 }
2104 case 'V':
2105 {
2106 PyObject *obj = va_arg(count, PyObject *);
2107 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002108 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002109 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002110 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002111 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002112 if (PyUnicode_READY(obj) == -1)
2113 goto fail;
2114 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002115 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002116 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002117 *callresult++ = NULL;
2118 }
2119 else {
2120 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
2121 if (!str_obj)
2122 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002123 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002124 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002125 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002126 *callresult++ = str_obj;
2127 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002128 break;
2129 }
2130 case 'S':
2131 {
2132 PyObject *obj = va_arg(count, PyObject *);
2133 PyObject *str;
2134 assert(obj);
2135 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002136 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002137 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002138 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002139 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002140 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002141 /* Remember the str and switch to the next slot */
2142 *callresult++ = str;
2143 break;
2144 }
2145 case 'R':
2146 {
2147 PyObject *obj = va_arg(count, PyObject *);
2148 PyObject *repr;
2149 assert(obj);
2150 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002151 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002152 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002153 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002154 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002155 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002156 /* Remember the repr and switch to the next slot */
2157 *callresult++ = repr;
2158 break;
2159 }
2160 case 'A':
2161 {
2162 PyObject *obj = va_arg(count, PyObject *);
2163 PyObject *ascii;
2164 assert(obj);
2165 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002166 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002167 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002168 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002169 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002170 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002171 /* Remember the repr and switch to the next slot */
2172 *callresult++ = ascii;
2173 break;
2174 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002175 default:
2176 /* if we stumble upon an unknown
2177 formatting code, copy the rest of
2178 the format string to the output
2179 string. (we cannot just skip the
2180 code, since there's no way to know
2181 what's in the argument list) */
2182 n += strlen(p);
2183 goto expand;
2184 }
2185 } else
2186 n++;
2187 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002188 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002189 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002190 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002191 we don't have to resize the string.
2192 There can be no errors beyond this point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002193 string = (PyUnicodeObject *)PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002194 if (!string)
2195 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002196 kind = PyUnicode_KIND(string);
2197 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002198 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002199 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002200
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002201 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002202 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002203 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002204
2205 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002206 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2207 /* checking for == because the last argument could be a empty
2208 string, which causes i to point to end, the assert at the end of
2209 the loop */
2210 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002211
Benjamin Peterson14339b62009-01-31 16:36:08 +00002212 switch (*f) {
2213 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002214 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002215 const int ordinal = va_arg(vargs, int);
2216 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002217 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002218 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002219 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002220 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002221 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002222 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002223 case 'p':
2224 /* unused, since we already have the result */
2225 if (*f == 'p')
2226 (void) va_arg(vargs, void *);
2227 else
2228 (void) va_arg(vargs, int);
2229 /* extract the result from numberresults and append. */
2230 for (; *numberresult; ++i, ++numberresult)
2231 PyUnicode_WRITE(kind, data, i, *numberresult);
2232 /* skip over the separating '\0' */
2233 assert(*numberresult == '\0');
2234 numberresult++;
2235 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002236 break;
2237 case 's':
2238 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002239 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002240 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002241 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002242 size = PyUnicode_GET_LENGTH(*callresult);
2243 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002244 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2245 *callresult, 0,
2246 size) < 0)
2247 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002248 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002249 /* We're done with the unicode()/repr() => forget it */
2250 Py_DECREF(*callresult);
2251 /* switch to next unicode()/repr() result */
2252 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002253 break;
2254 }
2255 case 'U':
2256 {
2257 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002258 Py_ssize_t size;
2259 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2260 size = PyUnicode_GET_LENGTH(obj);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002261 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2262 obj, 0,
2263 size) < 0)
2264 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002265 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002266 break;
2267 }
2268 case 'V':
2269 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002270 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002271 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002272 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002273 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002274 size = PyUnicode_GET_LENGTH(obj);
2275 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002276 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2277 obj, 0,
2278 size) < 0)
2279 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002280 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002281 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002282 size = PyUnicode_GET_LENGTH(*callresult);
2283 assert(PyUnicode_KIND(*callresult) <=
2284 PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002285 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2286 *callresult,
2287 0, size) < 0)
2288 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002289 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002290 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002291 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002292 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002293 break;
2294 }
2295 case 'S':
2296 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002297 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002298 {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002299 /* unused, since we already have the result */
2300 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002301 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002302 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2303 *callresult, 0,
2304 PyUnicode_GET_LENGTH(*callresult)) < 0)
2305 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002306 i += PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002307 /* We're done with the unicode()/repr() => forget it */
2308 Py_DECREF(*callresult);
2309 /* switch to next unicode()/repr() result */
2310 ++callresult;
2311 break;
2312 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002313 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002314 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002315 break;
2316 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002317 for (; *p; ++p, ++i)
2318 PyUnicode_WRITE(kind, data, i, *p);
2319 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002320 goto end;
2321 }
Victor Stinner1205f272010-09-11 00:54:47 +00002322 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002323 else {
2324 assert(i < PyUnicode_GET_LENGTH(string));
2325 PyUnicode_WRITE(kind, data, i++, *f);
2326 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002327 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002328 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002329
Benjamin Peterson29060642009-01-31 22:14:21 +00002330 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002331 if (callresults)
2332 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002333 if (numberresults)
2334 PyObject_Free(numberresults);
2335 return (PyObject *)string;
Benjamin Peterson29060642009-01-31 22:14:21 +00002336 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002337 if (callresults) {
2338 PyObject **callresult2 = callresults;
2339 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002340 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002341 ++callresult2;
2342 }
2343 PyObject_Free(callresults);
2344 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002345 if (numberresults)
2346 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002347 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002348}
2349
Walter Dörwaldd2034312007-05-18 16:29:38 +00002350PyObject *
2351PyUnicode_FromFormat(const char *format, ...)
2352{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002353 PyObject* ret;
2354 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002355
2356#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002357 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002358#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002359 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002360#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002361 ret = PyUnicode_FromFormatV(format, vargs);
2362 va_end(vargs);
2363 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002364}
2365
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002366#ifdef HAVE_WCHAR_H
2367
Victor Stinner5593d8a2010-10-02 11:11:27 +00002368/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2369 convert a Unicode object to a wide character string.
2370
Victor Stinnerd88d9832011-09-06 02:00:05 +02002371 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002372 character) required to convert the unicode object. Ignore size argument.
2373
Victor Stinnerd88d9832011-09-06 02:00:05 +02002374 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002375 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002376 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002377static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00002378unicode_aswidechar(PyUnicodeObject *unicode,
2379 wchar_t *w,
2380 Py_ssize_t size)
2381{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002382 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002383 const wchar_t *wstr;
2384
2385 wstr = PyUnicode_AsUnicodeAndSize((PyObject *)unicode, &res);
2386 if (wstr == NULL)
2387 return -1;
2388
Victor Stinner5593d8a2010-10-02 11:11:27 +00002389 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002390 if (size > res)
2391 size = res + 1;
2392 else
2393 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002394 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002395 return res;
2396 }
2397 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002398 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002399}
2400
2401Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002402PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002403 wchar_t *w,
2404 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002405{
2406 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002407 PyErr_BadInternalCall();
2408 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002409 }
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002410 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002411}
2412
Victor Stinner137c34c2010-09-29 10:25:54 +00002413wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002414PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002415 Py_ssize_t *size)
2416{
2417 wchar_t* buffer;
2418 Py_ssize_t buflen;
2419
2420 if (unicode == NULL) {
2421 PyErr_BadInternalCall();
2422 return NULL;
2423 }
2424
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002425 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002426 if (buflen == -1)
2427 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002428 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002429 PyErr_NoMemory();
2430 return NULL;
2431 }
2432
Victor Stinner137c34c2010-09-29 10:25:54 +00002433 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2434 if (buffer == NULL) {
2435 PyErr_NoMemory();
2436 return NULL;
2437 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002438 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002439 if (buflen == -1)
2440 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002441 if (size != NULL)
2442 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002443 return buffer;
2444}
2445
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002446#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002447
Alexander Belopolsky40018472011-02-26 01:02:56 +00002448PyObject *
2449PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002450{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002451 PyObject *v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002452 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002453 PyErr_SetString(PyExc_ValueError,
2454 "chr() arg not in range(0x110000)");
2455 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002456 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002457
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002458 if (ordinal < 256)
2459 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002460
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002461 v = PyUnicode_New(1, ordinal);
2462 if (v == NULL)
2463 return NULL;
2464 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
2465 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002466}
2467
Alexander Belopolsky40018472011-02-26 01:02:56 +00002468PyObject *
2469PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002470{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002471 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002472 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002473 if (PyUnicode_CheckExact(obj)) {
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002474 if (PyUnicode_READY(obj))
2475 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002476 Py_INCREF(obj);
2477 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002478 }
2479 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002480 /* For a Unicode subtype that's not a Unicode object,
2481 return a true Unicode object with the same data. */
Victor Stinner2219e0a2011-10-01 01:16:59 +02002482 return PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002483 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002484 PyErr_Format(PyExc_TypeError,
2485 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002486 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002487 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002488}
2489
Alexander Belopolsky40018472011-02-26 01:02:56 +00002490PyObject *
2491PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002492 const char *encoding,
2493 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002494{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002495 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002496 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002497
Guido van Rossumd57fd912000-03-10 22:53:23 +00002498 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002499 PyErr_BadInternalCall();
2500 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002501 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002502
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002503 /* Decoding bytes objects is the most common case and should be fast */
2504 if (PyBytes_Check(obj)) {
2505 if (PyBytes_GET_SIZE(obj) == 0) {
2506 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002507 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002508 }
2509 else {
2510 v = PyUnicode_Decode(
2511 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2512 encoding, errors);
2513 }
2514 return v;
2515 }
2516
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002517 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002518 PyErr_SetString(PyExc_TypeError,
2519 "decoding str is not supported");
2520 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002521 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002522
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002523 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2524 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2525 PyErr_Format(PyExc_TypeError,
2526 "coercing to str: need bytes, bytearray "
2527 "or buffer-like object, %.80s found",
2528 Py_TYPE(obj)->tp_name);
2529 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002530 }
Tim Petersced69f82003-09-16 20:30:58 +00002531
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002532 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002533 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002534 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002535 }
Tim Petersced69f82003-09-16 20:30:58 +00002536 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002537 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002538
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002539 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002540 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002541}
2542
/* Write a normalized copy of encoding into lower: upper-case letters are
   lowered and every '_' becomes '-' (so that e.g. UTF_8 is recognized).
   lower_len is the capacity of lower, terminator included.
   Returns 1 on success, 0 if encoding does not fit (it is longer than
   lower_len-1 characters); in that case lower is not null-terminated. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src;
    char *dst = lower;
    /* Last slot of the output buffer, reserved for the terminator. */
    char *const limit = &lower[lower_len - 1];

    for (src = encoding; *src; src++) {
        char ch;

        if (dst == limit)
            return 0;

        if (Py_ISUPPER(*src))
            ch = Py_TOLOWER(*src);
        else if (*src == '_')
            ch = '-';
        else
            ch = *src;
        *dst++ = ch;
    }
    *dst = '\0';
    return 1;
}
2575
Alexander Belopolsky40018472011-02-26 01:02:56 +00002576PyObject *
2577PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002578 Py_ssize_t size,
2579 const char *encoding,
2580 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002581{
2582 PyObject *buffer = NULL, *unicode;
2583 Py_buffer info;
2584 char lower[11]; /* Enough for any encoding shortcut */
2585
2586 if (encoding == NULL)
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002587 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00002588
2589 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002590 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002591 if ((strcmp(lower, "utf-8") == 0) ||
2592 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002593 return PyUnicode_DecodeUTF8(s, size, errors);
2594 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002595 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002596 (strcmp(lower, "iso-8859-1") == 0))
2597 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002598#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002599 else if (strcmp(lower, "mbcs") == 0)
2600 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002601#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002602 else if (strcmp(lower, "ascii") == 0)
2603 return PyUnicode_DecodeASCII(s, size, errors);
2604 else if (strcmp(lower, "utf-16") == 0)
2605 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2606 else if (strcmp(lower, "utf-32") == 0)
2607 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2608 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002609
2610 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002611 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002612 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002613 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002614 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002615 if (buffer == NULL)
2616 goto onError;
2617 unicode = PyCodec_Decode(buffer, encoding, errors);
2618 if (unicode == NULL)
2619 goto onError;
2620 if (!PyUnicode_Check(unicode)) {
2621 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002622 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002623 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002624 Py_DECREF(unicode);
2625 goto onError;
2626 }
2627 Py_DECREF(buffer);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02002628 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002629 Py_DECREF(unicode);
2630 return NULL;
2631 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002632 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002633
Benjamin Peterson29060642009-01-31 22:14:21 +00002634 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002635 Py_XDECREF(buffer);
2636 return NULL;
2637}
2638
Alexander Belopolsky40018472011-02-26 01:02:56 +00002639PyObject *
2640PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002641 const char *encoding,
2642 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002643{
2644 PyObject *v;
2645
2646 if (!PyUnicode_Check(unicode)) {
2647 PyErr_BadArgument();
2648 goto onError;
2649 }
2650
2651 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002652 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002653
2654 /* Decode via the codec registry */
2655 v = PyCodec_Decode(unicode, encoding, errors);
2656 if (v == NULL)
2657 goto onError;
2658 return v;
2659
Benjamin Peterson29060642009-01-31 22:14:21 +00002660 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002661 return NULL;
2662}
2663
Alexander Belopolsky40018472011-02-26 01:02:56 +00002664PyObject *
2665PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002666 const char *encoding,
2667 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002668{
2669 PyObject *v;
2670
2671 if (!PyUnicode_Check(unicode)) {
2672 PyErr_BadArgument();
2673 goto onError;
2674 }
2675
2676 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002677 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002678
2679 /* Decode via the codec registry */
2680 v = PyCodec_Decode(unicode, encoding, errors);
2681 if (v == NULL)
2682 goto onError;
2683 if (!PyUnicode_Check(v)) {
2684 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002685 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002686 Py_TYPE(v)->tp_name);
2687 Py_DECREF(v);
2688 goto onError;
2689 }
2690 return v;
2691
Benjamin Peterson29060642009-01-31 22:14:21 +00002692 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002693 return NULL;
2694}
2695
Alexander Belopolsky40018472011-02-26 01:02:56 +00002696PyObject *
2697PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002698 Py_ssize_t size,
2699 const char *encoding,
2700 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002701{
2702 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002703
Guido van Rossumd57fd912000-03-10 22:53:23 +00002704 unicode = PyUnicode_FromUnicode(s, size);
2705 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002706 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002707 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
2708 Py_DECREF(unicode);
2709 return v;
2710}
2711
Alexander Belopolsky40018472011-02-26 01:02:56 +00002712PyObject *
2713PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002714 const char *encoding,
2715 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002716{
2717 PyObject *v;
2718
2719 if (!PyUnicode_Check(unicode)) {
2720 PyErr_BadArgument();
2721 goto onError;
2722 }
2723
2724 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002725 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002726
2727 /* Encode via the codec registry */
2728 v = PyCodec_Encode(unicode, encoding, errors);
2729 if (v == NULL)
2730 goto onError;
2731 return v;
2732
Benjamin Peterson29060642009-01-31 22:14:21 +00002733 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002734 return NULL;
2735}
2736
Victor Stinnerad158722010-10-27 00:25:46 +00002737PyObject *
2738PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00002739{
Victor Stinner99b95382011-07-04 14:23:54 +02002740#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002741 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2742 PyUnicode_GET_SIZE(unicode),
2743 NULL);
2744#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002745 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00002746#else
Victor Stinner793b5312011-04-27 00:24:21 +02002747 PyInterpreterState *interp = PyThreadState_GET()->interp;
2748 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2749 cannot use it to encode and decode filenames before it is loaded. Load
2750 the Python codec requires to encode at least its own filename. Use the C
2751 version of the locale codec until the codec registry is initialized and
2752 the Python codec is loaded.
2753
2754 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2755 cannot only rely on it: check also interp->fscodec_initialized for
2756 subinterpreters. */
2757 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00002758 return PyUnicode_AsEncodedString(unicode,
2759 Py_FileSystemDefaultEncoding,
2760 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00002761 }
2762 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002763 /* locale encoding with surrogateescape */
2764 wchar_t *wchar;
2765 char *bytes;
2766 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00002767 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002768
2769 wchar = PyUnicode_AsWideCharString(unicode, NULL);
2770 if (wchar == NULL)
2771 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002772 bytes = _Py_wchar2char(wchar, &error_pos);
2773 if (bytes == NULL) {
2774 if (error_pos != (size_t)-1) {
2775 char *errmsg = strerror(errno);
2776 PyObject *exc = NULL;
2777 if (errmsg == NULL)
2778 errmsg = "Py_wchar2char() failed";
2779 raise_encode_exception(&exc,
2780 "filesystemencoding",
2781 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
2782 error_pos, error_pos+1,
2783 errmsg);
2784 Py_XDECREF(exc);
2785 }
2786 else
2787 PyErr_NoMemory();
2788 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002789 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002790 }
2791 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002792
2793 bytes_obj = PyBytes_FromString(bytes);
2794 PyMem_Free(bytes);
2795 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00002796 }
Victor Stinnerad158722010-10-27 00:25:46 +00002797#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00002798}
2799
Alexander Belopolsky40018472011-02-26 01:02:56 +00002800PyObject *
2801PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002802 const char *encoding,
2803 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002804{
2805 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00002806 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00002807
Guido van Rossumd57fd912000-03-10 22:53:23 +00002808 if (!PyUnicode_Check(unicode)) {
2809 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002810 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002811 }
Fred Drakee4315f52000-05-09 19:53:39 +00002812
Victor Stinner2f283c22011-03-02 01:21:46 +00002813 if (encoding == NULL) {
2814 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002815 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002816 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002817 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner2f283c22011-03-02 01:21:46 +00002818 }
Fred Drakee4315f52000-05-09 19:53:39 +00002819
2820 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002821 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002822 if ((strcmp(lower, "utf-8") == 0) ||
2823 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00002824 {
Victor Stinner2f283c22011-03-02 01:21:46 +00002825 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002826 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002827 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002828 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00002829 }
Victor Stinner37296e82010-06-10 13:36:23 +00002830 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002831 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002832 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002833 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002834#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002835 else if (strcmp(lower, "mbcs") == 0)
2836 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2837 PyUnicode_GET_SIZE(unicode),
2838 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002839#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002840 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002841 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00002842 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002843
2844 /* Encode via the codec registry */
2845 v = PyCodec_Encode(unicode, encoding, errors);
2846 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002847 return NULL;
2848
2849 /* The normal path */
2850 if (PyBytes_Check(v))
2851 return v;
2852
2853 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002854 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002855 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002856 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002857
2858 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
2859 "encoder %s returned bytearray instead of bytes",
2860 encoding);
2861 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002862 Py_DECREF(v);
2863 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002864 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002865
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002866 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
2867 Py_DECREF(v);
2868 return b;
2869 }
2870
2871 PyErr_Format(PyExc_TypeError,
2872 "encoder did not return a bytes object (type=%.400s)",
2873 Py_TYPE(v)->tp_name);
2874 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002875 return NULL;
2876}
2877
Alexander Belopolsky40018472011-02-26 01:02:56 +00002878PyObject *
2879PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002880 const char *encoding,
2881 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002882{
2883 PyObject *v;
2884
2885 if (!PyUnicode_Check(unicode)) {
2886 PyErr_BadArgument();
2887 goto onError;
2888 }
2889
2890 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002891 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002892
2893 /* Encode via the codec registry */
2894 v = PyCodec_Encode(unicode, encoding, errors);
2895 if (v == NULL)
2896 goto onError;
2897 if (!PyUnicode_Check(v)) {
2898 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002899 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002900 Py_TYPE(v)->tp_name);
2901 Py_DECREF(v);
2902 goto onError;
2903 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002904 return v;
Tim Petersced69f82003-09-16 20:30:58 +00002905
Benjamin Peterson29060642009-01-31 22:14:21 +00002906 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002907 return NULL;
2908}
2909
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002910PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00002911PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002912 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00002913 return PyUnicode_DecodeFSDefaultAndSize(s, size);
2914}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002915
Christian Heimes5894ba72007-11-04 11:43:14 +00002916PyObject*
2917PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
2918{
Victor Stinner99b95382011-07-04 14:23:54 +02002919#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002920 return PyUnicode_DecodeMBCS(s, size, NULL);
2921#elif defined(__APPLE__)
2922 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
2923#else
Victor Stinner793b5312011-04-27 00:24:21 +02002924 PyInterpreterState *interp = PyThreadState_GET()->interp;
2925 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2926 cannot use it to encode and decode filenames before it is loaded. Load
2927 the Python codec requires to encode at least its own filename. Use the C
2928 version of the locale codec until the codec registry is initialized and
2929 the Python codec is loaded.
2930
2931 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2932 cannot only rely on it: check also interp->fscodec_initialized for
2933 subinterpreters. */
2934 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002935 return PyUnicode_Decode(s, size,
2936 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00002937 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002938 }
2939 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002940 /* locale encoding with surrogateescape */
2941 wchar_t *wchar;
2942 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00002943 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002944
2945 if (s[size] != '\0' || size != strlen(s)) {
2946 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
2947 return NULL;
2948 }
2949
Victor Stinner168e1172010-10-16 23:16:16 +00002950 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002951 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00002952 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002953
Victor Stinner168e1172010-10-16 23:16:16 +00002954 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002955 PyMem_Free(wchar);
2956 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002957 }
Victor Stinnerad158722010-10-27 00:25:46 +00002958#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002959}
2960
Martin v. Löwis011e8422009-05-05 04:43:17 +00002961
2962int
2963PyUnicode_FSConverter(PyObject* arg, void* addr)
2964{
2965 PyObject *output = NULL;
2966 Py_ssize_t size;
2967 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00002968 if (arg == NULL) {
2969 Py_DECREF(*(PyObject**)addr);
2970 return 1;
2971 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00002972 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00002973 output = arg;
2974 Py_INCREF(output);
2975 }
2976 else {
2977 arg = PyUnicode_FromObject(arg);
2978 if (!arg)
2979 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00002980 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00002981 Py_DECREF(arg);
2982 if (!output)
2983 return 0;
2984 if (!PyBytes_Check(output)) {
2985 Py_DECREF(output);
2986 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
2987 return 0;
2988 }
2989 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00002990 size = PyBytes_GET_SIZE(output);
2991 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00002992 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05002993 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00002994 Py_DECREF(output);
2995 return 0;
2996 }
2997 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00002998 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00002999}
3000
3001
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003002int
3003PyUnicode_FSDecoder(PyObject* arg, void* addr)
3004{
3005 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003006 if (arg == NULL) {
3007 Py_DECREF(*(PyObject**)addr);
3008 return 1;
3009 }
3010 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003011 if (PyUnicode_READY(arg))
3012 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003013 output = arg;
3014 Py_INCREF(output);
3015 }
3016 else {
3017 arg = PyBytes_FromObject(arg);
3018 if (!arg)
3019 return 0;
3020 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3021 PyBytes_GET_SIZE(arg));
3022 Py_DECREF(arg);
3023 if (!output)
3024 return 0;
3025 if (!PyUnicode_Check(output)) {
3026 Py_DECREF(output);
3027 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3028 return 0;
3029 }
3030 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003031 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
3032 PyUnicode_GET_LENGTH(output), 0, 1)) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003033 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3034 Py_DECREF(output);
3035 return 0;
3036 }
3037 *(PyObject**)addr = output;
3038 return Py_CLEANUP_SUPPORTED;
3039}
3040
3041
Martin v. Löwis5b222132007-06-10 09:51:05 +00003042char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003043PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003044{
Christian Heimesf3863112007-11-22 07:46:41 +00003045 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003046 PyUnicodeObject *u = (PyUnicodeObject *)unicode;
3047
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003048 if (!PyUnicode_Check(unicode)) {
3049 PyErr_BadArgument();
3050 return NULL;
3051 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003052 if (PyUnicode_READY(u) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003053 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003054
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003055 if (PyUnicode_UTF8(unicode) == NULL) {
3056 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003057 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3058 if (bytes == NULL)
3059 return NULL;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003060 _PyUnicode_UTF8(u) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3061 if (_PyUnicode_UTF8(u) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003062 Py_DECREF(bytes);
3063 return NULL;
3064 }
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003065 _PyUnicode_UTF8_LENGTH(u) = PyBytes_GET_SIZE(bytes);
3066 Py_MEMCPY(_PyUnicode_UTF8(u), PyBytes_AS_STRING(bytes), _PyUnicode_UTF8_LENGTH(u) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003067 Py_DECREF(bytes);
3068 }
3069
3070 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003071 *psize = PyUnicode_UTF8_LENGTH(unicode);
3072 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003073}
3074
3075char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003076PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003077{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003078 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3079}
3080
3081#ifdef Py_DEBUG
3082int unicode_as_unicode_calls = 0;
3083#endif
3084
3085
3086Py_UNICODE *
3087PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3088{
3089 PyUnicodeObject *u;
3090 const unsigned char *one_byte;
3091#if SIZEOF_WCHAR_T == 4
3092 const Py_UCS2 *two_bytes;
3093#else
3094 const Py_UCS4 *four_bytes;
3095 const Py_UCS4 *ucs4_end;
3096 Py_ssize_t num_surrogates;
3097#endif
3098 wchar_t *w;
3099 wchar_t *wchar_end;
3100
3101 if (!PyUnicode_Check(unicode)) {
3102 PyErr_BadArgument();
3103 return NULL;
3104 }
3105 u = (PyUnicodeObject*)unicode;
3106 if (_PyUnicode_WSTR(u) == NULL) {
3107 /* Non-ASCII compact unicode object */
3108 assert(_PyUnicode_KIND(u) != 0);
3109 assert(PyUnicode_IS_READY(u));
3110
3111#ifdef Py_DEBUG
3112 ++unicode_as_unicode_calls;
3113#endif
3114
3115 if (PyUnicode_KIND(u) == PyUnicode_4BYTE_KIND) {
3116#if SIZEOF_WCHAR_T == 2
3117 four_bytes = PyUnicode_4BYTE_DATA(u);
3118 ucs4_end = four_bytes + _PyUnicode_LENGTH(u);
3119 num_surrogates = 0;
3120
3121 for (; four_bytes < ucs4_end; ++four_bytes) {
3122 if (*four_bytes > 0xFFFF)
3123 ++num_surrogates;
3124 }
3125
3126 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(
3127 sizeof(wchar_t) * (_PyUnicode_LENGTH(u) + 1 + num_surrogates));
3128 if (!_PyUnicode_WSTR(u)) {
3129 PyErr_NoMemory();
3130 return NULL;
3131 }
3132 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u) + num_surrogates;
3133
3134 w = _PyUnicode_WSTR(u);
3135 wchar_end = w + _PyUnicode_WSTR_LENGTH(u);
3136 four_bytes = PyUnicode_4BYTE_DATA(u);
3137 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3138 if (*four_bytes > 0xFFFF) {
3139 /* encode surrogate pair in this case */
3140 *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10);
3141 *w = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF);
3142 }
3143 else
3144 *w = *four_bytes;
3145
3146 if (w > wchar_end) {
3147 assert(0 && "Miscalculated string end");
3148 }
3149 }
3150 *w = 0;
3151#else
3152 /* sizeof(wchar_t) == 4 */
3153 Py_FatalError("Impossible unicode object state, wstr and str "
3154 "should share memory already.");
3155 return NULL;
3156#endif
3157 }
3158 else {
3159 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3160 (_PyUnicode_LENGTH(u) + 1));
3161 if (!_PyUnicode_WSTR(u)) {
3162 PyErr_NoMemory();
3163 return NULL;
3164 }
3165 if (!PyUnicode_IS_COMPACT_ASCII(u))
3166 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u);
3167 w = _PyUnicode_WSTR(u);
3168 wchar_end = w + _PyUnicode_LENGTH(u);
3169
3170 if (PyUnicode_KIND(u) == PyUnicode_1BYTE_KIND) {
3171 one_byte = PyUnicode_1BYTE_DATA(u);
3172 for (; w < wchar_end; ++one_byte, ++w)
3173 *w = *one_byte;
3174 /* null-terminate the wstr */
3175 *w = 0;
3176 }
3177 else if (PyUnicode_KIND(u) == PyUnicode_2BYTE_KIND) {
3178#if SIZEOF_WCHAR_T == 4
3179 two_bytes = PyUnicode_2BYTE_DATA(u);
3180 for (; w < wchar_end; ++two_bytes, ++w)
3181 *w = *two_bytes;
3182 /* null-terminate the wstr */
3183 *w = 0;
3184#else
3185 /* sizeof(wchar_t) == 2 */
3186 PyObject_FREE(_PyUnicode_WSTR(u));
3187 _PyUnicode_WSTR(u) = NULL;
3188 Py_FatalError("Impossible unicode object state, wstr "
3189 "and str should share memory already.");
3190 return NULL;
3191#endif
3192 }
3193 else {
3194 assert(0 && "This should never happen.");
3195 }
3196 }
3197 }
3198 if (size != NULL)
3199 *size = PyUnicode_WSTR_LENGTH(u);
3200 return _PyUnicode_WSTR(u);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003201}
3202
Alexander Belopolsky40018472011-02-26 01:02:56 +00003203Py_UNICODE *
3204PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003205{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003206 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003207}
3208
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003209
Alexander Belopolsky40018472011-02-26 01:02:56 +00003210Py_ssize_t
3211PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003212{
3213 if (!PyUnicode_Check(unicode)) {
3214 PyErr_BadArgument();
3215 goto onError;
3216 }
3217 return PyUnicode_GET_SIZE(unicode);
3218
Benjamin Peterson29060642009-01-31 22:14:21 +00003219 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003220 return -1;
3221}
3222
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003223Py_ssize_t
3224PyUnicode_GetLength(PyObject *unicode)
3225{
Victor Stinner5a706cf2011-10-02 00:36:53 +02003226 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003227 PyErr_BadArgument();
3228 return -1;
3229 }
3230
3231 return PyUnicode_GET_LENGTH(unicode);
3232}
3233
3234Py_UCS4
3235PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3236{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003237 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3238 PyErr_BadArgument();
3239 return (Py_UCS4)-1;
3240 }
3241 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3242 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003243 return (Py_UCS4)-1;
3244 }
3245 return PyUnicode_READ_CHAR(unicode, index);
3246}
3247
3248int
3249PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3250{
3251 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003252 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003253 return -1;
3254 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02003255 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3256 PyErr_SetString(PyExc_IndexError, "string index out of range");
3257 return -1;
3258 }
3259 if (_PyUnicode_Dirty(unicode))
3260 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003261 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3262 index, ch);
3263 return 0;
3264}
3265
/* Return the name of the default encoding, which is always "utf-8".  The
   returned string is static storage and must not be modified or freed. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3271
Victor Stinner554f3f02010-06-16 23:33:54 +00003272/* create or adjust a UnicodeDecodeError */
3273static void
3274make_decode_exception(PyObject **exceptionObject,
3275 const char *encoding,
3276 const char *input, Py_ssize_t length,
3277 Py_ssize_t startpos, Py_ssize_t endpos,
3278 const char *reason)
3279{
3280 if (*exceptionObject == NULL) {
3281 *exceptionObject = PyUnicodeDecodeError_Create(
3282 encoding, input, length, startpos, endpos, reason);
3283 }
3284 else {
3285 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3286 goto onError;
3287 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3288 goto onError;
3289 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3290 goto onError;
3291 }
3292 return;
3293
3294onError:
3295 Py_DECREF(*exceptionObject);
3296 *exceptionObject = NULL;
3297}
3298
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003299/* error handling callback helper:
3300 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00003301 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003302 and adjust various state variables.
3303 return 0 on success, -1 on error
3304*/
3305
Alexander Belopolsky40018472011-02-26 01:02:56 +00003306static int
3307unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003308 const char *encoding, const char *reason,
3309 const char **input, const char **inend, Py_ssize_t *startinpos,
3310 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
3311 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003312{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003313 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003314
3315 PyObject *restuple = NULL;
3316 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003317 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003318 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003319 Py_ssize_t requiredsize;
3320 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003321 const Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003322 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003323 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003324 int res = -1;
3325
3326 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003327 *errorHandler = PyCodec_LookupError(errors);
3328 if (*errorHandler == NULL)
3329 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003330 }
3331
Victor Stinner554f3f02010-06-16 23:33:54 +00003332 make_decode_exception(exceptionObject,
3333 encoding,
3334 *input, *inend - *input,
3335 *startinpos, *endinpos,
3336 reason);
3337 if (*exceptionObject == NULL)
3338 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003339
3340 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3341 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003342 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003343 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00003344 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00003345 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003346 }
3347 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003348 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003349
3350 /* Copy back the bytes variables, which might have been modified by the
3351 callback */
3352 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3353 if (!inputobj)
3354 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00003355 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003356 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00003357 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003358 *input = PyBytes_AS_STRING(inputobj);
3359 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003360 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00003361 /* we can DECREF safely, as the exception has another reference,
3362 so the object won't go away. */
3363 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003364
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003365 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003366 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003367 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003368 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3369 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003370 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003371
3372 /* need more space? (at least enough for what we
3373 have+the replacement+the rest of the string (starting
3374 at the new input position), so we won't have to check space
3375 when there are no errors in the rest of the string) */
3376 repptr = PyUnicode_AS_UNICODE(repunicode);
3377 repsize = PyUnicode_GET_SIZE(repunicode);
3378 requiredsize = *outpos + repsize + insize-newpos;
3379 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003380 if (requiredsize<2*outsize)
3381 requiredsize = 2*outsize;
Victor Stinnerfe226c02011-10-03 03:52:20 +02003382 if (PyUnicode_Resize((PyObject**)output, requiredsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003383 goto onError;
3384 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003385 }
3386 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003387 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003388 Py_UNICODE_COPY(*outptr, repptr, repsize);
3389 *outptr += repsize;
3390 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003391
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003392 /* we made it! */
3393 res = 0;
3394
Benjamin Peterson29060642009-01-31 22:14:21 +00003395 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003396 Py_XDECREF(restuple);
3397 return res;
3398}
3399
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003400/* --- UTF-7 Codec -------------------------------------------------------- */
3401
Antoine Pitrou244651a2009-05-04 18:56:13 +00003402/* See RFC2152 for details. We encode conservatively and decode liberally. */
3403
3404/* Three simple macros defining base-64. */
3405
/* IS_BASE64: true when c belongs to the base-64 alphabet
   (A-Z, a-z, 0-9, '+' and '/'). */

#define IS_BASE64(c)                        \
    (('A' <= (c) && (c) <= 'Z') ||          \
     ('a' <= (c) && (c) <= 'z') ||          \
     ('0' <= (c) && (c) <= '9') ||          \
     '+' == (c) || '/' == (c))

/* FROM_BASE64: the 6-bit value (0..63) of c, which must already be known to
   be a base-64 character; anything that matches no other case maps to 63. */

#define FROM_BASE64(c)                                  \
    (('A' <= (c) && (c) <= 'Z') ? (c) - 'A' :           \
     ('a' <= (c) && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ('0' <= (c) && (c) <= '9') ? (c) - '0' + 52 :      \
     '+' == (c) ? 62 : 63)

/* TO_BASE64: the base-64 character that encodes the low 6 bits of n. */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: true when a byte met in a UTF-7 string outside a base-64
 * section stands for itself.  Decoding is permissive: every ASCII byte does,
 * except the '+' that opens a base-64 section. */

#define DECODE_DIRECT(c)                \
    ((c) < 128 && '+' != (c))
3434
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array, indexed by the ASCII code
 * (0..127), identifies these different sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, hence const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
3468
/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c:        character code; only 1..127 can ever be encoded directly.
 * directO:  non-zero to encode "Set O" characters (category 1) directly.
 * directWS: non-zero to encode whitespace (category 2) directly.
 *
 * Every argument is parenthesized in the expansion so that compound
 * expressions such as `a || b` keep their meaning when passed as flags. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003480
Alexander Belopolsky40018472011-02-26 01:02:56 +00003481PyObject *
3482PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003483 Py_ssize_t size,
3484 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003485{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003486 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3487}
3488
Antoine Pitrou244651a2009-05-04 18:56:13 +00003489/* The decoder. The only state we preserve is our read position,
3490 * i.e. how many characters we have consumed. So if we end in the
3491 * middle of a shift sequence we have to back off the read position
3492 * and the output to the beginning of the sequence, otherwise we lose
3493 * all the shift state (seen bits, number of bits seen, high
3494 * surrogate). */
3495
Alexander Belopolsky40018472011-02-26 01:02:56 +00003496PyObject *
3497PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003498 Py_ssize_t size,
3499 const char *errors,
3500 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003501{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003502 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003503 Py_ssize_t startinpos;
3504 Py_ssize_t endinpos;
3505 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003506 const char *e;
3507 PyUnicodeObject *unicode;
3508 Py_UNICODE *p;
3509 const char *errmsg = "";
3510 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003511 Py_UNICODE *shiftOutStart;
3512 unsigned int base64bits = 0;
3513 unsigned long base64buffer = 0;
3514 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003515 PyObject *errorHandler = NULL;
3516 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003517
3518 unicode = _PyUnicode_New(size);
3519 if (!unicode)
3520 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003521 if (size == 0) {
3522 if (consumed)
3523 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003524 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003525 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003526
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003527 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitrou244651a2009-05-04 18:56:13 +00003528 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003529 e = s + size;
3530
3531 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003532 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00003533 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00003534 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003535
Antoine Pitrou244651a2009-05-04 18:56:13 +00003536 if (inShift) { /* in a base-64 section */
3537 if (IS_BASE64(ch)) { /* consume a base-64 character */
3538 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3539 base64bits += 6;
3540 s++;
3541 if (base64bits >= 16) {
3542 /* we have enough bits for a UTF-16 value */
3543 Py_UNICODE outCh = (Py_UNICODE)
3544 (base64buffer >> (base64bits-16));
3545 base64bits -= 16;
3546 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3547 if (surrogate) {
3548 /* expecting a second surrogate */
3549 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3550#ifdef Py_UNICODE_WIDE
3551 *p++ = (((surrogate & 0x3FF)<<10)
3552 | (outCh & 0x3FF)) + 0x10000;
3553#else
3554 *p++ = surrogate;
3555 *p++ = outCh;
3556#endif
3557 surrogate = 0;
3558 }
3559 else {
3560 surrogate = 0;
3561 errmsg = "second surrogate missing";
3562 goto utf7Error;
3563 }
3564 }
3565 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
3566 /* first surrogate */
3567 surrogate = outCh;
3568 }
3569 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3570 errmsg = "unexpected second surrogate";
3571 goto utf7Error;
3572 }
3573 else {
3574 *p++ = outCh;
3575 }
3576 }
3577 }
3578 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003579 inShift = 0;
3580 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003581 if (surrogate) {
3582 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00003583 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003584 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003585 if (base64bits > 0) { /* left-over bits */
3586 if (base64bits >= 6) {
3587 /* We've seen at least one base-64 character */
3588 errmsg = "partial character in shift sequence";
3589 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003590 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003591 else {
3592 /* Some bits remain; they should be zero */
3593 if (base64buffer != 0) {
3594 errmsg = "non-zero padding bits in shift sequence";
3595 goto utf7Error;
3596 }
3597 }
3598 }
3599 if (ch != '-') {
3600 /* '-' is absorbed; other terminating
3601 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003602 *p++ = ch;
3603 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003604 }
3605 }
3606 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003607 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003608 s++; /* consume '+' */
3609 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003610 s++;
3611 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00003612 }
3613 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003614 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003615 shiftOutStart = p;
3616 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003617 }
3618 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003619 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003620 *p++ = ch;
3621 s++;
3622 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003623 else {
3624 startinpos = s-starts;
3625 s++;
3626 errmsg = "unexpected special character";
3627 goto utf7Error;
3628 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003629 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003630utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003631 outpos = p-PyUnicode_AS_UNICODE(unicode);
3632 endinpos = s-starts;
3633 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003634 errors, &errorHandler,
3635 "utf7", errmsg,
3636 &starts, &e, &startinpos, &endinpos, &exc, &s,
3637 &unicode, &outpos, &p))
3638 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003639 }
3640
Antoine Pitrou244651a2009-05-04 18:56:13 +00003641 /* end of string */
3642
3643 if (inShift && !consumed) { /* in shift sequence, no more to follow */
3644 /* if we're in an inconsistent state, that's an error */
3645 if (surrogate ||
3646 (base64bits >= 6) ||
3647 (base64bits > 0 && base64buffer != 0)) {
3648 outpos = p-PyUnicode_AS_UNICODE(unicode);
3649 endinpos = size;
3650 if (unicode_decode_call_errorhandler(
3651 errors, &errorHandler,
3652 "utf7", "unterminated shift sequence",
3653 &starts, &e, &startinpos, &endinpos, &exc, &s,
3654 &unicode, &outpos, &p))
3655 goto onError;
3656 if (s < e)
3657 goto restart;
3658 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003659 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003660
3661 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003662 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003663 if (inShift) {
3664 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003665 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003666 }
3667 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003668 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003669 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003670 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003671
Victor Stinnerfe226c02011-10-03 03:52:20 +02003672 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003673 goto onError;
3674
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003675 Py_XDECREF(errorHandler);
3676 Py_XDECREF(exc);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02003677 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003678 Py_DECREF(unicode);
3679 return NULL;
3680 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003681 return (PyObject *)unicode;
3682
Benjamin Peterson29060642009-01-31 22:14:21 +00003683 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003684 Py_XDECREF(errorHandler);
3685 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003686 Py_DECREF(unicode);
3687 return NULL;
3688}
3689
3690
Alexander Belopolsky40018472011-02-26 01:02:56 +00003691PyObject *
3692PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003693 Py_ssize_t size,
3694 int base64SetO,
3695 int base64WhiteSpace,
3696 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003697{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003698 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003699 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003700 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003701 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003702 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003703 unsigned int base64bits = 0;
3704 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003705 char * out;
3706 char * start;
3707
3708 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003709 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003710
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003711 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003712 return PyErr_NoMemory();
3713
Antoine Pitrou244651a2009-05-04 18:56:13 +00003714 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003715 if (v == NULL)
3716 return NULL;
3717
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003718 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003719 for (;i < size; ++i) {
3720 Py_UNICODE ch = s[i];
3721
Antoine Pitrou244651a2009-05-04 18:56:13 +00003722 if (inShift) {
3723 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3724 /* shifting out */
3725 if (base64bits) { /* output remaining bits */
3726 *out++ = TO_BASE64(base64buffer << (6-base64bits));
3727 base64buffer = 0;
3728 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003729 }
3730 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003731 /* Characters not in the BASE64 set implicitly unshift the sequence
3732 so no '-' is required, except if the character is itself a '-' */
3733 if (IS_BASE64(ch) || ch == '-') {
3734 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003735 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003736 *out++ = (char) ch;
3737 }
3738 else {
3739 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00003740 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003741 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003742 else { /* not in a shift sequence */
3743 if (ch == '+') {
3744 *out++ = '+';
3745 *out++ = '-';
3746 }
3747 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3748 *out++ = (char) ch;
3749 }
3750 else {
3751 *out++ = '+';
3752 inShift = 1;
3753 goto encode_char;
3754 }
3755 }
3756 continue;
3757encode_char:
3758#ifdef Py_UNICODE_WIDE
3759 if (ch >= 0x10000) {
3760 /* code first surrogate */
3761 base64bits += 16;
3762 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
3763 while (base64bits >= 6) {
3764 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3765 base64bits -= 6;
3766 }
3767 /* prepare second surrogate */
3768 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
3769 }
3770#endif
3771 base64bits += 16;
3772 base64buffer = (base64buffer << 16) | ch;
3773 while (base64bits >= 6) {
3774 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3775 base64bits -= 6;
3776 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00003777 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003778 if (base64bits)
3779 *out++= TO_BASE64(base64buffer << (6-base64bits) );
3780 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003781 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003782 if (_PyBytes_Resize(&v, out - start) < 0)
3783 return NULL;
3784 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003785}
3786
Antoine Pitrou244651a2009-05-04 18:56:13 +00003787#undef IS_BASE64
3788#undef FROM_BASE64
3789#undef TO_BASE64
3790#undef DECODE_DIRECT
3791#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003792
Guido van Rossumd57fd912000-03-10 22:53:23 +00003793/* --- UTF-8 Codec -------------------------------------------------------- */
3794
/* Total length in bytes of the UTF-8 sequence introduced by each possible
   lead byte.  Zero marks a byte that can never start a sequence.
   See RFC 3629 for details. */
static char utf8_code_length[256] = {
    /* 00-0F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 10-1F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 20-2F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 30-3F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 40-4F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 50-5F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 60-6F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 70-7F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 80-8F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 90-9F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* A0-AF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* B0-BF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* C0-C1 are illegal, C2-CF start 2-byte sequences */
    /* C0-CF */ 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* D0-DF */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* E0-EF */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* F0-F4 start 4-byte sequences, F5-FF are illegal */
    /* F0-FF */ 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
3816
Alexander Belopolsky40018472011-02-26 01:02:56 +00003817PyObject *
3818PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003819 Py_ssize_t size,
3820 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003821{
Walter Dörwald69652032004-09-07 20:24:22 +00003822 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3823}
3824
Antoine Pitrouab868312009-01-10 15:40:25 +00003825/* Mask to check or force alignment of a pointer to C 'long' boundaries */
3826#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
3827
3828/* Mask to quickly check whether a C 'long' contains a
3829 non-ASCII, UTF8-encoded char. */
3830#if (SIZEOF_LONG == 8)
3831# define ASCII_CHAR_MASK 0x8080808080808080L
3832#elif (SIZEOF_LONG == 4)
3833# define ASCII_CHAR_MASK 0x80808080L
3834#else
3835# error C 'long' size should be either 4 or 8!
3836#endif
3837
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003838/* Scans a UTF-8 string and returns the maximum character to be expected,
3839 the size of the decoded unicode string and if any major errors were
3840 encountered.
3841
3842 This function does check basic UTF-8 sanity, it does however NOT CHECK
3843 if the string contains surrogates, and if all continuation bytes are
3844 within the correct ranges, these checks are performed in
3845 PyUnicode_DecodeUTF8Stateful.
3846
3847 If it sets has_errors to 1, it means the value of unicode_size and max_char
3848 will be bogus and you should not rely on useful information in them.
3849 */
3850static Py_UCS4
3851utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
3852 Py_ssize_t *unicode_size, Py_ssize_t* consumed,
3853 int *has_errors)
3854{
3855 Py_ssize_t n;
3856 Py_ssize_t char_count = 0;
3857 Py_UCS4 max_char = 127, new_max;
3858 Py_UCS4 upper_bound;
3859 const unsigned char *p = (const unsigned char *)s;
3860 const unsigned char *end = p + string_size;
3861 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
3862 int err = 0;
3863
3864 for (; p < end && !err; ++p, ++char_count) {
3865 /* Only check value if it's not a ASCII char... */
3866 if (*p < 0x80) {
3867 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
3868 an explanation. */
3869 if (!((size_t) p & LONG_PTR_MASK)) {
3870 /* Help register allocation */
3871 register const unsigned char *_p = p;
3872 while (_p < aligned_end) {
3873 unsigned long value = *(unsigned long *) _p;
3874 if (value & ASCII_CHAR_MASK)
3875 break;
3876 _p += SIZEOF_LONG;
3877 char_count += SIZEOF_LONG;
3878 }
3879 p = _p;
3880 if (p == end)
3881 break;
3882 }
3883 }
3884 if (*p >= 0x80) {
3885 n = utf8_code_length[*p];
3886 new_max = max_char;
3887 switch (n) {
3888 /* invalid start byte */
3889 case 0:
3890 err = 1;
3891 break;
3892 case 2:
3893 /* Code points between 0x00FF and 0x07FF inclusive.
3894 Approximate the upper bound of the code point,
3895 if this flips over 255 we can be sure it will be more
3896 than 255 and the string will need 2 bytes per code coint,
3897 if it stays under or equal to 255, we can be sure 1 byte
3898 is enough.
3899 ((*p & 0b00011111) << 6) | 0b00111111 */
3900 upper_bound = ((*p & 0x1F) << 6) | 0x3F;
3901 if (max_char < upper_bound)
3902 new_max = upper_bound;
3903 /* Ensure we track at least that we left ASCII space. */
3904 if (new_max < 128)
3905 new_max = 128;
3906 break;
3907 case 3:
3908 /* Between 0x0FFF and 0xFFFF inclusive, so values are
3909 always > 255 and <= 65535 and will always need 2 bytes. */
3910 if (max_char < 65535)
3911 new_max = 65535;
3912 break;
3913 case 4:
3914 /* Code point will be above 0xFFFF for sure in this case. */
3915 new_max = 65537;
3916 break;
3917 /* Internal error, this should be caught by the first if */
3918 case 1:
3919 default:
3920 assert(0 && "Impossible case in utf8_max_char_and_size");
3921 err = 1;
3922 }
3923 /* Instead of number of overall bytes for this code point,
3924 n containts the number of following bytes: */
3925 --n;
3926 /* Check if the follow up chars are all valid continuation bytes */
3927 if (n >= 1) {
3928 const unsigned char *cont;
3929 if ((p + n) >= end) {
3930 if (consumed == 0)
3931 /* incomplete data, non-incremental decoding */
3932 err = 1;
3933 break;
3934 }
3935 for (cont = p + 1; cont < (p + n); ++cont) {
3936 if ((*cont & 0xc0) != 0x80) {
3937 err = 1;
3938 break;
3939 }
3940 }
3941 p += n;
3942 }
3943 else
3944 err = 1;
3945 max_char = new_max;
3946 }
3947 }
3948
3949 if (unicode_size)
3950 *unicode_size = char_count;
3951 if (has_errors)
3952 *has_errors = err;
3953 return max_char;
3954}
3955
/* Store `value` at element `index` of `buf`, casting both the buffer and the
   value to the element type selected by `kind`.  Works like PyUnicode_WRITE
   but additionally accepts PyUnicode_WCHAR_KIND, i.e. the wstr field of the
   legacy unicode representation.  Any kind other than WCHAR, 1BYTE or 2BYTE
   is written as Py_UCS4.  `kind` is evaluated exactly once. */
#define WRITE_FLEXIBLE_OR_WSTR(kind, buf, index, value)                 \
    do {                                                                \
        const int wkind_ = (kind);                                      \
        switch (wkind_) {                                               \
        case PyUnicode_WCHAR_KIND:                                      \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value);       \
            break;                                                      \
        case PyUnicode_1BYTE_KIND:                                      \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
            break;                                                      \
        case PyUnicode_2BYTE_KIND:                                      \
            ((Py_UCS2 *)(buf))[(index)] = (Py_UCS2)(value);             \
            break;                                                      \
        default:                                                        \
            ((Py_UCS4 *)(buf))[(index)] = (Py_UCS4)(value);             \
            break;                                                      \
        }                                                               \
    } while (0)
3970
Alexander Belopolsky40018472011-02-26 01:02:56 +00003971PyObject *
3972PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003973 Py_ssize_t size,
3974 const char *errors,
3975 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00003976{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003977 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003978 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00003979 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003980 Py_ssize_t startinpos;
3981 Py_ssize_t endinpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00003982 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003983 PyUnicodeObject *unicode;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003984 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003985 PyObject *errorHandler = NULL;
3986 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003987 Py_UCS4 maxchar = 0;
3988 Py_ssize_t unicode_size;
3989 Py_ssize_t i;
3990 int kind;
3991 void *data;
3992 int has_errors;
3993 Py_UNICODE *error_outptr;
3994#if SIZEOF_WCHAR_T == 2
3995 Py_ssize_t wchar_offset = 0;
3996#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003997
Walter Dörwald69652032004-09-07 20:24:22 +00003998 if (size == 0) {
3999 if (consumed)
4000 *consumed = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004001 return (PyObject *)PyUnicode_New(0, 0);
Walter Dörwald69652032004-09-07 20:24:22 +00004002 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004003 maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size,
4004 consumed, &has_errors);
4005 if (has_errors) {
4006 unicode = _PyUnicode_New(size);
4007 if (!unicode)
4008 return NULL;
4009 kind = PyUnicode_WCHAR_KIND;
4010 data = PyUnicode_AS_UNICODE(unicode);
4011 assert(data != NULL);
4012 }
4013 else {
4014 unicode = (PyUnicodeObject *)PyUnicode_New(unicode_size, maxchar);
4015 if (!unicode)
4016 return NULL;
4017 /* When the string is ASCII only, just use memcpy and return.
4018 unicode_size may be != size if there is an incomplete UTF-8
4019 sequence at the end of the ASCII block. */
4020 if (maxchar < 128 && size == unicode_size) {
4021 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
4022 return (PyObject *)unicode;
4023 }
4024 kind = PyUnicode_KIND(unicode);
4025 data = PyUnicode_DATA(unicode);
4026 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004027 /* Unpack UTF-8 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004028 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004029 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00004030 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004031
4032 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004033 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004034
4035 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00004036 /* Fast path for runs of ASCII characters. Given that common UTF-8
4037 input will consist of an overwhelming majority of ASCII
4038 characters, we try to optimize for this case by checking
4039 as many characters as a C 'long' can contain.
4040 First, check if we can do an aligned read, as most CPUs have
4041 a penalty for unaligned reads.
4042 */
4043 if (!((size_t) s & LONG_PTR_MASK)) {
4044 /* Help register allocation */
4045 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004046 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004047 while (_s < aligned_end) {
4048 /* Read a whole long at a time (either 4 or 8 bytes),
4049 and do a fast unrolled copy if it only contains ASCII
4050 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004051 unsigned long value = *(unsigned long *) _s;
4052 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00004053 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004054 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+0, _s[0]);
4055 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+1, _s[1]);
4056 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+2, _s[2]);
4057 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004058#if (SIZEOF_LONG == 8)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004059 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+4, _s[4]);
4060 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+5, _s[5]);
4061 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+6, _s[6]);
4062 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004063#endif
4064 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004065 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00004066 }
4067 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004068 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004069 if (s == e)
4070 break;
4071 ch = (unsigned char)*s;
4072 }
4073 }
4074
4075 if (ch < 0x80) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004076 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004077 s++;
4078 continue;
4079 }
4080
4081 n = utf8_code_length[ch];
4082
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004083 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004084 if (consumed)
4085 break;
4086 else {
4087 errmsg = "unexpected end of data";
4088 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004089 endinpos = startinpos+1;
4090 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
4091 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004092 goto utf8Error;
4093 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004094 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004095
4096 switch (n) {
4097
4098 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00004099 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004100 startinpos = s-starts;
4101 endinpos = startinpos+1;
4102 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004103
4104 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004105 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00004106 startinpos = s-starts;
4107 endinpos = startinpos+1;
4108 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004109
4110 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004111 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00004112 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004113 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004114 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00004115 goto utf8Error;
4116 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004117 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004118 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004119 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004120 break;
4121
4122 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00004123 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4124 will result in surrogates in range d800-dfff. Surrogates are
4125 not valid UTF-8 so they are rejected.
4126 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4127 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00004128 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004129 (s[2] & 0xc0) != 0x80 ||
4130 ((unsigned char)s[0] == 0xE0 &&
4131 (unsigned char)s[1] < 0xA0) ||
4132 ((unsigned char)s[0] == 0xED &&
4133 (unsigned char)s[1] > 0x9F)) {
4134 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004135 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004136 endinpos = startinpos + 1;
4137
4138 /* if s[1] first two bits are 1 and 0, then the invalid
4139 continuation byte is s[2], so increment endinpos by 1,
4140 if not, s[1] is invalid and endinpos doesn't need to
4141 be incremented. */
4142 if ((s[1] & 0xC0) == 0x80)
4143 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004144 goto utf8Error;
4145 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004146 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004147 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004148 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004149 break;
4150
4151 case 4:
4152 if ((s[1] & 0xc0) != 0x80 ||
4153 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004154 (s[3] & 0xc0) != 0x80 ||
4155 ((unsigned char)s[0] == 0xF0 &&
4156 (unsigned char)s[1] < 0x90) ||
4157 ((unsigned char)s[0] == 0xF4 &&
4158 (unsigned char)s[1] > 0x8F)) {
4159 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004160 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004161 endinpos = startinpos + 1;
4162 if ((s[1] & 0xC0) == 0x80) {
4163 endinpos++;
4164 if ((s[2] & 0xC0) == 0x80)
4165 endinpos++;
4166 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004167 goto utf8Error;
4168 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004169 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00004170 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4171 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4172
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004173 /* If the string is flexible or we have native UCS-4, write
4174 directly.. */
4175 if (sizeof(Py_UNICODE) > 2 || kind != PyUnicode_WCHAR_KIND)
4176 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Tim Petersced69f82003-09-16 20:30:58 +00004177
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004178 else {
4179 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00004180
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004181 /* translate from 10000..10FFFF to 0..FFFF */
4182 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00004183
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004184 /* high surrogate = top 10 bits added to D800 */
4185 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4186 (Py_UNICODE)(0xD800 + (ch >> 10)));
4187
4188 /* low surrogate = bottom 10 bits added to DC00 */
4189 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4190 (Py_UNICODE)(0xDC00 + (ch & 0x03FF)));
4191 }
4192#if SIZEOF_WCHAR_T == 2
4193 wchar_offset++;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004194#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00004195 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004196 }
4197 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00004198 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004199
Benjamin Peterson29060642009-01-31 22:14:21 +00004200 utf8Error:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004201 /* If this is not yet a resizable string, make it one.. */
4202 if (kind != PyUnicode_WCHAR_KIND) {
4203 const Py_UNICODE *u;
4204 PyUnicodeObject *new_unicode = _PyUnicode_New(size);
4205 if (!new_unicode)
4206 goto onError;
4207 u = PyUnicode_AsUnicode((PyObject *)unicode);
4208 if (!u)
4209 goto onError;
4210#if SIZEOF_WCHAR_T == 2
4211 i += wchar_offset;
4212#endif
4213 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(new_unicode), u, i);
4214 Py_DECREF(unicode);
4215 unicode = new_unicode;
4216 kind = 0;
4217 data = PyUnicode_AS_UNICODE(new_unicode);
4218 assert(data != NULL);
4219 }
4220 error_outptr = PyUnicode_AS_UNICODE(unicode) + i;
Benjamin Peterson29060642009-01-31 22:14:21 +00004221 if (unicode_decode_call_errorhandler(
4222 errors, &errorHandler,
4223 "utf8", errmsg,
4224 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004225 &unicode, &i, &error_outptr))
Benjamin Peterson29060642009-01-31 22:14:21 +00004226 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004227 /* Update data because unicode_decode_call_errorhandler might have
4228 re-created or resized the unicode object. */
4229 data = PyUnicode_AS_UNICODE(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00004230 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004231 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004232 /* Ensure the unicode_size calculation above was correct: */
4233 assert(kind == PyUnicode_WCHAR_KIND || i == unicode_size);
4234
Walter Dörwald69652032004-09-07 20:24:22 +00004235 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004236 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004237
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004238 /* Adjust length and ready string when it contained errors and
4239 is of the old resizable kind. */
4240 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004241 if (PyUnicode_Resize((PyObject**)&unicode, i) < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004242 goto onError;
4243 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004244
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004245 Py_XDECREF(errorHandler);
4246 Py_XDECREF(exc);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004247 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004248 Py_DECREF(unicode);
4249 return NULL;
4250 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004251 return (PyObject *)unicode;
4252
Benjamin Peterson29060642009-01-31 22:14:21 +00004253 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004254 Py_XDECREF(errorHandler);
4255 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004256 Py_DECREF(unicode);
4257 return NULL;
4258}
4259
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004260#undef WRITE_FLEXIBLE_OR_WSTR
Antoine Pitrouab868312009-01-10 15:40:25 +00004261
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004262#ifdef __APPLE__
4263
4264/* Simplified UTF-8 decoder using surrogateescape error handler,
4265 used to decode the command line arguments on Mac OS X. */
4266
4267wchar_t*
4268_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
4269{
4270 int n;
4271 const char *e;
4272 wchar_t *unicode, *p;
4273
4274 /* Note: size will always be longer than the resulting Unicode
4275 character count */
4276 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
4277 PyErr_NoMemory();
4278 return NULL;
4279 }
4280 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
4281 if (!unicode)
4282 return NULL;
4283
4284 /* Unpack UTF-8 encoded data */
4285 p = unicode;
4286 e = s + size;
4287 while (s < e) {
4288 Py_UCS4 ch = (unsigned char)*s;
4289
4290 if (ch < 0x80) {
4291 *p++ = (wchar_t)ch;
4292 s++;
4293 continue;
4294 }
4295
4296 n = utf8_code_length[ch];
4297 if (s + n > e) {
4298 goto surrogateescape;
4299 }
4300
4301 switch (n) {
4302 case 0:
4303 case 1:
4304 goto surrogateescape;
4305
4306 case 2:
4307 if ((s[1] & 0xc0) != 0x80)
4308 goto surrogateescape;
4309 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
4310 assert ((ch > 0x007F) && (ch <= 0x07FF));
4311 *p++ = (wchar_t)ch;
4312 break;
4313
4314 case 3:
4315 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4316 will result in surrogates in range d800-dfff. Surrogates are
4317 not valid UTF-8 so they are rejected.
4318 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4319 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
4320 if ((s[1] & 0xc0) != 0x80 ||
4321 (s[2] & 0xc0) != 0x80 ||
4322 ((unsigned char)s[0] == 0xE0 &&
4323 (unsigned char)s[1] < 0xA0) ||
4324 ((unsigned char)s[0] == 0xED &&
4325 (unsigned char)s[1] > 0x9F)) {
4326
4327 goto surrogateescape;
4328 }
4329 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
4330 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004331 *p++ = (wchar_t)ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004332 break;
4333
4334 case 4:
4335 if ((s[1] & 0xc0) != 0x80 ||
4336 (s[2] & 0xc0) != 0x80 ||
4337 (s[3] & 0xc0) != 0x80 ||
4338 ((unsigned char)s[0] == 0xF0 &&
4339 (unsigned char)s[1] < 0x90) ||
4340 ((unsigned char)s[0] == 0xF4 &&
4341 (unsigned char)s[1] > 0x8F)) {
4342 goto surrogateescape;
4343 }
4344 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
4345 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4346 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4347
4348#if SIZEOF_WCHAR_T == 4
4349 *p++ = (wchar_t)ch;
4350#else
4351 /* compute and append the two surrogates: */
4352
4353 /* translate from 10000..10FFFF to 0..FFFF */
4354 ch -= 0x10000;
4355
4356 /* high surrogate = top 10 bits added to D800 */
4357 *p++ = (wchar_t)(0xD800 + (ch >> 10));
4358
4359 /* low surrogate = bottom 10 bits added to DC00 */
4360 *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
4361#endif
4362 break;
4363 }
4364 s += n;
4365 continue;
4366
4367 surrogateescape:
4368 *p++ = 0xDC00 + ch;
4369 s++;
4370 }
4371 *p = L'\0';
4372 return unicode;
4373}
4374
4375#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004376
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004377/* Primary internal function which creates utf8 encoded bytes objects.
4378
4379 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004380 and allocate exactly as much space needed at the end. Else allocate the
4381 maximum possible needed (4 result bytes per Unicode character), and return
4382 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004383*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004384PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004385_PyUnicode_AsUTF8String(PyObject *obj, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004386{
Tim Peters602f7402002-04-27 18:03:26 +00004387#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00004388
Guido van Rossum98297ee2007-11-06 21:34:58 +00004389 Py_ssize_t i; /* index into s of next input byte */
4390 PyObject *result; /* result string object */
4391 char *p; /* next free byte in output buffer */
4392 Py_ssize_t nallocated; /* number of result bytes allocated */
4393 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00004394 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004395 PyObject *errorHandler = NULL;
4396 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004397 int kind;
4398 void *data;
4399 Py_ssize_t size;
4400 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
4401#if SIZEOF_WCHAR_T == 2
4402 Py_ssize_t wchar_offset = 0;
4403#endif
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004404
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004405 if (!PyUnicode_Check(unicode)) {
4406 PyErr_BadArgument();
4407 return NULL;
4408 }
4409
4410 if (PyUnicode_READY(unicode) == -1)
4411 return NULL;
4412
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004413 if (PyUnicode_UTF8(unicode))
4414 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4415 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004416
4417 kind = PyUnicode_KIND(unicode);
4418 data = PyUnicode_DATA(unicode);
4419 size = PyUnicode_GET_LENGTH(unicode);
4420
Tim Peters602f7402002-04-27 18:03:26 +00004421 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004422
Tim Peters602f7402002-04-27 18:03:26 +00004423 if (size <= MAX_SHORT_UNICHARS) {
4424 /* Write into the stack buffer; nallocated can't overflow.
4425 * At the end, we'll allocate exactly as much heap space as it
4426 * turns out we need.
4427 */
4428 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004429 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00004430 p = stackbuf;
4431 }
4432 else {
4433 /* Overallocate on the heap, and give the excess back at the end. */
4434 nallocated = size * 4;
4435 if (nallocated / 4 != size) /* overflow! */
4436 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00004437 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004438 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00004439 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00004440 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004441 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004442
Tim Peters602f7402002-04-27 18:03:26 +00004443 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004444 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004445
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004446 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00004447 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004448 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004449
Guido van Rossumd57fd912000-03-10 22:53:23 +00004450 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00004451 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00004452 *p++ = (char)(0xc0 | (ch >> 6));
4453 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00004454 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004455 Py_ssize_t newpos;
4456 PyObject *rep;
4457 Py_ssize_t repsize, k, startpos;
4458 startpos = i-1;
4459#if SIZEOF_WCHAR_T == 2
4460 startpos += wchar_offset;
Victor Stinner445a6232010-04-22 20:01:57 +00004461#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004462 rep = unicode_encode_call_errorhandler(
4463 errors, &errorHandler, "utf-8", "surrogates not allowed",
4464 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
4465 &exc, startpos, startpos+1, &newpos);
4466 if (!rep)
4467 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004468
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004469 if (PyBytes_Check(rep))
4470 repsize = PyBytes_GET_SIZE(rep);
4471 else
4472 repsize = PyUnicode_GET_SIZE(rep);
4473
4474 if (repsize > 4) {
4475 Py_ssize_t offset;
4476
4477 if (result == NULL)
4478 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004479 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004480 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004481
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004482 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4483 /* integer overflow */
4484 PyErr_NoMemory();
4485 goto error;
4486 }
4487 nallocated += repsize - 4;
4488 if (result != NULL) {
4489 if (_PyBytes_Resize(&result, nallocated) < 0)
4490 goto error;
4491 } else {
4492 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004493 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004494 goto error;
4495 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4496 }
4497 p = PyBytes_AS_STRING(result) + offset;
4498 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004499
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004500 if (PyBytes_Check(rep)) {
4501 char *prep = PyBytes_AS_STRING(rep);
4502 for(k = repsize; k > 0; k--)
4503 *p++ = *prep++;
4504 } else /* rep is unicode */ {
4505 const Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
4506 Py_UNICODE c;
4507
4508 for(k=0; k<repsize; k++) {
4509 c = prep[k];
4510 if (0x80 <= c) {
4511 raise_encode_exception(&exc, "utf-8",
4512 PyUnicode_AS_UNICODE(unicode),
4513 size, i-1, i,
4514 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004515 goto error;
4516 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004517 *p++ = (char)prep[k];
Victor Stinner31be90b2010-04-22 19:38:16 +00004518 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004519 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004520 Py_DECREF(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004521 } else if (ch < 0x10000) {
4522 *p++ = (char)(0xe0 | (ch >> 12));
4523 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4524 *p++ = (char)(0x80 | (ch & 0x3f));
4525 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00004526 /* Encode UCS4 Unicode ordinals */
4527 *p++ = (char)(0xf0 | (ch >> 18));
4528 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4529 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4530 *p++ = (char)(0x80 | (ch & 0x3f));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004531#if SIZEOF_WCHAR_T == 2
4532 wchar_offset++;
4533#endif
Tim Peters602f7402002-04-27 18:03:26 +00004534 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004535 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004536
Guido van Rossum98297ee2007-11-06 21:34:58 +00004537 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004538 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004539 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004540 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004541 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004542 }
4543 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004544 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004545 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004546 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004547 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004548 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004549
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004550 Py_XDECREF(errorHandler);
4551 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004552 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004553 error:
4554 Py_XDECREF(errorHandler);
4555 Py_XDECREF(exc);
4556 Py_XDECREF(result);
4557 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004558
Tim Peters602f7402002-04-27 18:03:26 +00004559#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004560}
4561
Alexander Belopolsky40018472011-02-26 01:02:56 +00004562PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004563PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4564 Py_ssize_t size,
4565 const char *errors)
4566{
4567 PyObject *v, *unicode;
4568
4569 unicode = PyUnicode_FromUnicode(s, size);
4570 if (unicode == NULL)
4571 return NULL;
4572 v = _PyUnicode_AsUTF8String(unicode, errors);
4573 Py_DECREF(unicode);
4574 return v;
4575}
4576
4577PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004578PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004579{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004580 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004581}
4582
Walter Dörwald41980ca2007-08-16 21:55:45 +00004583/* --- UTF-32 Codec ------------------------------------------------------- */
4584
4585PyObject *
4586PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004587 Py_ssize_t size,
4588 const char *errors,
4589 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004590{
4591 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4592}
4593
4594PyObject *
4595PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004596 Py_ssize_t size,
4597 const char *errors,
4598 int *byteorder,
4599 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004600{
4601 const char *starts = s;
4602 Py_ssize_t startinpos;
4603 Py_ssize_t endinpos;
4604 Py_ssize_t outpos;
4605 PyUnicodeObject *unicode;
4606 Py_UNICODE *p;
4607#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004608 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004609 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004610#else
4611 const int pairs = 0;
4612#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00004613 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004614 int bo = 0; /* assume native ordering by default */
4615 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004616 /* Offsets from q for retrieving bytes in the right order. */
4617#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4618 int iorder[] = {0, 1, 2, 3};
4619#else
4620 int iorder[] = {3, 2, 1, 0};
4621#endif
4622 PyObject *errorHandler = NULL;
4623 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004624
Walter Dörwald41980ca2007-08-16 21:55:45 +00004625 q = (unsigned char *)s;
4626 e = q + size;
4627
4628 if (byteorder)
4629 bo = *byteorder;
4630
4631 /* Check for BOM marks (U+FEFF) in the input and adjust current
4632 byte order setting accordingly. In native mode, the leading BOM
4633 mark is skipped, in all other modes, it is copied to the output
4634 stream as-is (giving a ZWNBSP character). */
4635 if (bo == 0) {
4636 if (size >= 4) {
4637 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004638 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004639#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004640 if (bom == 0x0000FEFF) {
4641 q += 4;
4642 bo = -1;
4643 }
4644 else if (bom == 0xFFFE0000) {
4645 q += 4;
4646 bo = 1;
4647 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004648#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004649 if (bom == 0x0000FEFF) {
4650 q += 4;
4651 bo = 1;
4652 }
4653 else if (bom == 0xFFFE0000) {
4654 q += 4;
4655 bo = -1;
4656 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004657#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004658 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004659 }
4660
4661 if (bo == -1) {
4662 /* force LE */
4663 iorder[0] = 0;
4664 iorder[1] = 1;
4665 iorder[2] = 2;
4666 iorder[3] = 3;
4667 }
4668 else if (bo == 1) {
4669 /* force BE */
4670 iorder[0] = 3;
4671 iorder[1] = 2;
4672 iorder[2] = 1;
4673 iorder[3] = 0;
4674 }
4675
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004676 /* On narrow builds we split characters outside the BMP into two
4677 codepoints => count how much extra space we need. */
4678#ifndef Py_UNICODE_WIDE
4679 for (qq = q; qq < e; qq += 4)
4680 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
4681 pairs++;
4682#endif
4683
4684 /* This might be one to much, because of a BOM */
4685 unicode = _PyUnicode_New((size+3)/4+pairs);
4686 if (!unicode)
4687 return NULL;
4688 if (size == 0)
4689 return (PyObject *)unicode;
4690
4691 /* Unpack UTF-32 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004692 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004693
Walter Dörwald41980ca2007-08-16 21:55:45 +00004694 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004695 Py_UCS4 ch;
4696 /* remaining bytes at the end? (size should be divisible by 4) */
4697 if (e-q<4) {
4698 if (consumed)
4699 break;
4700 errmsg = "truncated data";
4701 startinpos = ((const char *)q)-starts;
4702 endinpos = ((const char *)e)-starts;
4703 goto utf32Error;
4704 /* The remaining input chars are ignored if the callback
4705 chooses to skip the input */
4706 }
4707 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4708 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004709
Benjamin Peterson29060642009-01-31 22:14:21 +00004710 if (ch >= 0x110000)
4711 {
4712 errmsg = "codepoint not in range(0x110000)";
4713 startinpos = ((const char *)q)-starts;
4714 endinpos = startinpos+4;
4715 goto utf32Error;
4716 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004717#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004718 if (ch >= 0x10000)
4719 {
4720 *p++ = 0xD800 | ((ch-0x10000) >> 10);
4721 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
4722 }
4723 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00004724#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004725 *p++ = ch;
4726 q += 4;
4727 continue;
4728 utf32Error:
4729 outpos = p-PyUnicode_AS_UNICODE(unicode);
4730 if (unicode_decode_call_errorhandler(
4731 errors, &errorHandler,
4732 "utf32", errmsg,
4733 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
4734 &unicode, &outpos, &p))
4735 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004736 }
4737
4738 if (byteorder)
4739 *byteorder = bo;
4740
4741 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004742 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004743
4744 /* Adjust length */
Victor Stinnerfe226c02011-10-03 03:52:20 +02004745 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004746 goto onError;
4747
4748 Py_XDECREF(errorHandler);
4749 Py_XDECREF(exc);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004750 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004751 Py_DECREF(unicode);
4752 return NULL;
4753 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004754 return (PyObject *)unicode;
4755
Benjamin Peterson29060642009-01-31 22:14:21 +00004756 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00004757 Py_DECREF(unicode);
4758 Py_XDECREF(errorHandler);
4759 Py_XDECREF(exc);
4760 return NULL;
4761}
4762
4763PyObject *
4764PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004765 Py_ssize_t size,
4766 const char *errors,
4767 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004768{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004769 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004770 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004771 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004772#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004773 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004774#else
4775 const int pairs = 0;
4776#endif
4777 /* Offsets from p for storing byte pairs in the right order. */
4778#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4779 int iorder[] = {0, 1, 2, 3};
4780#else
4781 int iorder[] = {3, 2, 1, 0};
4782#endif
4783
Benjamin Peterson29060642009-01-31 22:14:21 +00004784#define STORECHAR(CH) \
4785 do { \
4786 p[iorder[3]] = ((CH) >> 24) & 0xff; \
4787 p[iorder[2]] = ((CH) >> 16) & 0xff; \
4788 p[iorder[1]] = ((CH) >> 8) & 0xff; \
4789 p[iorder[0]] = (CH) & 0xff; \
4790 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00004791 } while(0)
4792
4793 /* In narrow builds we can output surrogate pairs as one codepoint,
4794 so we need less space. */
4795#ifndef Py_UNICODE_WIDE
4796 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00004797 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
4798 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
4799 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004800#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004801 nsize = (size - pairs + (byteorder == 0));
4802 bytesize = nsize * 4;
4803 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004804 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004805 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004806 if (v == NULL)
4807 return NULL;
4808
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004809 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004810 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004811 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004812 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004813 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004814
4815 if (byteorder == -1) {
4816 /* force LE */
4817 iorder[0] = 0;
4818 iorder[1] = 1;
4819 iorder[2] = 2;
4820 iorder[3] = 3;
4821 }
4822 else if (byteorder == 1) {
4823 /* force BE */
4824 iorder[0] = 3;
4825 iorder[1] = 2;
4826 iorder[2] = 1;
4827 iorder[3] = 0;
4828 }
4829
4830 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004831 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004832#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004833 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
4834 Py_UCS4 ch2 = *s;
4835 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
4836 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
4837 s++;
4838 size--;
4839 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004840 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004841#endif
4842 STORECHAR(ch);
4843 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004844
4845 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004846 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004847#undef STORECHAR
4848}
4849
Alexander Belopolsky40018472011-02-26 01:02:56 +00004850PyObject *
4851PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004852{
4853 if (!PyUnicode_Check(unicode)) {
4854 PyErr_BadArgument();
4855 return NULL;
4856 }
4857 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004858 PyUnicode_GET_SIZE(unicode),
4859 NULL,
4860 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004861}
4862
Guido van Rossumd57fd912000-03-10 22:53:23 +00004863/* --- UTF-16 Codec ------------------------------------------------------- */
4864
Tim Peters772747b2001-08-09 22:21:55 +00004865PyObject *
4866PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004867 Py_ssize_t size,
4868 const char *errors,
4869 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004870{
Walter Dörwald69652032004-09-07 20:24:22 +00004871 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
4872}
4873
Antoine Pitrouab868312009-01-10 15:40:25 +00004874/* Two masks for fast checking of whether a C 'long' may contain
4875 UTF16-encoded surrogate characters. This is an efficient heuristic,
4876 assuming that non-surrogate characters with a code point >= 0x8000 are
4877 rare in most input.
4878 FAST_CHAR_MASK is used when the input is in native byte ordering,
4879 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00004880*/
Antoine Pitrouab868312009-01-10 15:40:25 +00004881#if (SIZEOF_LONG == 8)
4882# define FAST_CHAR_MASK 0x8000800080008000L
4883# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
4884#elif (SIZEOF_LONG == 4)
4885# define FAST_CHAR_MASK 0x80008000L
4886# define SWAPPED_FAST_CHAR_MASK 0x00800080L
4887#else
4888# error C 'long' size should be either 4 or 8!
4889#endif
4890
Walter Dörwald69652032004-09-07 20:24:22 +00004891PyObject *
4892PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004893 Py_ssize_t size,
4894 const char *errors,
4895 int *byteorder,
4896 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00004897{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004898 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004899 Py_ssize_t startinpos;
4900 Py_ssize_t endinpos;
4901 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004902 PyUnicodeObject *unicode;
4903 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004904 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00004905 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00004906 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004907 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00004908 /* Offsets from q for retrieving byte pairs in the right order. */
4909#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4910 int ihi = 1, ilo = 0;
4911#else
4912 int ihi = 0, ilo = 1;
4913#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004914 PyObject *errorHandler = NULL;
4915 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004916
4917 /* Note: size will always be longer than the resulting Unicode
4918 character count */
4919 unicode = _PyUnicode_New(size);
4920 if (!unicode)
4921 return NULL;
4922 if (size == 0)
4923 return (PyObject *)unicode;
4924
4925 /* Unpack UTF-16 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004926 p = PyUnicode_AS_UNICODE(unicode);
Tim Peters772747b2001-08-09 22:21:55 +00004927 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00004928 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004929
4930 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00004931 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004932
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004933 /* Check for BOM marks (U+FEFF) in the input and adjust current
4934 byte order setting accordingly. In native mode, the leading BOM
4935 mark is skipped, in all other modes, it is copied to the output
4936 stream as-is (giving a ZWNBSP character). */
4937 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00004938 if (size >= 2) {
4939 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004940#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004941 if (bom == 0xFEFF) {
4942 q += 2;
4943 bo = -1;
4944 }
4945 else if (bom == 0xFFFE) {
4946 q += 2;
4947 bo = 1;
4948 }
Tim Petersced69f82003-09-16 20:30:58 +00004949#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004950 if (bom == 0xFEFF) {
4951 q += 2;
4952 bo = 1;
4953 }
4954 else if (bom == 0xFFFE) {
4955 q += 2;
4956 bo = -1;
4957 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004958#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004959 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004960 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004961
Tim Peters772747b2001-08-09 22:21:55 +00004962 if (bo == -1) {
4963 /* force LE */
4964 ihi = 1;
4965 ilo = 0;
4966 }
4967 else if (bo == 1) {
4968 /* force BE */
4969 ihi = 0;
4970 ilo = 1;
4971 }
Antoine Pitrouab868312009-01-10 15:40:25 +00004972#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4973 native_ordering = ilo < ihi;
4974#else
4975 native_ordering = ilo > ihi;
4976#endif
Tim Peters772747b2001-08-09 22:21:55 +00004977
Antoine Pitrouab868312009-01-10 15:40:25 +00004978 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00004979 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004980 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00004981 /* First check for possible aligned read of a C 'long'. Unaligned
4982 reads are more expensive, better to defer to another iteration. */
4983 if (!((size_t) q & LONG_PTR_MASK)) {
4984 /* Fast path for runs of non-surrogate chars. */
4985 register const unsigned char *_q = q;
4986 Py_UNICODE *_p = p;
4987 if (native_ordering) {
4988 /* Native ordering is simple: as long as the input cannot
4989 possibly contain a surrogate char, do an unrolled copy
4990 of several 16-bit code points to the target object.
4991 The non-surrogate check is done on several input bytes
4992 at a time (as many as a C 'long' can contain). */
4993 while (_q < aligned_end) {
4994 unsigned long data = * (unsigned long *) _q;
4995 if (data & FAST_CHAR_MASK)
4996 break;
4997 _p[0] = ((unsigned short *) _q)[0];
4998 _p[1] = ((unsigned short *) _q)[1];
4999#if (SIZEOF_LONG == 8)
5000 _p[2] = ((unsigned short *) _q)[2];
5001 _p[3] = ((unsigned short *) _q)[3];
5002#endif
5003 _q += SIZEOF_LONG;
5004 _p += SIZEOF_LONG / 2;
5005 }
5006 }
5007 else {
5008 /* Byteswapped ordering is similar, but we must decompose
5009 the copy bytewise, and take care of zero'ing out the
5010 upper bytes if the target object is in 32-bit units
5011 (that is, in UCS-4 builds). */
5012 while (_q < aligned_end) {
5013 unsigned long data = * (unsigned long *) _q;
5014 if (data & SWAPPED_FAST_CHAR_MASK)
5015 break;
5016 /* Zero upper bytes in UCS-4 builds */
5017#if (Py_UNICODE_SIZE > 2)
5018 _p[0] = 0;
5019 _p[1] = 0;
5020#if (SIZEOF_LONG == 8)
5021 _p[2] = 0;
5022 _p[3] = 0;
5023#endif
5024#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00005025 /* Issue #4916; UCS-4 builds on big endian machines must
5026 fill the two last bytes of each 4-byte unit. */
5027#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
5028# define OFF 2
5029#else
5030# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00005031#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00005032 ((unsigned char *) _p)[OFF + 1] = _q[0];
5033 ((unsigned char *) _p)[OFF + 0] = _q[1];
5034 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
5035 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
5036#if (SIZEOF_LONG == 8)
5037 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
5038 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
5039 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
5040 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
5041#endif
5042#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00005043 _q += SIZEOF_LONG;
5044 _p += SIZEOF_LONG / 2;
5045 }
5046 }
5047 p = _p;
5048 q = _q;
5049 if (q >= e)
5050 break;
5051 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005052 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005053
Benjamin Peterson14339b62009-01-31 16:36:08 +00005054 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00005055
5056 if (ch < 0xD800 || ch > 0xDFFF) {
5057 *p++ = ch;
5058 continue;
5059 }
5060
5061 /* UTF-16 code pair: */
5062 if (q > e) {
5063 errmsg = "unexpected end of data";
5064 startinpos = (((const char *)q) - 2) - starts;
5065 endinpos = ((const char *)e) + 1 - starts;
5066 goto utf16Error;
5067 }
5068 if (0xD800 <= ch && ch <= 0xDBFF) {
5069 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
5070 q += 2;
5071 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00005072#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005073 *p++ = ch;
5074 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005075#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005076 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005077#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005078 continue;
5079 }
5080 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005081 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00005082 startinpos = (((const char *)q)-4)-starts;
5083 endinpos = startinpos+2;
5084 goto utf16Error;
5085 }
5086
Benjamin Peterson14339b62009-01-31 16:36:08 +00005087 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005088 errmsg = "illegal encoding";
5089 startinpos = (((const char *)q)-2)-starts;
5090 endinpos = startinpos+2;
5091 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005092
Benjamin Peterson29060642009-01-31 22:14:21 +00005093 utf16Error:
5094 outpos = p - PyUnicode_AS_UNICODE(unicode);
5095 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005096 errors,
5097 &errorHandler,
5098 "utf16", errmsg,
5099 &starts,
5100 (const char **)&e,
5101 &startinpos,
5102 &endinpos,
5103 &exc,
5104 (const char **)&q,
5105 &unicode,
5106 &outpos,
5107 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00005108 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005109 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005110 /* remaining byte at the end? (size should be even) */
5111 if (e == q) {
5112 if (!consumed) {
5113 errmsg = "truncated data";
5114 startinpos = ((const char *)q) - starts;
5115 endinpos = ((const char *)e) + 1 - starts;
5116 outpos = p - PyUnicode_AS_UNICODE(unicode);
5117 if (unicode_decode_call_errorhandler(
5118 errors,
5119 &errorHandler,
5120 "utf16", errmsg,
5121 &starts,
5122 (const char **)&e,
5123 &startinpos,
5124 &endinpos,
5125 &exc,
5126 (const char **)&q,
5127 &unicode,
5128 &outpos,
5129 &p))
5130 goto onError;
5131 /* The remaining input chars are ignored if the callback
5132 chooses to skip the input */
5133 }
5134 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005135
5136 if (byteorder)
5137 *byteorder = bo;
5138
Walter Dörwald69652032004-09-07 20:24:22 +00005139 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005140 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005141
Guido van Rossumd57fd912000-03-10 22:53:23 +00005142 /* Adjust length */
Victor Stinnerfe226c02011-10-03 03:52:20 +02005143 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005144 goto onError;
5145
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005146 Py_XDECREF(errorHandler);
5147 Py_XDECREF(exc);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005148 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005149 Py_DECREF(unicode);
5150 return NULL;
5151 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005152 return (PyObject *)unicode;
5153
Benjamin Peterson29060642009-01-31 22:14:21 +00005154 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005155 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005156 Py_XDECREF(errorHandler);
5157 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005158 return NULL;
5159}
5160
Antoine Pitrouab868312009-01-10 15:40:25 +00005161#undef FAST_CHAR_MASK
5162#undef SWAPPED_FAST_CHAR_MASK
5163
Tim Peters772747b2001-08-09 22:21:55 +00005164PyObject *
5165PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005166 Py_ssize_t size,
5167 const char *errors,
5168 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005169{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005170 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00005171 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005172 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005173#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005174 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005175#else
5176 const int pairs = 0;
5177#endif
Tim Peters772747b2001-08-09 22:21:55 +00005178 /* Offsets from p for storing byte pairs in the right order. */
5179#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5180 int ihi = 1, ilo = 0;
5181#else
5182 int ihi = 0, ilo = 1;
5183#endif
5184
Benjamin Peterson29060642009-01-31 22:14:21 +00005185#define STORECHAR(CH) \
5186 do { \
5187 p[ihi] = ((CH) >> 8) & 0xff; \
5188 p[ilo] = (CH) & 0xff; \
5189 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00005190 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005191
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005192#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005193 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00005194 if (s[i] >= 0x10000)
5195 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005196#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005197 /* 2 * (size + pairs + (byteorder == 0)) */
5198 if (size > PY_SSIZE_T_MAX ||
5199 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005200 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005201 nsize = size + pairs + (byteorder == 0);
5202 bytesize = nsize * 2;
5203 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005204 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005205 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005206 if (v == NULL)
5207 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005208
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005209 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005210 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005211 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005212 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005213 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005214
5215 if (byteorder == -1) {
5216 /* force LE */
5217 ihi = 1;
5218 ilo = 0;
5219 }
5220 else if (byteorder == 1) {
5221 /* force BE */
5222 ihi = 0;
5223 ilo = 1;
5224 }
5225
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005226 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005227 Py_UNICODE ch = *s++;
5228 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005229#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005230 if (ch >= 0x10000) {
5231 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
5232 ch = 0xD800 | ((ch-0x10000) >> 10);
5233 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005234#endif
Tim Peters772747b2001-08-09 22:21:55 +00005235 STORECHAR(ch);
5236 if (ch2)
5237 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005238 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005239
5240 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005241 return v;
Tim Peters772747b2001-08-09 22:21:55 +00005242#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005243}
5244
Alexander Belopolsky40018472011-02-26 01:02:56 +00005245PyObject *
5246PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005247{
5248 if (!PyUnicode_Check(unicode)) {
5249 PyErr_BadArgument();
5250 return NULL;
5251 }
5252 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005253 PyUnicode_GET_SIZE(unicode),
5254 NULL,
5255 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005256}
5257
5258/* --- Unicode Escape Codec ----------------------------------------------- */
5259
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005260/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
5261 if all the escapes in the string make it still a valid ASCII string.
5262 Returns -1 if any escapes were found which cause the string to
5263 pop out of ASCII range. Otherwise returns the length of the
5264 required buffer to hold the string.
5265 */
5266Py_ssize_t
5267length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
5268{
5269 const unsigned char *p = (const unsigned char *)s;
5270 const unsigned char *end = p + size;
5271 Py_ssize_t length = 0;
5272
5273 if (size < 0)
5274 return -1;
5275
5276 for (; p < end; ++p) {
5277 if (*p > 127) {
5278 /* Non-ASCII */
5279 return -1;
5280 }
5281 else if (*p != '\\') {
5282 /* Normal character */
5283 ++length;
5284 }
5285 else {
5286 /* Backslash-escape, check next char */
5287 ++p;
5288 /* Escape sequence reaches till end of string or
5289 non-ASCII follow-up. */
5290 if (p >= end || *p > 127)
5291 return -1;
5292 switch (*p) {
5293 case '\n':
5294 /* backslash + \n result in zero characters */
5295 break;
5296 case '\\': case '\'': case '\"':
5297 case 'b': case 'f': case 't':
5298 case 'n': case 'r': case 'v': case 'a':
5299 ++length;
5300 break;
5301 case '0': case '1': case '2': case '3':
5302 case '4': case '5': case '6': case '7':
5303 case 'x': case 'u': case 'U': case 'N':
5304 /* these do not guarantee ASCII characters */
5305 return -1;
5306 default:
5307 /* count the backslash + the other character */
5308 length += 2;
5309 }
5310 }
5311 }
5312 return length;
5313}
5314
/* Similar to PyUnicode_WRITE but either write into wstr field
   or treat string as ASCII.

   When kind is PyUnicode_WCHAR_KIND, buf is treated as a Py_UNICODE array;
   for any other kind it is treated as an unsigned char array.  Only one of
   the two branches runs, so index and value are evaluated exactly once. */
#define WRITE_ASCII_OR_WSTR(kind, buf, index, value)                    \
    do {                                                                \
        if ((kind) != PyUnicode_WCHAR_KIND)                             \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
        else                                                            \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value);       \
    } while (0)

/* Store value at buf[index], treating buf as a Py_UNICODE array.

   NOTE: this macro reads a variable named `kind` from the scope in which it
   is expanded and asserts that it is PyUnicode_WCHAR_KIND.  It is wrapped in
   do { } while (0) so that it always behaves as a single statement. */
#define WRITE_WSTR(buf, index, value)                                   \
    do {                                                                \
        assert(kind == PyUnicode_WCHAR_KIND);                           \
        ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value);           \
    } while (0)
5328
5329
Fredrik Lundh06d12682001-01-24 07:59:11 +00005330static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005331
Alexander Belopolsky40018472011-02-26 01:02:56 +00005332PyObject *
5333PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005334 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005335 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005336{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005337 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005338 Py_ssize_t startinpos;
5339 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005340 int j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005341 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005342 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005343 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005344 char* message;
5345 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005346 PyObject *errorHandler = NULL;
5347 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005348 Py_ssize_t ascii_length;
5349 Py_ssize_t i;
5350 int kind;
5351 void *data;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005352
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005353 ascii_length = length_of_escaped_ascii_string(s, size);
5354
5355 /* After length_of_escaped_ascii_string() there are two alternatives,
5356 either the string is pure ASCII with named escapes like \n, etc.
5357 and we determined it's exact size (common case)
5358 or it contains \x, \u, ... escape sequences. then we create a
5359 legacy wchar string and resize it at the end of this function. */
5360 if (ascii_length >= 0) {
5361 v = (PyUnicodeObject *)PyUnicode_New(ascii_length, 127);
5362 if (!v)
5363 goto onError;
5364 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
5365 kind = PyUnicode_1BYTE_KIND;
5366 data = PyUnicode_DATA(v);
5367 }
5368 else {
5369 /* Escaped strings will always be longer than the resulting
5370 Unicode string, so we start with size here and then reduce the
5371 length after conversion to the true value.
5372 (but if the error callback returns a long replacement string
5373 we'll have to allocate more space) */
5374 v = _PyUnicode_New(size);
5375 if (!v)
5376 goto onError;
5377 kind = PyUnicode_WCHAR_KIND;
5378 data = PyUnicode_AS_UNICODE(v);
5379 }
5380
Guido van Rossumd57fd912000-03-10 22:53:23 +00005381 if (size == 0)
5382 return (PyObject *)v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005383 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005384 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005385
Guido van Rossumd57fd912000-03-10 22:53:23 +00005386 while (s < end) {
5387 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005388 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005389 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005390
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005391 if (kind == PyUnicode_WCHAR_KIND) {
5392 assert(i < _PyUnicode_WSTR_LENGTH(v));
5393 }
5394 else {
5395 /* The only case in which i == ascii_length is a backslash
5396 followed by a newline. */
5397 assert(i <= ascii_length);
5398 }
5399
Guido van Rossumd57fd912000-03-10 22:53:23 +00005400 /* Non-escape characters are interpreted as Unicode ordinals */
5401 if (*s != '\\') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005402 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char) *s++);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005403 continue;
5404 }
5405
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005406 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005407 /* \ - Escapes */
5408 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005409 c = *s++;
5410 if (s > end)
5411 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005412
5413 if (kind == PyUnicode_WCHAR_KIND) {
5414 assert(i < _PyUnicode_WSTR_LENGTH(v));
5415 }
5416 else {
5417 /* The only case in which i == ascii_length is a backslash
5418 followed by a newline. */
5419 assert(i < ascii_length || (i == ascii_length && c == '\n'));
5420 }
5421
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005422 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005423
Benjamin Peterson29060642009-01-31 22:14:21 +00005424 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005425 case '\n': break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005426 case '\\': WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); break;
5427 case '\'': WRITE_ASCII_OR_WSTR(kind, data, i++, '\''); break;
5428 case '\"': WRITE_ASCII_OR_WSTR(kind, data, i++, '\"'); break;
5429 case 'b': WRITE_ASCII_OR_WSTR(kind, data, i++, '\b'); break;
5430 /* FF */
5431 case 'f': WRITE_ASCII_OR_WSTR(kind, data, i++, '\014'); break;
5432 case 't': WRITE_ASCII_OR_WSTR(kind, data, i++, '\t'); break;
5433 case 'n': WRITE_ASCII_OR_WSTR(kind, data, i++, '\n'); break;
5434 case 'r': WRITE_ASCII_OR_WSTR(kind, data, i++, '\r'); break;
5435 /* VT */
5436 case 'v': WRITE_ASCII_OR_WSTR(kind, data, i++, '\013'); break;
5437 /* BEL, not classic C */
5438 case 'a': WRITE_ASCII_OR_WSTR(kind, data, i++, '\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005439
Benjamin Peterson29060642009-01-31 22:14:21 +00005440 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005441 case '0': case '1': case '2': case '3':
5442 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005443 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005444 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005445 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005446 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005447 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005448 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005449 WRITE_WSTR(data, i++, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005450 break;
5451
Benjamin Peterson29060642009-01-31 22:14:21 +00005452 /* hex escapes */
5453 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005454 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005455 digits = 2;
5456 message = "truncated \\xXX escape";
5457 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005458
Benjamin Peterson29060642009-01-31 22:14:21 +00005459 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005460 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005461 digits = 4;
5462 message = "truncated \\uXXXX escape";
5463 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005464
Benjamin Peterson29060642009-01-31 22:14:21 +00005465 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005466 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005467 digits = 8;
5468 message = "truncated \\UXXXXXXXX escape";
5469 hexescape:
5470 chr = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005471 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005472 if (s+digits>end) {
5473 endinpos = size;
5474 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005475 errors, &errorHandler,
5476 "unicodeescape", "end of string in escape sequence",
5477 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005478 &v, &i, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005479 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005480 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005481 goto nextByte;
5482 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005483 for (j = 0; j < digits; ++j) {
5484 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005485 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005486 endinpos = (s+j+1)-starts;
5487 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005488 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005489 errors, &errorHandler,
5490 "unicodeescape", message,
5491 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005492 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005493 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005494 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005495 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005496 }
5497 chr = (chr<<4) & ~0xF;
5498 if (c >= '0' && c <= '9')
5499 chr += c - '0';
5500 else if (c >= 'a' && c <= 'f')
5501 chr += 10 + c - 'a';
5502 else
5503 chr += 10 + c - 'A';
5504 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005505 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005506 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005507 /* _decoding_error will have already written into the
5508 target buffer. */
5509 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005510 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005511 /* when we get here, chr is a 32-bit unicode character */
5512 if (chr <= 0xffff)
5513 /* UCS-2 character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005514 WRITE_WSTR(data, i++, chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005515 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005516 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00005517 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00005518#ifdef Py_UNICODE_WIDE
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005519 WRITE_WSTR(data, i++, chr);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005520#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00005521 chr -= 0x10000L;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005522 WRITE_WSTR(data, i++, 0xD800 + (Py_UNICODE) (chr >> 10));
5523 WRITE_WSTR(data, i++, 0xDC00 + (Py_UNICODE) (chr & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005524#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00005525 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005526 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005527 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005528 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005529 errors, &errorHandler,
5530 "unicodeescape", "illegal Unicode character",
5531 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005532 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005533 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005534 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005535 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005536 break;
5537
Benjamin Peterson29060642009-01-31 22:14:21 +00005538 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005539 case 'N':
5540 message = "malformed \\N character escape";
5541 if (ucnhash_CAPI == NULL) {
5542 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005543 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5544 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005545 if (ucnhash_CAPI == NULL)
5546 goto ucnhashError;
5547 }
5548 if (*s == '{') {
5549 const char *start = s+1;
5550 /* look for the closing brace */
5551 while (*s != '}' && s < end)
5552 s++;
5553 if (s > start && s < end && *s == '}') {
5554 /* found a name. look it up in the unicode database */
5555 message = "unknown Unicode character name";
5556 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005557 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
5558 &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005559 goto store;
5560 }
5561 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005562 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005563 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005564 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005565 errors, &errorHandler,
5566 "unicodeescape", message,
5567 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005568 &v, &i, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005569 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005570 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005571 break;
5572
5573 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005574 if (s > end) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005575 assert(kind == PyUnicode_WCHAR_KIND);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005576 message = "\\ at end of string";
5577 s--;
5578 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005579 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005580 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005581 errors, &errorHandler,
5582 "unicodeescape", message,
5583 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005584 &v, &i, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00005585 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005586 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005587 }
5588 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005589 WRITE_ASCII_OR_WSTR(kind, data, i++, '\\');
5590 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005591 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005592 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005593 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005594 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005595 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005596 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005597 /* Ensure the length prediction worked in case of ASCII strings */
5598 assert(kind == PyUnicode_WCHAR_KIND || i == ascii_length);
5599
Victor Stinnerfe226c02011-10-03 03:52:20 +02005600 if (kind == PyUnicode_WCHAR_KIND)
5601 {
5602 if (PyUnicode_Resize((PyObject**)&v, i) < 0)
5603 goto onError;
Victor Stinnerfe226c02011-10-03 03:52:20 +02005604 }
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005605 Py_XDECREF(errorHandler);
5606 Py_XDECREF(exc);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005607 if (_PyUnicode_READY_REPLACE(&v)) {
5608 Py_DECREF(v);
5609 return NULL;
5610 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005611 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00005612
Benjamin Peterson29060642009-01-31 22:14:21 +00005613 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005614 PyErr_SetString(
5615 PyExc_UnicodeError,
5616 "\\N escapes not supported (can't load unicodedata module)"
5617 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005618 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005619 Py_XDECREF(errorHandler);
5620 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005621 return NULL;
5622
Benjamin Peterson29060642009-01-31 22:14:21 +00005623 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005624 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005625 Py_XDECREF(errorHandler);
5626 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005627 return NULL;
5628}
5629
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005630#undef WRITE_ASCII_OR_WSTR
5631#undef WRITE_WSTR
5632
Guido van Rossumd57fd912000-03-10 22:53:23 +00005633/* Return a Unicode-Escape string version of the Unicode object.
5634
5635 If quotes is true, the string is enclosed in u"" or u'' quotes as
5636 appropriate.
5637
5638*/
5639
Walter Dörwald79e913e2007-05-12 11:08:06 +00005640static const char *hexdigits = "0123456789abcdef";
5641
Alexander Belopolsky40018472011-02-26 01:02:56 +00005642PyObject *
5643PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005644 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005645{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005646 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005647 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005648
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005649#ifdef Py_UNICODE_WIDE
5650 const Py_ssize_t expandsize = 10;
5651#else
5652 const Py_ssize_t expandsize = 6;
5653#endif
5654
Thomas Wouters89f507f2006-12-13 04:49:30 +00005655 /* XXX(nnorwitz): rather than over-allocating, it would be
5656 better to choose a different scheme. Perhaps scan the
5657 first N-chars of the string and allocate based on that size.
5658 */
5659 /* Initial allocation is based on the longest-possible unichr
5660 escape.
5661
5662 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5663 unichr, so in this case it's the longest unichr escape. In
5664 narrow (UTF-16) builds this is five chars per source unichr
5665 since there are two unichrs in the surrogate pair, so in narrow
5666 (UTF-16) builds it's not the longest unichr escape.
5667
5668 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5669 so in the narrow (UTF-16) build case it's the longest unichr
5670 escape.
5671 */
5672
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005673 if (size == 0)
5674 return PyBytes_FromStringAndSize(NULL, 0);
5675
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005676 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005677 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005678
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005679 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005680 2
5681 + expandsize*size
5682 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005683 if (repr == NULL)
5684 return NULL;
5685
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005686 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005687
Guido van Rossumd57fd912000-03-10 22:53:23 +00005688 while (size-- > 0) {
5689 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005690
Walter Dörwald79e913e2007-05-12 11:08:06 +00005691 /* Escape backslashes */
5692 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005693 *p++ = '\\';
5694 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005695 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005696 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005697
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00005698#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005699 /* Map 21-bit characters to '\U00xxxxxx' */
5700 else if (ch >= 0x10000) {
5701 *p++ = '\\';
5702 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005703 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
5704 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
5705 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
5706 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
5707 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
5708 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
5709 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
5710 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005711 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005712 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005713#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005714 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5715 else if (ch >= 0xD800 && ch < 0xDC00) {
5716 Py_UNICODE ch2;
5717 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00005718
Benjamin Peterson29060642009-01-31 22:14:21 +00005719 ch2 = *s++;
5720 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005721 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005722 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5723 *p++ = '\\';
5724 *p++ = 'U';
5725 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
5726 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
5727 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
5728 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
5729 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
5730 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
5731 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
5732 *p++ = hexdigits[ucs & 0x0000000F];
5733 continue;
5734 }
5735 /* Fall through: isolated surrogates are copied as-is */
5736 s--;
5737 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005738 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005739#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005740
Guido van Rossumd57fd912000-03-10 22:53:23 +00005741 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005742 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005743 *p++ = '\\';
5744 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005745 *p++ = hexdigits[(ch >> 12) & 0x000F];
5746 *p++ = hexdigits[(ch >> 8) & 0x000F];
5747 *p++ = hexdigits[(ch >> 4) & 0x000F];
5748 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005749 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005750
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005751 /* Map special whitespace to '\t', \n', '\r' */
5752 else if (ch == '\t') {
5753 *p++ = '\\';
5754 *p++ = 't';
5755 }
5756 else if (ch == '\n') {
5757 *p++ = '\\';
5758 *p++ = 'n';
5759 }
5760 else if (ch == '\r') {
5761 *p++ = '\\';
5762 *p++ = 'r';
5763 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005764
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005765 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005766 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005767 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005768 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005769 *p++ = hexdigits[(ch >> 4) & 0x000F];
5770 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005771 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005772
Guido van Rossumd57fd912000-03-10 22:53:23 +00005773 /* Copy everything else as-is */
5774 else
5775 *p++ = (char) ch;
5776 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005777
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005778 assert(p - PyBytes_AS_STRING(repr) > 0);
5779 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5780 return NULL;
5781 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005782}
5783
Alexander Belopolsky40018472011-02-26 01:02:56 +00005784PyObject *
5785PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005786{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005787 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005788 if (!PyUnicode_Check(unicode)) {
5789 PyErr_BadArgument();
5790 return NULL;
5791 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00005792 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5793 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005794 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005795}
5796
5797/* --- Raw Unicode Escape Codec ------------------------------------------- */
5798
Alexander Belopolsky40018472011-02-26 01:02:56 +00005799PyObject *
5800PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005801 Py_ssize_t size,
5802 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005803{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005804 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005805 Py_ssize_t startinpos;
5806 Py_ssize_t endinpos;
5807 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005808 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005809 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005810 const char *end;
5811 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005812 PyObject *errorHandler = NULL;
5813 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005814
Guido van Rossumd57fd912000-03-10 22:53:23 +00005815 /* Escaped strings will always be longer than the resulting
5816 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005817 length after conversion to the true value. (But decoding error
5818 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005819 v = _PyUnicode_New(size);
5820 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005821 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005822 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005823 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005824 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005825 end = s + size;
5826 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005827 unsigned char c;
5828 Py_UCS4 x;
5829 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005830 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005831
Benjamin Peterson29060642009-01-31 22:14:21 +00005832 /* Non-escape characters are interpreted as Unicode ordinals */
5833 if (*s != '\\') {
5834 *p++ = (unsigned char)*s++;
5835 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005836 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005837 startinpos = s-starts;
5838
5839 /* \u-escapes are only interpreted iff the number of leading
5840 backslashes if odd */
5841 bs = s;
5842 for (;s < end;) {
5843 if (*s != '\\')
5844 break;
5845 *p++ = (unsigned char)*s++;
5846 }
5847 if (((s - bs) & 1) == 0 ||
5848 s >= end ||
5849 (*s != 'u' && *s != 'U')) {
5850 continue;
5851 }
5852 p--;
5853 count = *s=='u' ? 4 : 8;
5854 s++;
5855
5856 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
5857 outpos = p-PyUnicode_AS_UNICODE(v);
5858 for (x = 0, i = 0; i < count; ++i, ++s) {
5859 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00005860 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005861 endinpos = s-starts;
5862 if (unicode_decode_call_errorhandler(
5863 errors, &errorHandler,
5864 "rawunicodeescape", "truncated \\uXXXX",
5865 &starts, &end, &startinpos, &endinpos, &exc, &s,
5866 &v, &outpos, &p))
5867 goto onError;
5868 goto nextByte;
5869 }
5870 x = (x<<4) & ~0xF;
5871 if (c >= '0' && c <= '9')
5872 x += c - '0';
5873 else if (c >= 'a' && c <= 'f')
5874 x += 10 + c - 'a';
5875 else
5876 x += 10 + c - 'A';
5877 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00005878 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00005879 /* UCS-2 character */
5880 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005881 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005882 /* UCS-4 character. Either store directly, or as
5883 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00005884#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005885 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005886#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005887 x -= 0x10000L;
5888 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
5889 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00005890#endif
5891 } else {
5892 endinpos = s-starts;
5893 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005894 if (unicode_decode_call_errorhandler(
5895 errors, &errorHandler,
5896 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00005897 &starts, &end, &startinpos, &endinpos, &exc, &s,
5898 &v, &outpos, &p))
5899 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005900 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005901 nextByte:
5902 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005903 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02005904 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005905 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005906 Py_XDECREF(errorHandler);
5907 Py_XDECREF(exc);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005908 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005909 Py_DECREF(v);
5910 return NULL;
5911 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005912 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00005913
Benjamin Peterson29060642009-01-31 22:14:21 +00005914 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005915 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005916 Py_XDECREF(errorHandler);
5917 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005918 return NULL;
5919}
5920
Alexander Belopolsky40018472011-02-26 01:02:56 +00005921PyObject *
5922PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005923 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005924{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005925 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005926 char *p;
5927 char *q;
5928
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005929#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005930 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005931#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005932 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005933#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00005934
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005935 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005936 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00005937
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005938 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005939 if (repr == NULL)
5940 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005941 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005942 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005943
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005944 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005945 while (size-- > 0) {
5946 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005947#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005948 /* Map 32-bit characters to '\Uxxxxxxxx' */
5949 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005950 *p++ = '\\';
5951 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00005952 *p++ = hexdigits[(ch >> 28) & 0xf];
5953 *p++ = hexdigits[(ch >> 24) & 0xf];
5954 *p++ = hexdigits[(ch >> 20) & 0xf];
5955 *p++ = hexdigits[(ch >> 16) & 0xf];
5956 *p++ = hexdigits[(ch >> 12) & 0xf];
5957 *p++ = hexdigits[(ch >> 8) & 0xf];
5958 *p++ = hexdigits[(ch >> 4) & 0xf];
5959 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00005960 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005961 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00005962#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005963 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5964 if (ch >= 0xD800 && ch < 0xDC00) {
5965 Py_UNICODE ch2;
5966 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005967
Benjamin Peterson29060642009-01-31 22:14:21 +00005968 ch2 = *s++;
5969 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005970 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005971 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5972 *p++ = '\\';
5973 *p++ = 'U';
5974 *p++ = hexdigits[(ucs >> 28) & 0xf];
5975 *p++ = hexdigits[(ucs >> 24) & 0xf];
5976 *p++ = hexdigits[(ucs >> 20) & 0xf];
5977 *p++ = hexdigits[(ucs >> 16) & 0xf];
5978 *p++ = hexdigits[(ucs >> 12) & 0xf];
5979 *p++ = hexdigits[(ucs >> 8) & 0xf];
5980 *p++ = hexdigits[(ucs >> 4) & 0xf];
5981 *p++ = hexdigits[ucs & 0xf];
5982 continue;
5983 }
5984 /* Fall through: isolated surrogates are copied as-is */
5985 s--;
5986 size++;
5987 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005988#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005989 /* Map 16-bit characters to '\uxxxx' */
5990 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005991 *p++ = '\\';
5992 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00005993 *p++ = hexdigits[(ch >> 12) & 0xf];
5994 *p++ = hexdigits[(ch >> 8) & 0xf];
5995 *p++ = hexdigits[(ch >> 4) & 0xf];
5996 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005997 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005998 /* Copy everything else as-is */
5999 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006000 *p++ = (char) ch;
6001 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006002 size = p - q;
6003
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006004 assert(size > 0);
6005 if (_PyBytes_Resize(&repr, size) < 0)
6006 return NULL;
6007 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006008}
6009
Alexander Belopolsky40018472011-02-26 01:02:56 +00006010PyObject *
6011PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006012{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006013 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006014 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00006015 PyErr_BadArgument();
6016 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006017 }
Walter Dörwald711005d2007-05-12 12:03:26 +00006018 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
6019 PyUnicode_GET_SIZE(unicode));
6020
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006021 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006022}
6023
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006024/* --- Unicode Internal Codec ------------------------------------------- */
6025
Alexander Belopolsky40018472011-02-26 01:02:56 +00006026PyObject *
6027_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006028 Py_ssize_t size,
6029 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006030{
6031 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006032 Py_ssize_t startinpos;
6033 Py_ssize_t endinpos;
6034 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006035 PyUnicodeObject *v;
6036 Py_UNICODE *p;
6037 const char *end;
6038 const char *reason;
6039 PyObject *errorHandler = NULL;
6040 PyObject *exc = NULL;
6041
Neal Norwitzd43069c2006-01-08 01:12:10 +00006042#ifdef Py_UNICODE_WIDE
6043 Py_UNICODE unimax = PyUnicode_GetMax();
6044#endif
6045
Thomas Wouters89f507f2006-12-13 04:49:30 +00006046 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006047 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
6048 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006049 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006050 /* Intentionally PyUnicode_GET_SIZE instead of PyUnicode_GET_LENGTH
6051 as string was created with the old API. */
6052 if (PyUnicode_GET_SIZE(v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006053 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006054 p = PyUnicode_AS_UNICODE(v);
6055 end = s + size;
6056
6057 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006058 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006059 /* We have to sanity check the raw data, otherwise doom looms for
6060 some malformed UCS-4 data. */
6061 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006062#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006063 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006064#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006065 end-s < Py_UNICODE_SIZE
6066 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006067 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006068 startinpos = s - starts;
6069 if (end-s < Py_UNICODE_SIZE) {
6070 endinpos = end-starts;
6071 reason = "truncated input";
6072 }
6073 else {
6074 endinpos = s - starts + Py_UNICODE_SIZE;
6075 reason = "illegal code point (> 0x10FFFF)";
6076 }
6077 outpos = p - PyUnicode_AS_UNICODE(v);
6078 if (unicode_decode_call_errorhandler(
6079 errors, &errorHandler,
6080 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006081 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00006082 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006083 goto onError;
6084 }
6085 }
6086 else {
6087 p++;
6088 s += Py_UNICODE_SIZE;
6089 }
6090 }
6091
Victor Stinnerfe226c02011-10-03 03:52:20 +02006092 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006093 goto onError;
6094 Py_XDECREF(errorHandler);
6095 Py_XDECREF(exc);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006096 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006097 Py_DECREF(v);
6098 return NULL;
6099 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006100 return (PyObject *)v;
6101
Benjamin Peterson29060642009-01-31 22:14:21 +00006102 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006103 Py_XDECREF(v);
6104 Py_XDECREF(errorHandler);
6105 Py_XDECREF(exc);
6106 return NULL;
6107}
6108
Guido van Rossumd57fd912000-03-10 22:53:23 +00006109/* --- Latin-1 Codec ------------------------------------------------------ */
6110
Alexander Belopolsky40018472011-02-26 01:02:56 +00006111PyObject *
6112PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006113 Py_ssize_t size,
6114 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006115{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006116 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006117 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006118}
6119
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006120/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006121static void
6122make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006123 const char *encoding,
6124 const Py_UNICODE *unicode, Py_ssize_t size,
6125 Py_ssize_t startpos, Py_ssize_t endpos,
6126 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006127{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006128 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006129 *exceptionObject = PyUnicodeEncodeError_Create(
6130 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006131 }
6132 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006133 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6134 goto onError;
6135 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6136 goto onError;
6137 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6138 goto onError;
6139 return;
6140 onError:
6141 Py_DECREF(*exceptionObject);
6142 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006143 }
6144}
6145
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006146/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006147static void
6148raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006149 const char *encoding,
6150 const Py_UNICODE *unicode, Py_ssize_t size,
6151 Py_ssize_t startpos, Py_ssize_t endpos,
6152 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006153{
6154 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006155 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006156 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006157 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006158}
6159
6160/* error handling callback helper:
6161 build arguments, call the callback and check the arguments,
6162 put the result into newpos and return the replacement string, which
6163 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006164static PyObject *
6165unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006166 PyObject **errorHandler,
6167 const char *encoding, const char *reason,
6168 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
6169 Py_ssize_t startpos, Py_ssize_t endpos,
6170 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006171{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006172 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006173
6174 PyObject *restuple;
6175 PyObject *resunicode;
6176
6177 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006178 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006179 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006180 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006181 }
6182
6183 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006184 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006185 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006186 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006187
6188 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006189 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006190 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006191 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006192 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006193 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006194 Py_DECREF(restuple);
6195 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006196 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006197 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006198 &resunicode, newpos)) {
6199 Py_DECREF(restuple);
6200 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006201 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006202 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6203 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6204 Py_DECREF(restuple);
6205 return NULL;
6206 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006207 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006208 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006209 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006210 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6211 Py_DECREF(restuple);
6212 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006213 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006214 Py_INCREF(resunicode);
6215 Py_DECREF(restuple);
6216 return resunicode;
6217}
6218
Alexander Belopolsky40018472011-02-26 01:02:56 +00006219static PyObject *
6220unicode_encode_ucs1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006221 Py_ssize_t size,
6222 const char *errors,
6223 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006224{
6225 /* output object */
6226 PyObject *res;
6227 /* pointers to the beginning and end+1 of input */
6228 const Py_UNICODE *startp = p;
6229 const Py_UNICODE *endp = p + size;
6230 /* pointer to the beginning of the unencodable characters */
6231 /* const Py_UNICODE *badp = NULL; */
6232 /* pointer into the output */
6233 char *str;
6234 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006235 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006236 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6237 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006238 PyObject *errorHandler = NULL;
6239 PyObject *exc = NULL;
6240 /* the following variable is used for caching string comparisons
6241 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6242 int known_errorHandler = -1;
6243
6244 /* allocate enough for a simple encoding without
6245 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006246 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006247 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006248 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006249 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006250 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006251 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006252 ressize = size;
6253
6254 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006255 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006256
Benjamin Peterson29060642009-01-31 22:14:21 +00006257 /* can we encode this? */
6258 if (c<limit) {
6259 /* no overflow check, because we know that the space is enough */
6260 *str++ = (char)c;
6261 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006262 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006263 else {
6264 Py_ssize_t unicodepos = p-startp;
6265 Py_ssize_t requiredsize;
6266 PyObject *repunicode;
6267 Py_ssize_t repsize;
6268 Py_ssize_t newpos;
6269 Py_ssize_t respos;
6270 Py_UNICODE *uni2;
6271 /* startpos for collecting unencodable chars */
6272 const Py_UNICODE *collstart = p;
6273 const Py_UNICODE *collend = p;
6274 /* find all unecodable characters */
6275 while ((collend < endp) && ((*collend)>=limit))
6276 ++collend;
6277 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6278 if (known_errorHandler==-1) {
6279 if ((errors==NULL) || (!strcmp(errors, "strict")))
6280 known_errorHandler = 1;
6281 else if (!strcmp(errors, "replace"))
6282 known_errorHandler = 2;
6283 else if (!strcmp(errors, "ignore"))
6284 known_errorHandler = 3;
6285 else if (!strcmp(errors, "xmlcharrefreplace"))
6286 known_errorHandler = 4;
6287 else
6288 known_errorHandler = 0;
6289 }
6290 switch (known_errorHandler) {
6291 case 1: /* strict */
6292 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
6293 goto onError;
6294 case 2: /* replace */
6295 while (collstart++<collend)
6296 *str++ = '?'; /* fall through */
6297 case 3: /* ignore */
6298 p = collend;
6299 break;
6300 case 4: /* xmlcharrefreplace */
6301 respos = str - PyBytes_AS_STRING(res);
6302 /* determine replacement size (temporarily (mis)uses p) */
6303 for (p = collstart, repsize = 0; p < collend; ++p) {
6304 if (*p<10)
6305 repsize += 2+1+1;
6306 else if (*p<100)
6307 repsize += 2+2+1;
6308 else if (*p<1000)
6309 repsize += 2+3+1;
6310 else if (*p<10000)
6311 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006312#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006313 else
6314 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006315#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006316 else if (*p<100000)
6317 repsize += 2+5+1;
6318 else if (*p<1000000)
6319 repsize += 2+6+1;
6320 else
6321 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00006322#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00006323 }
6324 requiredsize = respos+repsize+(endp-collend);
6325 if (requiredsize > ressize) {
6326 if (requiredsize<2*ressize)
6327 requiredsize = 2*ressize;
6328 if (_PyBytes_Resize(&res, requiredsize))
6329 goto onError;
6330 str = PyBytes_AS_STRING(res) + respos;
6331 ressize = requiredsize;
6332 }
6333 /* generate replacement (temporarily (mis)uses p) */
6334 for (p = collstart; p < collend; ++p) {
6335 str += sprintf(str, "&#%d;", (int)*p);
6336 }
6337 p = collend;
6338 break;
6339 default:
6340 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6341 encoding, reason, startp, size, &exc,
6342 collstart-startp, collend-startp, &newpos);
6343 if (repunicode == NULL)
6344 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006345 if (PyBytes_Check(repunicode)) {
6346 /* Directly copy bytes result to output. */
6347 repsize = PyBytes_Size(repunicode);
6348 if (repsize > 1) {
6349 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006350 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006351 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6352 Py_DECREF(repunicode);
6353 goto onError;
6354 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006355 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006356 ressize += repsize-1;
6357 }
6358 memcpy(str, PyBytes_AsString(repunicode), repsize);
6359 str += repsize;
6360 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006361 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006362 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006363 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006364 /* need more space? (at least enough for what we
6365 have+the replacement+the rest of the string, so
6366 we won't have to check space for encodable characters) */
6367 respos = str - PyBytes_AS_STRING(res);
6368 repsize = PyUnicode_GET_SIZE(repunicode);
6369 requiredsize = respos+repsize+(endp-collend);
6370 if (requiredsize > ressize) {
6371 if (requiredsize<2*ressize)
6372 requiredsize = 2*ressize;
6373 if (_PyBytes_Resize(&res, requiredsize)) {
6374 Py_DECREF(repunicode);
6375 goto onError;
6376 }
6377 str = PyBytes_AS_STRING(res) + respos;
6378 ressize = requiredsize;
6379 }
6380 /* check if there is anything unencodable in the replacement
6381 and copy it to the output */
6382 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
6383 c = *uni2;
6384 if (c >= limit) {
6385 raise_encode_exception(&exc, encoding, startp, size,
6386 unicodepos, unicodepos+1, reason);
6387 Py_DECREF(repunicode);
6388 goto onError;
6389 }
6390 *str = (char)c;
6391 }
6392 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006393 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006394 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006395 }
6396 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006397 /* Resize if we allocated to much */
6398 size = str - PyBytes_AS_STRING(res);
6399 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006400 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006401 if (_PyBytes_Resize(&res, size) < 0)
6402 goto onError;
6403 }
6404
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006405 Py_XDECREF(errorHandler);
6406 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006407 return res;
6408
6409 onError:
6410 Py_XDECREF(res);
6411 Py_XDECREF(errorHandler);
6412 Py_XDECREF(exc);
6413 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006414}
6415
Alexander Belopolsky40018472011-02-26 01:02:56 +00006416PyObject *
6417PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006418 Py_ssize_t size,
6419 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006420{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006421 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006422}
6423
Alexander Belopolsky40018472011-02-26 01:02:56 +00006424PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006425_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006426{
6427 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006428 PyErr_BadArgument();
6429 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006430 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006431 if (PyUnicode_READY(unicode) == -1)
6432 return NULL;
6433 /* Fast path: if it is a one-byte string, construct
6434 bytes object directly. */
6435 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6436 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6437 PyUnicode_GET_LENGTH(unicode));
6438 /* Non-Latin-1 characters present. Defer to above function to
6439 raise the exception. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006440 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006441 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006442 errors);
6443}
6444
6445PyObject*
6446PyUnicode_AsLatin1String(PyObject *unicode)
6447{
6448 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006449}
6450
6451/* --- 7-bit ASCII Codec -------------------------------------------------- */
6452
Alexander Belopolsky40018472011-02-26 01:02:56 +00006453PyObject *
6454PyUnicode_DecodeASCII(const char *s,
6455 Py_ssize_t size,
6456 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006457{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006458 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006459 PyUnicodeObject *v;
6460 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006461 Py_ssize_t startinpos;
6462 Py_ssize_t endinpos;
6463 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006464 const char *e;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006465 unsigned char* d;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006466 PyObject *errorHandler = NULL;
6467 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006468 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00006469
Guido van Rossumd57fd912000-03-10 22:53:23 +00006470 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006471 if (size == 1 && *(unsigned char*)s < 128)
6472 return PyUnicode_FromOrdinal(*(unsigned char*)s);
6473
6474 /* Fast path. Assume the input actually *is* ASCII, and allocate
6475 a single-block Unicode object with that assumption. If there is
6476 an error, drop the object and start over. */
6477 v = (PyUnicodeObject*)PyUnicode_New(size, 127);
6478 if (v == NULL)
6479 goto onError;
6480 d = PyUnicode_1BYTE_DATA(v);
6481 for (i = 0; i < size; i++) {
6482 unsigned char ch = ((unsigned char*)s)[i];
6483 if (ch < 128)
6484 d[i] = ch;
6485 else
6486 break;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006487 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006488 if (i == size)
6489 return (PyObject*)v;
6490 Py_DECREF(v); /* start over */
Tim Petersced69f82003-09-16 20:30:58 +00006491
Guido van Rossumd57fd912000-03-10 22:53:23 +00006492 v = _PyUnicode_New(size);
6493 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006494 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006495 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006496 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006497 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006498 e = s + size;
6499 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006500 register unsigned char c = (unsigned char)*s;
6501 if (c < 128) {
6502 *p++ = c;
6503 ++s;
6504 }
6505 else {
6506 startinpos = s-starts;
6507 endinpos = startinpos + 1;
6508 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
6509 if (unicode_decode_call_errorhandler(
6510 errors, &errorHandler,
6511 "ascii", "ordinal not in range(128)",
6512 &starts, &e, &startinpos, &endinpos, &exc, &s,
6513 &v, &outpos, &p))
6514 goto onError;
6515 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006516 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00006517 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Victor Stinnerfe226c02011-10-03 03:52:20 +02006518 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006519 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006520 Py_XDECREF(errorHandler);
6521 Py_XDECREF(exc);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006522 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006523 Py_DECREF(v);
6524 return NULL;
6525 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006526 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006527
Benjamin Peterson29060642009-01-31 22:14:21 +00006528 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006529 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006530 Py_XDECREF(errorHandler);
6531 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006532 return NULL;
6533}
6534
Alexander Belopolsky40018472011-02-26 01:02:56 +00006535PyObject *
6536PyUnicode_EncodeASCII(const Py_UNICODE *p,
6537 Py_ssize_t size,
6538 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006539{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006540 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006541}
6542
Alexander Belopolsky40018472011-02-26 01:02:56 +00006543PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006544_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006545{
6546 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006547 PyErr_BadArgument();
6548 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006549 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006550 if (PyUnicode_READY(unicode) == -1)
6551 return NULL;
6552 /* Fast path: if it is an ASCII-only string, construct bytes object
6553 directly. Else defer to above function to raise the exception. */
6554 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6555 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6556 PyUnicode_GET_LENGTH(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006557 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006558 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006559 errors);
6560}
6561
6562PyObject *
6563PyUnicode_AsASCIIString(PyObject *unicode)
6564{
6565 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006566}
6567
Victor Stinner99b95382011-07-04 14:23:54 +02006568#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006569
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006570/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006571
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006572#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006573#define NEED_RETRY
6574#endif
6575
6576/* XXX This code is limited to "true" double-byte encodings, as
6577 a) it assumes an incomplete character consists of a single byte, and
6578 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00006579 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006580
Alexander Belopolsky40018472011-02-26 01:02:56 +00006581static int
6582is_dbcs_lead_byte(const char *s, int offset)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006583{
6584 const char *curr = s + offset;
6585
6586 if (IsDBCSLeadByte(*curr)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006587 const char *prev = CharPrev(s, curr);
6588 return (prev == curr) || !IsDBCSLeadByte(*prev) || (curr - prev == 2);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006589 }
6590 return 0;
6591}
6592
6593/*
6594 * Decode MBCS string into unicode object. If 'final' is set, converts
6595 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
6596 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006597static int
6598decode_mbcs(PyUnicodeObject **v,
6599 const char *s, /* MBCS string */
6600 int size, /* sizeof MBCS string */
6601 int final,
6602 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006603{
6604 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00006605 Py_ssize_t n;
6606 DWORD usize;
6607 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006608
6609 assert(size >= 0);
6610
Victor Stinner554f3f02010-06-16 23:33:54 +00006611 /* check and handle 'errors' arg */
6612 if (errors==NULL || strcmp(errors, "strict")==0)
6613 flags = MB_ERR_INVALID_CHARS;
6614 else if (strcmp(errors, "ignore")==0)
6615 flags = 0;
6616 else {
6617 PyErr_Format(PyExc_ValueError,
6618 "mbcs encoding does not support errors='%s'",
6619 errors);
6620 return -1;
6621 }
6622
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006623 /* Skip trailing lead-byte unless 'final' is set */
6624 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006625 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006626
6627 /* First get the size of the result */
6628 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006629 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
6630 if (usize==0)
6631 goto mbcs_decode_error;
6632 } else
6633 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006634
6635 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006636 /* Create unicode object */
6637 *v = _PyUnicode_New(usize);
6638 if (*v == NULL)
6639 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006640 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006641 }
6642 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006643 /* Extend unicode object */
6644 n = PyUnicode_GET_SIZE(*v);
Victor Stinner2fd82272011-10-03 04:06:05 +02006645 if (PyUnicode_Resize((PyObject**)v, n + usize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006646 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006647 }
6648
6649 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00006650 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006651 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006652 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
6653 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00006654 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006655 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006656 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00006657
6658mbcs_decode_error:
6659 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
6660 we raise a UnicodeDecodeError - else it is a 'generic'
6661 windows error
6662 */
6663 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
6664 /* Ideally, we should get reason from FormatMessage - this
6665 is the Windows 2000 English version of the message
6666 */
6667 PyObject *exc = NULL;
6668 const char *reason = "No mapping for the Unicode character exists "
6669 "in the target multi-byte code page.";
6670 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
6671 if (exc != NULL) {
6672 PyCodec_StrictErrors(exc);
6673 Py_DECREF(exc);
6674 }
6675 } else {
6676 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6677 }
6678 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006679}
6680
Alexander Belopolsky40018472011-02-26 01:02:56 +00006681PyObject *
6682PyUnicode_DecodeMBCSStateful(const char *s,
6683 Py_ssize_t size,
6684 const char *errors,
6685 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006686{
6687 PyUnicodeObject *v = NULL;
6688 int done;
6689
6690 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006691 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006692
6693#ifdef NEED_RETRY
6694 retry:
6695 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006696 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006697 else
6698#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006699 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006700
6701 if (done < 0) {
6702 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006703 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006704 }
6705
6706 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006707 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006708
6709#ifdef NEED_RETRY
6710 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006711 s += done;
6712 size -= done;
6713 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006714 }
6715#endif
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006716 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006717 Py_DECREF(v);
6718 return NULL;
6719 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006720 return (PyObject *)v;
6721}
6722
Alexander Belopolsky40018472011-02-26 01:02:56 +00006723PyObject *
6724PyUnicode_DecodeMBCS(const char *s,
6725 Py_ssize_t size,
6726 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006727{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006728 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
6729}
6730
6731/*
6732 * Convert unicode into string object (MBCS).
6733 * Returns 0 if succeed, -1 otherwise.
6734 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006735static int
6736encode_mbcs(PyObject **repr,
6737 const Py_UNICODE *p, /* unicode */
6738 int size, /* size of unicode */
6739 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006740{
Victor Stinner554f3f02010-06-16 23:33:54 +00006741 BOOL usedDefaultChar = FALSE;
6742 BOOL *pusedDefaultChar;
6743 int mbcssize;
6744 Py_ssize_t n;
6745 PyObject *exc = NULL;
6746 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006747
6748 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006749
Victor Stinner554f3f02010-06-16 23:33:54 +00006750 /* check and handle 'errors' arg */
6751 if (errors==NULL || strcmp(errors, "strict")==0) {
6752 flags = WC_NO_BEST_FIT_CHARS;
6753 pusedDefaultChar = &usedDefaultChar;
6754 } else if (strcmp(errors, "replace")==0) {
6755 flags = 0;
6756 pusedDefaultChar = NULL;
6757 } else {
6758 PyErr_Format(PyExc_ValueError,
6759 "mbcs encoding does not support errors='%s'",
6760 errors);
6761 return -1;
6762 }
6763
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006764 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006765 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006766 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
6767 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00006768 if (mbcssize == 0) {
6769 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6770 return -1;
6771 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006772 /* If we used a default char, then we failed! */
6773 if (pusedDefaultChar && *pusedDefaultChar)
6774 goto mbcs_encode_error;
6775 } else {
6776 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006777 }
6778
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006779 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006780 /* Create string object */
6781 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
6782 if (*repr == NULL)
6783 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006784 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006785 }
6786 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006787 /* Extend string object */
6788 n = PyBytes_Size(*repr);
6789 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
6790 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006791 }
6792
6793 /* Do the conversion */
6794 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006795 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006796 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
6797 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006798 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6799 return -1;
6800 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006801 if (pusedDefaultChar && *pusedDefaultChar)
6802 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006803 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006804 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00006805
6806mbcs_encode_error:
6807 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
6808 Py_XDECREF(exc);
6809 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006810}
6811
Alexander Belopolsky40018472011-02-26 01:02:56 +00006812PyObject *
6813PyUnicode_EncodeMBCS(const Py_UNICODE *p,
6814 Py_ssize_t size,
6815 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006816{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006817 PyObject *repr = NULL;
6818 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00006819
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006820#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00006821 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006822 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006823 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006824 else
6825#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006826 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006827
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006828 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006829 Py_XDECREF(repr);
6830 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006831 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006832
6833#ifdef NEED_RETRY
6834 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006835 p += INT_MAX;
6836 size -= INT_MAX;
6837 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006838 }
6839#endif
6840
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006841 return repr;
6842}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006843
Alexander Belopolsky40018472011-02-26 01:02:56 +00006844PyObject *
6845PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006846{
6847 if (!PyUnicode_Check(unicode)) {
6848 PyErr_BadArgument();
6849 return NULL;
6850 }
6851 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006852 PyUnicode_GET_SIZE(unicode),
6853 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006854}
6855
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006856#undef NEED_RETRY
6857
Victor Stinner99b95382011-07-04 14:23:54 +02006858#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006859
Guido van Rossumd57fd912000-03-10 22:53:23 +00006860/* --- Character Mapping Codec -------------------------------------------- */
6861
Alexander Belopolsky40018472011-02-26 01:02:56 +00006862PyObject *
6863PyUnicode_DecodeCharmap(const char *s,
6864 Py_ssize_t size,
6865 PyObject *mapping,
6866 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006867{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006868 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006869 Py_ssize_t startinpos;
6870 Py_ssize_t endinpos;
6871 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006872 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006873 PyUnicodeObject *v;
6874 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006875 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006876 PyObject *errorHandler = NULL;
6877 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006878 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006879 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006880
Guido van Rossumd57fd912000-03-10 22:53:23 +00006881 /* Default to Latin-1 */
6882 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006883 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006884
6885 v = _PyUnicode_New(size);
6886 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006887 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006888 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006889 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006890 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006891 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006892 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006893 mapstring = PyUnicode_AS_UNICODE(mapping);
6894 maplen = PyUnicode_GET_SIZE(mapping);
6895 while (s < e) {
6896 unsigned char ch = *s;
6897 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006898
Benjamin Peterson29060642009-01-31 22:14:21 +00006899 if (ch < maplen)
6900 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006901
Benjamin Peterson29060642009-01-31 22:14:21 +00006902 if (x == 0xfffe) {
6903 /* undefined mapping */
6904 outpos = p-PyUnicode_AS_UNICODE(v);
6905 startinpos = s-starts;
6906 endinpos = startinpos+1;
6907 if (unicode_decode_call_errorhandler(
6908 errors, &errorHandler,
6909 "charmap", "character maps to <undefined>",
6910 &starts, &e, &startinpos, &endinpos, &exc, &s,
6911 &v, &outpos, &p)) {
6912 goto onError;
6913 }
6914 continue;
6915 }
6916 *p++ = x;
6917 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006918 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006919 }
6920 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006921 while (s < e) {
6922 unsigned char ch = *s;
6923 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006924
Benjamin Peterson29060642009-01-31 22:14:21 +00006925 /* Get mapping (char ordinal -> integer, Unicode char or None) */
6926 w = PyLong_FromLong((long)ch);
6927 if (w == NULL)
6928 goto onError;
6929 x = PyObject_GetItem(mapping, w);
6930 Py_DECREF(w);
6931 if (x == NULL) {
6932 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
6933 /* No mapping found means: mapping is undefined. */
6934 PyErr_Clear();
6935 x = Py_None;
6936 Py_INCREF(x);
6937 } else
6938 goto onError;
6939 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006940
Benjamin Peterson29060642009-01-31 22:14:21 +00006941 /* Apply mapping */
6942 if (PyLong_Check(x)) {
6943 long value = PyLong_AS_LONG(x);
6944 if (value < 0 || value > 65535) {
6945 PyErr_SetString(PyExc_TypeError,
6946 "character mapping must be in range(65536)");
6947 Py_DECREF(x);
6948 goto onError;
6949 }
6950 *p++ = (Py_UNICODE)value;
6951 }
6952 else if (x == Py_None) {
6953 /* undefined mapping */
6954 outpos = p-PyUnicode_AS_UNICODE(v);
6955 startinpos = s-starts;
6956 endinpos = startinpos+1;
6957 if (unicode_decode_call_errorhandler(
6958 errors, &errorHandler,
6959 "charmap", "character maps to <undefined>",
6960 &starts, &e, &startinpos, &endinpos, &exc, &s,
6961 &v, &outpos, &p)) {
6962 Py_DECREF(x);
6963 goto onError;
6964 }
6965 Py_DECREF(x);
6966 continue;
6967 }
6968 else if (PyUnicode_Check(x)) {
6969 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006970
Benjamin Peterson29060642009-01-31 22:14:21 +00006971 if (targetsize == 1)
6972 /* 1-1 mapping */
6973 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006974
Benjamin Peterson29060642009-01-31 22:14:21 +00006975 else if (targetsize > 1) {
6976 /* 1-n mapping */
6977 if (targetsize > extrachars) {
6978 /* resize first */
6979 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
6980 Py_ssize_t needed = (targetsize - extrachars) + \
6981 (targetsize << 2);
6982 extrachars += needed;
6983 /* XXX overflow detection missing */
Victor Stinnerfe226c02011-10-03 03:52:20 +02006984 if (PyUnicode_Resize((PyObject**)&v,
Benjamin Peterson29060642009-01-31 22:14:21 +00006985 PyUnicode_GET_SIZE(v) + needed) < 0) {
6986 Py_DECREF(x);
6987 goto onError;
6988 }
6989 p = PyUnicode_AS_UNICODE(v) + oldpos;
6990 }
6991 Py_UNICODE_COPY(p,
6992 PyUnicode_AS_UNICODE(x),
6993 targetsize);
6994 p += targetsize;
6995 extrachars -= targetsize;
6996 }
6997 /* 1-0 mapping: skip the character */
6998 }
6999 else {
7000 /* wrong return value */
7001 PyErr_SetString(PyExc_TypeError,
7002 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007003 Py_DECREF(x);
7004 goto onError;
7005 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007006 Py_DECREF(x);
7007 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007008 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007009 }
7010 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Victor Stinnerfe226c02011-10-03 03:52:20 +02007011 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007012 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007013 Py_XDECREF(errorHandler);
7014 Py_XDECREF(exc);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02007015 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007016 Py_DECREF(v);
7017 return NULL;
7018 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007019 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00007020
Benjamin Peterson29060642009-01-31 22:14:21 +00007021 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007022 Py_XDECREF(errorHandler);
7023 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007024 Py_XDECREF(v);
7025 return NULL;
7026}
7027
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007028/* Charmap encoding: the lookup table */
7029
Alexander Belopolsky40018472011-02-26 01:02:56 +00007030struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007031 PyObject_HEAD
7032 unsigned char level1[32];
7033 int count2, count3;
7034 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007035};
7036
7037static PyObject*
7038encoding_map_size(PyObject *obj, PyObject* args)
7039{
7040 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007041 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007042 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007043}
7044
7045static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007046 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007047 PyDoc_STR("Return the size (in bytes) of this object") },
7048 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007049};
7050
7051static void
7052encoding_map_dealloc(PyObject* o)
7053{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007054 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007055}
7056
7057static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007058 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007059 "EncodingMap", /*tp_name*/
7060 sizeof(struct encoding_map), /*tp_basicsize*/
7061 0, /*tp_itemsize*/
7062 /* methods */
7063 encoding_map_dealloc, /*tp_dealloc*/
7064 0, /*tp_print*/
7065 0, /*tp_getattr*/
7066 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007067 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007068 0, /*tp_repr*/
7069 0, /*tp_as_number*/
7070 0, /*tp_as_sequence*/
7071 0, /*tp_as_mapping*/
7072 0, /*tp_hash*/
7073 0, /*tp_call*/
7074 0, /*tp_str*/
7075 0, /*tp_getattro*/
7076 0, /*tp_setattro*/
7077 0, /*tp_as_buffer*/
7078 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7079 0, /*tp_doc*/
7080 0, /*tp_traverse*/
7081 0, /*tp_clear*/
7082 0, /*tp_richcompare*/
7083 0, /*tp_weaklistoffset*/
7084 0, /*tp_iter*/
7085 0, /*tp_iternext*/
7086 encoding_map_methods, /*tp_methods*/
7087 0, /*tp_members*/
7088 0, /*tp_getset*/
7089 0, /*tp_base*/
7090 0, /*tp_dict*/
7091 0, /*tp_descr_get*/
7092 0, /*tp_descr_set*/
7093 0, /*tp_dictoffset*/
7094 0, /*tp_init*/
7095 0, /*tp_alloc*/
7096 0, /*tp_new*/
7097 0, /*tp_free*/
7098 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007099};
7100
7101PyObject*
7102PyUnicode_BuildEncodingMap(PyObject* string)
7103{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007104 PyObject *result;
7105 struct encoding_map *mresult;
7106 int i;
7107 int need_dict = 0;
7108 unsigned char level1[32];
7109 unsigned char level2[512];
7110 unsigned char *mlevel1, *mlevel2, *mlevel3;
7111 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007112 int kind;
7113 void *data;
7114 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007115
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007116 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007117 PyErr_BadArgument();
7118 return NULL;
7119 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007120 kind = PyUnicode_KIND(string);
7121 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007122 memset(level1, 0xFF, sizeof level1);
7123 memset(level2, 0xFF, sizeof level2);
7124
7125 /* If there isn't a one-to-one mapping of NULL to \0,
7126 or if there are non-BMP characters, we need to use
7127 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007128 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007129 need_dict = 1;
7130 for (i = 1; i < 256; i++) {
7131 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007132 ch = PyUnicode_READ(kind, data, i);
7133 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007134 need_dict = 1;
7135 break;
7136 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007137 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007138 /* unmapped character */
7139 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007140 l1 = ch >> 11;
7141 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007142 if (level1[l1] == 0xFF)
7143 level1[l1] = count2++;
7144 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007145 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007146 }
7147
7148 if (count2 >= 0xFF || count3 >= 0xFF)
7149 need_dict = 1;
7150
7151 if (need_dict) {
7152 PyObject *result = PyDict_New();
7153 PyObject *key, *value;
7154 if (!result)
7155 return NULL;
7156 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007157 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007158 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007159 if (!key || !value)
7160 goto failed1;
7161 if (PyDict_SetItem(result, key, value) == -1)
7162 goto failed1;
7163 Py_DECREF(key);
7164 Py_DECREF(value);
7165 }
7166 return result;
7167 failed1:
7168 Py_XDECREF(key);
7169 Py_XDECREF(value);
7170 Py_DECREF(result);
7171 return NULL;
7172 }
7173
7174 /* Create a three-level trie */
7175 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7176 16*count2 + 128*count3 - 1);
7177 if (!result)
7178 return PyErr_NoMemory();
7179 PyObject_Init(result, &EncodingMapType);
7180 mresult = (struct encoding_map*)result;
7181 mresult->count2 = count2;
7182 mresult->count3 = count3;
7183 mlevel1 = mresult->level1;
7184 mlevel2 = mresult->level23;
7185 mlevel3 = mresult->level23 + 16*count2;
7186 memcpy(mlevel1, level1, 32);
7187 memset(mlevel2, 0xFF, 16*count2);
7188 memset(mlevel3, 0, 128*count3);
7189 count3 = 0;
7190 for (i = 1; i < 256; i++) {
7191 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007192 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007193 /* unmapped character */
7194 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007195 o1 = PyUnicode_READ(kind, data, i)>>11;
7196 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007197 i2 = 16*mlevel1[o1] + o2;
7198 if (mlevel2[i2] == 0xFF)
7199 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007200 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007201 i3 = 128*mlevel2[i2] + o3;
7202 mlevel3[i3] = i;
7203 }
7204 return result;
7205}
7206
7207static int
7208encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
7209{
7210 struct encoding_map *map = (struct encoding_map*)mapping;
7211 int l1 = c>>11;
7212 int l2 = (c>>7) & 0xF;
7213 int l3 = c & 0x7F;
7214 int i;
7215
7216#ifdef Py_UNICODE_WIDE
7217 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007218 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007219 }
7220#endif
7221 if (c == 0)
7222 return 0;
7223 /* level 1*/
7224 i = map->level1[l1];
7225 if (i == 0xFF) {
7226 return -1;
7227 }
7228 /* level 2*/
7229 i = map->level23[16*i+l2];
7230 if (i == 0xFF) {
7231 return -1;
7232 }
7233 /* level 3 */
7234 i = map->level23[16*map->count2 + 128*i + l3];
7235 if (i == 0) {
7236 return -1;
7237 }
7238 return i;
7239}
7240
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007241/* Lookup the character ch in the mapping. If the character
7242 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007243 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007244static PyObject *
7245charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007246{
Christian Heimes217cfd12007-12-02 14:31:20 +00007247 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007248 PyObject *x;
7249
7250 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007251 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007252 x = PyObject_GetItem(mapping, w);
7253 Py_DECREF(w);
7254 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007255 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7256 /* No mapping found means: mapping is undefined. */
7257 PyErr_Clear();
7258 x = Py_None;
7259 Py_INCREF(x);
7260 return x;
7261 } else
7262 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007263 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007264 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007265 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007266 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007267 long value = PyLong_AS_LONG(x);
7268 if (value < 0 || value > 255) {
7269 PyErr_SetString(PyExc_TypeError,
7270 "character mapping must be in range(256)");
7271 Py_DECREF(x);
7272 return NULL;
7273 }
7274 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007275 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007276 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007277 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007278 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007279 /* wrong return value */
7280 PyErr_Format(PyExc_TypeError,
7281 "character mapping must return integer, bytes or None, not %.400s",
7282 x->ob_type->tp_name);
7283 Py_DECREF(x);
7284 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007285 }
7286}
7287
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007288static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00007289charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007290{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007291 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7292 /* exponentially overallocate to minimize reallocations */
7293 if (requiredsize < 2*outsize)
7294 requiredsize = 2*outsize;
7295 if (_PyBytes_Resize(outobj, requiredsize))
7296 return -1;
7297 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007298}
7299
/* Outcome of charmapencode_output() */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the encoded byte(s) were written to the output */
    enc_FAILED,     /* the character has no mapping; nothing was written */
    enc_EXCEPTION   /* lookup or buffer resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007303/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00007304 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007305 space is available. Return a new reference to the object that
7306 was put in the output buffer, or Py_None, if the mapping was undefined
7307 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00007308 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007309static charmapencode_result
7310charmapencode_output(Py_UNICODE c, PyObject *mapping,
7311 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007312{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007313 PyObject *rep;
7314 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00007315 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007316
Christian Heimes90aa7642007-12-19 02:45:37 +00007317 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007318 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007319 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007320 if (res == -1)
7321 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00007322 if (outsize<requiredsize)
7323 if (charmapencode_resize(outobj, outpos, requiredsize))
7324 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00007325 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007326 outstart[(*outpos)++] = (char)res;
7327 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007328 }
7329
7330 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007331 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007332 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007333 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007334 Py_DECREF(rep);
7335 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007336 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007337 if (PyLong_Check(rep)) {
7338 Py_ssize_t requiredsize = *outpos+1;
7339 if (outsize<requiredsize)
7340 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7341 Py_DECREF(rep);
7342 return enc_EXCEPTION;
7343 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007344 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007345 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007346 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007347 else {
7348 const char *repchars = PyBytes_AS_STRING(rep);
7349 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
7350 Py_ssize_t requiredsize = *outpos+repsize;
7351 if (outsize<requiredsize)
7352 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7353 Py_DECREF(rep);
7354 return enc_EXCEPTION;
7355 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007356 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007357 memcpy(outstart + *outpos, repchars, repsize);
7358 *outpos += repsize;
7359 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007360 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007361 Py_DECREF(rep);
7362 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007363}
7364
7365/* handle an error in PyUnicode_EncodeCharmap
7366 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007367static int
7368charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00007369 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007370 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00007371 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00007372 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007373{
7374 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007375 Py_ssize_t repsize;
7376 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007377 Py_UNICODE *uni2;
7378 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007379 Py_ssize_t collstartpos = *inpos;
7380 Py_ssize_t collendpos = *inpos+1;
7381 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007382 char *encoding = "charmap";
7383 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007384 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007385
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007386 /* find all unencodable characters */
7387 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007388 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00007389 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007390 int res = encoding_map_lookup(p[collendpos], mapping);
7391 if (res != -1)
7392 break;
7393 ++collendpos;
7394 continue;
7395 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007396
Benjamin Peterson29060642009-01-31 22:14:21 +00007397 rep = charmapencode_lookup(p[collendpos], mapping);
7398 if (rep==NULL)
7399 return -1;
7400 else if (rep!=Py_None) {
7401 Py_DECREF(rep);
7402 break;
7403 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007404 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00007405 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007406 }
7407 /* cache callback name lookup
7408 * (if not done yet, i.e. it's the first error) */
7409 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007410 if ((errors==NULL) || (!strcmp(errors, "strict")))
7411 *known_errorHandler = 1;
7412 else if (!strcmp(errors, "replace"))
7413 *known_errorHandler = 2;
7414 else if (!strcmp(errors, "ignore"))
7415 *known_errorHandler = 3;
7416 else if (!strcmp(errors, "xmlcharrefreplace"))
7417 *known_errorHandler = 4;
7418 else
7419 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007420 }
7421 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007422 case 1: /* strict */
7423 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7424 return -1;
7425 case 2: /* replace */
7426 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007427 x = charmapencode_output('?', mapping, res, respos);
7428 if (x==enc_EXCEPTION) {
7429 return -1;
7430 }
7431 else if (x==enc_FAILED) {
7432 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7433 return -1;
7434 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007435 }
7436 /* fall through */
7437 case 3: /* ignore */
7438 *inpos = collendpos;
7439 break;
7440 case 4: /* xmlcharrefreplace */
7441 /* generate replacement (temporarily (mis)uses p) */
7442 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007443 char buffer[2+29+1+1];
7444 char *cp;
7445 sprintf(buffer, "&#%d;", (int)p[collpos]);
7446 for (cp = buffer; *cp; ++cp) {
7447 x = charmapencode_output(*cp, mapping, res, respos);
7448 if (x==enc_EXCEPTION)
7449 return -1;
7450 else if (x==enc_FAILED) {
7451 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7452 return -1;
7453 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007454 }
7455 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007456 *inpos = collendpos;
7457 break;
7458 default:
7459 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00007460 encoding, reason, p, size, exceptionObject,
7461 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007462 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007463 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00007464 if (PyBytes_Check(repunicode)) {
7465 /* Directly copy bytes result to output. */
7466 Py_ssize_t outsize = PyBytes_Size(*res);
7467 Py_ssize_t requiredsize;
7468 repsize = PyBytes_Size(repunicode);
7469 requiredsize = *respos + repsize;
7470 if (requiredsize > outsize)
7471 /* Make room for all additional bytes. */
7472 if (charmapencode_resize(res, respos, requiredsize)) {
7473 Py_DECREF(repunicode);
7474 return -1;
7475 }
7476 memcpy(PyBytes_AsString(*res) + *respos,
7477 PyBytes_AsString(repunicode), repsize);
7478 *respos += repsize;
7479 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007480 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00007481 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007482 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007483 /* generate replacement */
7484 repsize = PyUnicode_GET_SIZE(repunicode);
7485 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007486 x = charmapencode_output(*uni2, mapping, res, respos);
7487 if (x==enc_EXCEPTION) {
7488 return -1;
7489 }
7490 else if (x==enc_FAILED) {
7491 Py_DECREF(repunicode);
7492 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7493 return -1;
7494 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007495 }
7496 *inpos = newpos;
7497 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007498 }
7499 return 0;
7500}
7501
Alexander Belopolsky40018472011-02-26 01:02:56 +00007502PyObject *
7503PyUnicode_EncodeCharmap(const Py_UNICODE *p,
7504 Py_ssize_t size,
7505 PyObject *mapping,
7506 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007507{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007508 /* output object */
7509 PyObject *res = NULL;
7510 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007511 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007512 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007513 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007514 PyObject *errorHandler = NULL;
7515 PyObject *exc = NULL;
7516 /* the following variable is used for caching string comparisons
7517 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7518 * 3=ignore, 4=xmlcharrefreplace */
7519 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007520
7521 /* Default to Latin-1 */
7522 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007523 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007524
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007525 /* allocate enough for a simple encoding without
7526 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00007527 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007528 if (res == NULL)
7529 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00007530 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007531 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007532
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007533 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007534 /* try to encode it */
7535 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
7536 if (x==enc_EXCEPTION) /* error */
7537 goto onError;
7538 if (x==enc_FAILED) { /* unencodable character */
7539 if (charmap_encoding_error(p, size, &inpos, mapping,
7540 &exc,
7541 &known_errorHandler, &errorHandler, errors,
7542 &res, &respos)) {
7543 goto onError;
7544 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007545 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007546 else
7547 /* done with this character => adjust input position */
7548 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007549 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007550
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007551 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00007552 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007553 if (_PyBytes_Resize(&res, respos) < 0)
7554 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00007555
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007556 Py_XDECREF(exc);
7557 Py_XDECREF(errorHandler);
7558 return res;
7559
Benjamin Peterson29060642009-01-31 22:14:21 +00007560 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007561 Py_XDECREF(res);
7562 Py_XDECREF(exc);
7563 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007564 return NULL;
7565}
7566
Alexander Belopolsky40018472011-02-26 01:02:56 +00007567PyObject *
7568PyUnicode_AsCharmapString(PyObject *unicode,
7569 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007570{
7571 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007572 PyErr_BadArgument();
7573 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007574 }
7575 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00007576 PyUnicode_GET_SIZE(unicode),
7577 mapping,
7578 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007579}
7580
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007581/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007582static void
7583make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007584 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007585 Py_ssize_t startpos, Py_ssize_t endpos,
7586 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007587{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007588 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007589 *exceptionObject = _PyUnicodeTranslateError_Create(
7590 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007591 }
7592 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007593 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
7594 goto onError;
7595 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
7596 goto onError;
7597 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
7598 goto onError;
7599 return;
7600 onError:
7601 Py_DECREF(*exceptionObject);
7602 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007603 }
7604}
7605
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007606/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007607static void
7608raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007609 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007610 Py_ssize_t startpos, Py_ssize_t endpos,
7611 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007612{
7613 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007614 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007615 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007616 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007617}
7618
7619/* error handling callback helper:
7620 build arguments, call the callback and check the arguments,
7621 put the result into newpos and return the replacement string, which
7622 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007623static PyObject *
7624unicode_translate_call_errorhandler(const char *errors,
7625 PyObject **errorHandler,
7626 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007627 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007628 Py_ssize_t startpos, Py_ssize_t endpos,
7629 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007630{
Benjamin Peterson142957c2008-07-04 19:55:29 +00007631 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007632
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007633 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007634 PyObject *restuple;
7635 PyObject *resunicode;
7636
7637 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007638 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007639 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007640 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007641 }
7642
7643 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007644 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007645 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007646 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007647
7648 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00007649 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007650 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007651 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007652 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00007653 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00007654 Py_DECREF(restuple);
7655 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007656 }
7657 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00007658 &resunicode, &i_newpos)) {
7659 Py_DECREF(restuple);
7660 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007661 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007662 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007663 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007664 else
7665 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007666 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007667 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7668 Py_DECREF(restuple);
7669 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00007670 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007671 Py_INCREF(resunicode);
7672 Py_DECREF(restuple);
7673 return resunicode;
7674}
7675
7676/* Lookup the character ch in the mapping and put the result in result,
7677 which must be decrefed by the caller.
7678 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007679static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007680charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007681{
Christian Heimes217cfd12007-12-02 14:31:20 +00007682 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007683 PyObject *x;
7684
7685 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007686 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007687 x = PyObject_GetItem(mapping, w);
7688 Py_DECREF(w);
7689 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007690 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7691 /* No mapping found means: use 1:1 mapping. */
7692 PyErr_Clear();
7693 *result = NULL;
7694 return 0;
7695 } else
7696 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007697 }
7698 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007699 *result = x;
7700 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007701 }
Christian Heimes217cfd12007-12-02 14:31:20 +00007702 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007703 long value = PyLong_AS_LONG(x);
7704 long max = PyUnicode_GetMax();
7705 if (value < 0 || value > max) {
7706 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00007707 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00007708 Py_DECREF(x);
7709 return -1;
7710 }
7711 *result = x;
7712 return 0;
7713 }
7714 else if (PyUnicode_Check(x)) {
7715 *result = x;
7716 return 0;
7717 }
7718 else {
7719 /* wrong return value */
7720 PyErr_SetString(PyExc_TypeError,
7721 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007722 Py_DECREF(x);
7723 return -1;
7724 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007725}
7726/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00007727 if not reallocate and adjust various state variables.
7728 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007729static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007730charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00007731 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007732{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007733 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00007734 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007735 /* exponentially overallocate to minimize reallocations */
7736 if (requiredsize < 2 * oldsize)
7737 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007738 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
7739 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007740 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007741 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007742 }
7743 return 0;
7744}
7745/* lookup the character, put the result in the output string and adjust
7746 various state variables. Return a new reference to the object that
7747 was put in the output buffer in *result, or Py_None, if the mapping was
7748 undefined (in which case no character was written).
7749 The called must decref result.
7750 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007751static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007752charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
7753 PyObject *mapping, Py_UCS4 **output,
7754 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007755 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007756{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007757 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
7758 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00007759 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007760 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007761 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007762 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007763 }
7764 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007765 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00007766 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007767 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007768 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007769 }
7770 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007771 Py_ssize_t repsize;
7772 if (PyUnicode_READY(*res) == -1)
7773 return -1;
7774 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00007775 if (repsize==1) {
7776 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007777 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00007778 }
7779 else if (repsize!=0) {
7780 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007781 Py_ssize_t requiredsize = *opos +
7782 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00007783 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007784 Py_ssize_t i;
7785 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007786 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007787 for(i = 0; i < repsize; i++)
7788 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00007789 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007790 }
7791 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007792 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007793 return 0;
7794}
7795
Alexander Belopolsky40018472011-02-26 01:02:56 +00007796PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007797_PyUnicode_TranslateCharmap(PyObject *input,
7798 PyObject *mapping,
7799 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007800{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007801 /* input object */
7802 char *idata;
7803 Py_ssize_t size, i;
7804 int kind;
7805 /* output buffer */
7806 Py_UCS4 *output = NULL;
7807 Py_ssize_t osize;
7808 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007809 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007810 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007811 char *reason = "character maps to <undefined>";
7812 PyObject *errorHandler = NULL;
7813 PyObject *exc = NULL;
7814 /* the following variable is used for caching string comparisons
7815 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7816 * 3=ignore, 4=xmlcharrefreplace */
7817 int known_errorHandler = -1;
7818
Guido van Rossumd57fd912000-03-10 22:53:23 +00007819 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007820 PyErr_BadArgument();
7821 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007822 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007823
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007824 if (PyUnicode_READY(input) == -1)
7825 return NULL;
7826 idata = (char*)PyUnicode_DATA(input);
7827 kind = PyUnicode_KIND(input);
7828 size = PyUnicode_GET_LENGTH(input);
7829 i = 0;
7830
7831 if (size == 0) {
7832 Py_INCREF(input);
7833 return input;
7834 }
7835
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007836 /* allocate enough for a simple 1:1 translation without
7837 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007838 osize = size;
7839 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
7840 opos = 0;
7841 if (output == NULL) {
7842 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00007843 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007844 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007845
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007846 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007847 /* try to encode it */
7848 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007849 if (charmaptranslate_output(input, i, mapping,
7850 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007851 Py_XDECREF(x);
7852 goto onError;
7853 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007854 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00007855 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007856 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00007857 else { /* untranslatable character */
7858 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
7859 Py_ssize_t repsize;
7860 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007861 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00007862 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007863 Py_ssize_t collstart = i;
7864 Py_ssize_t collend = i+1;
7865 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007866
Benjamin Peterson29060642009-01-31 22:14:21 +00007867 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007868 while (collend < size) {
7869 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007870 goto onError;
7871 Py_XDECREF(x);
7872 if (x!=Py_None)
7873 break;
7874 ++collend;
7875 }
7876 /* cache callback name lookup
7877 * (if not done yet, i.e. it's the first error) */
7878 if (known_errorHandler==-1) {
7879 if ((errors==NULL) || (!strcmp(errors, "strict")))
7880 known_errorHandler = 1;
7881 else if (!strcmp(errors, "replace"))
7882 known_errorHandler = 2;
7883 else if (!strcmp(errors, "ignore"))
7884 known_errorHandler = 3;
7885 else if (!strcmp(errors, "xmlcharrefreplace"))
7886 known_errorHandler = 4;
7887 else
7888 known_errorHandler = 0;
7889 }
7890 switch (known_errorHandler) {
7891 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007892 raise_translate_exception(&exc, input, collstart,
7893 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007894 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007895 case 2: /* replace */
7896 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007897 for (coll = collstart; coll<collend; coll++)
7898 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00007899 /* fall through */
7900 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007901 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007902 break;
7903 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007904 /* generate replacement (temporarily (mis)uses i) */
7905 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007906 char buffer[2+29+1+1];
7907 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007908 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
7909 if (charmaptranslate_makespace(&output, &osize,
7910 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00007911 goto onError;
7912 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007913 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00007914 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007915 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007916 break;
7917 default:
7918 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007919 reason, input, &exc,
7920 collstart, collend, &newpos);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02007921 if (repunicode == NULL || _PyUnicode_READY_REPLACE(&repunicode))
Benjamin Peterson29060642009-01-31 22:14:21 +00007922 goto onError;
7923 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007924 repsize = PyUnicode_GET_LENGTH(repunicode);
7925 if (charmaptranslate_makespace(&output, &osize,
7926 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007927 Py_DECREF(repunicode);
7928 goto onError;
7929 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007930 for (uni2 = 0; repsize-->0; ++uni2)
7931 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
7932 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00007933 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007934 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007935 }
7936 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007937 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
7938 if (!res)
7939 goto onError;
7940 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007941 Py_XDECREF(exc);
7942 Py_XDECREF(errorHandler);
7943 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007944
Benjamin Peterson29060642009-01-31 22:14:21 +00007945 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007946 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007947 Py_XDECREF(exc);
7948 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007949 return NULL;
7950}
7951
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007952/* Deprecated. Use PyUnicode_Translate instead. */
7953PyObject *
7954PyUnicode_TranslateCharmap(const Py_UNICODE *p,
7955 Py_ssize_t size,
7956 PyObject *mapping,
7957 const char *errors)
7958{
7959 PyObject *unicode = PyUnicode_FromUnicode(p, size);
7960 if (!unicode)
7961 return NULL;
7962 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
7963}
7964
Alexander Belopolsky40018472011-02-26 01:02:56 +00007965PyObject *
7966PyUnicode_Translate(PyObject *str,
7967 PyObject *mapping,
7968 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007969{
7970 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007971
Guido van Rossumd57fd912000-03-10 22:53:23 +00007972 str = PyUnicode_FromObject(str);
7973 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007974 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007975 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007976 Py_DECREF(str);
7977 return result;
Tim Petersced69f82003-09-16 20:30:58 +00007978
Benjamin Peterson29060642009-01-31 22:14:21 +00007979 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007980 Py_XDECREF(str);
7981 return NULL;
7982}
Tim Petersced69f82003-09-16 20:30:58 +00007983
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007984static Py_UCS4
7985fix_decimal_and_space_to_ascii(PyUnicodeObject *self)
7986{
7987 /* No need to call PyUnicode_READY(self) because this function is only
7988 called as a callback from fixup() which does it already. */
7989 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
7990 const int kind = PyUnicode_KIND(self);
7991 void *data = PyUnicode_DATA(self);
7992 Py_UCS4 maxchar = 0, ch, fixed;
7993 Py_ssize_t i;
7994
7995 for (i = 0; i < len; ++i) {
7996 ch = PyUnicode_READ(kind, data, i);
7997 fixed = 0;
7998 if (ch > 127) {
7999 if (Py_UNICODE_ISSPACE(ch))
8000 fixed = ' ';
8001 else {
8002 const int decimal = Py_UNICODE_TODECIMAL(ch);
8003 if (decimal >= 0)
8004 fixed = '0' + decimal;
8005 }
8006 if (fixed != 0) {
8007 if (fixed > maxchar)
8008 maxchar = fixed;
8009 PyUnicode_WRITE(kind, data, i, fixed);
8010 }
8011 else if (ch > maxchar)
8012 maxchar = ch;
8013 }
8014 else if (ch > maxchar)
8015 maxchar = ch;
8016 }
8017
8018 return maxchar;
8019}
8020
8021PyObject *
8022_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8023{
8024 if (!PyUnicode_Check(unicode)) {
8025 PyErr_BadInternalCall();
8026 return NULL;
8027 }
8028 if (PyUnicode_READY(unicode) == -1)
8029 return NULL;
8030 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8031 /* If the string is already ASCII, just return the same string */
8032 Py_INCREF(unicode);
8033 return unicode;
8034 }
8035 return fixup((PyUnicodeObject *)unicode, fix_decimal_and_space_to_ascii);
8036}
8037
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008038PyObject *
8039PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8040 Py_ssize_t length)
8041{
8042 PyObject *result;
8043 Py_UNICODE *p; /* write pointer into result */
8044 Py_ssize_t i;
8045 /* Copy to a new string */
8046 result = (PyObject *)_PyUnicode_New(length);
8047 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
8048 if (result == NULL)
8049 return result;
8050 p = PyUnicode_AS_UNICODE(result);
8051 /* Iterate over code points */
8052 for (i = 0; i < length; i++) {
8053 Py_UNICODE ch =s[i];
8054 if (ch > 127) {
8055 int decimal = Py_UNICODE_TODECIMAL(ch);
8056 if (decimal >= 0)
8057 p[i] = '0' + decimal;
8058 }
8059 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008060 if (PyUnicode_READY((PyUnicodeObject*)result) == -1) {
8061 Py_DECREF(result);
8062 return NULL;
8063 }
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008064 return result;
8065}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008066/* --- Decimal Encoder ---------------------------------------------------- */
8067
Alexander Belopolsky40018472011-02-26 01:02:56 +00008068int
8069PyUnicode_EncodeDecimal(Py_UNICODE *s,
8070 Py_ssize_t length,
8071 char *output,
8072 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008073{
8074 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008075 PyObject *errorHandler = NULL;
8076 PyObject *exc = NULL;
8077 const char *encoding = "decimal";
8078 const char *reason = "invalid decimal Unicode string";
8079 /* the following variable is used for caching string comparisons
8080 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
8081 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008082
8083 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008084 PyErr_BadArgument();
8085 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008086 }
8087
8088 p = s;
8089 end = s + length;
8090 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008091 register Py_UNICODE ch = *p;
8092 int decimal;
8093 PyObject *repunicode;
8094 Py_ssize_t repsize;
8095 Py_ssize_t newpos;
8096 Py_UNICODE *uni2;
8097 Py_UNICODE *collstart;
8098 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00008099
Benjamin Peterson29060642009-01-31 22:14:21 +00008100 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008101 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00008102 ++p;
8103 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008104 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008105 decimal = Py_UNICODE_TODECIMAL(ch);
8106 if (decimal >= 0) {
8107 *output++ = '0' + decimal;
8108 ++p;
8109 continue;
8110 }
8111 if (0 < ch && ch < 256) {
8112 *output++ = (char)ch;
8113 ++p;
8114 continue;
8115 }
8116 /* All other characters are considered unencodable */
8117 collstart = p;
8118 collend = p+1;
8119 while (collend < end) {
8120 if ((0 < *collend && *collend < 256) ||
8121 !Py_UNICODE_ISSPACE(*collend) ||
8122 Py_UNICODE_TODECIMAL(*collend))
8123 break;
8124 }
8125 /* cache callback name lookup
8126 * (if not done yet, i.e. it's the first error) */
8127 if (known_errorHandler==-1) {
8128 if ((errors==NULL) || (!strcmp(errors, "strict")))
8129 known_errorHandler = 1;
8130 else if (!strcmp(errors, "replace"))
8131 known_errorHandler = 2;
8132 else if (!strcmp(errors, "ignore"))
8133 known_errorHandler = 3;
8134 else if (!strcmp(errors, "xmlcharrefreplace"))
8135 known_errorHandler = 4;
8136 else
8137 known_errorHandler = 0;
8138 }
8139 switch (known_errorHandler) {
8140 case 1: /* strict */
8141 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
8142 goto onError;
8143 case 2: /* replace */
8144 for (p = collstart; p < collend; ++p)
8145 *output++ = '?';
8146 /* fall through */
8147 case 3: /* ignore */
8148 p = collend;
8149 break;
8150 case 4: /* xmlcharrefreplace */
8151 /* generate replacement (temporarily (mis)uses p) */
8152 for (p = collstart; p < collend; ++p)
8153 output += sprintf(output, "&#%d;", (int)*p);
8154 p = collend;
8155 break;
8156 default:
8157 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
8158 encoding, reason, s, length, &exc,
8159 collstart-s, collend-s, &newpos);
8160 if (repunicode == NULL)
8161 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008162 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00008163 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008164 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
8165 Py_DECREF(repunicode);
8166 goto onError;
8167 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008168 /* generate replacement */
8169 repsize = PyUnicode_GET_SIZE(repunicode);
8170 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
8171 Py_UNICODE ch = *uni2;
8172 if (Py_UNICODE_ISSPACE(ch))
8173 *output++ = ' ';
8174 else {
8175 decimal = Py_UNICODE_TODECIMAL(ch);
8176 if (decimal >= 0)
8177 *output++ = '0' + decimal;
8178 else if (0 < ch && ch < 256)
8179 *output++ = (char)ch;
8180 else {
8181 Py_DECREF(repunicode);
8182 raise_encode_exception(&exc, encoding,
8183 s, length, collstart-s, collend-s, reason);
8184 goto onError;
8185 }
8186 }
8187 }
8188 p = s + newpos;
8189 Py_DECREF(repunicode);
8190 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00008191 }
8192 /* 0-terminate the output string */
8193 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008194 Py_XDECREF(exc);
8195 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008196 return 0;
8197
Benjamin Peterson29060642009-01-31 22:14:21 +00008198 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008199 Py_XDECREF(exc);
8200 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008201 return -1;
8202}
8203
Guido van Rossumd57fd912000-03-10 22:53:23 +00008204/* --- Helpers ------------------------------------------------------------ */
8205
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008206#include "stringlib/ucs1lib.h"
8207#include "stringlib/fastsearch.h"
8208#include "stringlib/partition.h"
8209#include "stringlib/split.h"
8210#include "stringlib/count.h"
8211#include "stringlib/find.h"
8212#include "stringlib/localeutil.h"
8213#include "stringlib/undef.h"
8214
8215#include "stringlib/ucs2lib.h"
8216#include "stringlib/fastsearch.h"
8217#include "stringlib/partition.h"
8218#include "stringlib/split.h"
8219#include "stringlib/count.h"
8220#include "stringlib/find.h"
8221#include "stringlib/localeutil.h"
8222#include "stringlib/undef.h"
8223
8224#include "stringlib/ucs4lib.h"
8225#include "stringlib/fastsearch.h"
8226#include "stringlib/partition.h"
8227#include "stringlib/split.h"
8228#include "stringlib/count.h"
8229#include "stringlib/find.h"
8230#include "stringlib/localeutil.h"
8231#include "stringlib/undef.h"
8232
8233static Py_ssize_t
8234any_find_slice(Py_ssize_t Py_LOCAL_CALLBACK(ucs1)(const Py_UCS1*, Py_ssize_t,
8235 const Py_UCS1*, Py_ssize_t,
8236 Py_ssize_t, Py_ssize_t),
8237 Py_ssize_t Py_LOCAL_CALLBACK(ucs2)(const Py_UCS2*, Py_ssize_t,
8238 const Py_UCS2*, Py_ssize_t,
8239 Py_ssize_t, Py_ssize_t),
8240 Py_ssize_t Py_LOCAL_CALLBACK(ucs4)(const Py_UCS4*, Py_ssize_t,
8241 const Py_UCS4*, Py_ssize_t,
8242 Py_ssize_t, Py_ssize_t),
8243 PyObject* s1, PyObject* s2,
8244 Py_ssize_t start,
8245 Py_ssize_t end)
8246{
8247 int kind1, kind2, kind;
8248 void *buf1, *buf2;
8249 Py_ssize_t len1, len2, result;
8250
8251 kind1 = PyUnicode_KIND(s1);
8252 kind2 = PyUnicode_KIND(s2);
8253 kind = kind1 > kind2 ? kind1 : kind2;
8254 buf1 = PyUnicode_DATA(s1);
8255 buf2 = PyUnicode_DATA(s2);
8256 if (kind1 != kind)
8257 buf1 = _PyUnicode_AsKind(s1, kind);
8258 if (!buf1)
8259 return -2;
8260 if (kind2 != kind)
8261 buf2 = _PyUnicode_AsKind(s2, kind);
8262 if (!buf2) {
8263 if (kind1 != kind) PyMem_Free(buf1);
8264 return -2;
8265 }
8266 len1 = PyUnicode_GET_LENGTH(s1);
8267 len2 = PyUnicode_GET_LENGTH(s2);
8268
8269 switch(kind) {
8270 case PyUnicode_1BYTE_KIND:
8271 result = ucs1(buf1, len1, buf2, len2, start, end);
8272 break;
8273 case PyUnicode_2BYTE_KIND:
8274 result = ucs2(buf1, len1, buf2, len2, start, end);
8275 break;
8276 case PyUnicode_4BYTE_KIND:
8277 result = ucs4(buf1, len1, buf2, len2, start, end);
8278 break;
8279 default:
8280 assert(0); result = -2;
8281 }
8282
8283 if (kind1 != kind)
8284 PyMem_Free(buf1);
8285 if (kind2 != kind)
8286 PyMem_Free(buf2);
8287
8288 return result;
8289}
8290
8291Py_ssize_t
8292_PyUnicode_InsertThousandsGrouping(int kind, void *data,
8293 Py_ssize_t n_buffer,
8294 void *digits, Py_ssize_t n_digits,
8295 Py_ssize_t min_width,
8296 const char *grouping,
8297 const char *thousands_sep)
8298{
8299 switch(kind) {
8300 case PyUnicode_1BYTE_KIND:
8301 return _PyUnicode_ucs1_InsertThousandsGrouping(
8302 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8303 min_width, grouping, thousands_sep);
8304 case PyUnicode_2BYTE_KIND:
8305 return _PyUnicode_ucs2_InsertThousandsGrouping(
8306 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
8307 min_width, grouping, thousands_sep);
8308 case PyUnicode_4BYTE_KIND:
8309 return _PyUnicode_ucs4_InsertThousandsGrouping(
8310 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
8311 min_width, grouping, thousands_sep);
8312 }
8313 assert(0);
8314 return -1;
8315}
8316
8317
Eric Smith8c663262007-08-25 02:26:07 +00008318#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00008319#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008320
Thomas Wouters477c8d52006-05-27 19:21:47 +00008321#include "stringlib/count.h"
8322#include "stringlib/find.h"
Eric Smith5807c412008-05-11 21:00:57 +00008323
/* Helper macro to fix up start/end slice values in place.

   end is clamped to len when too large; a negative end or start has len
   added to it and is then clamped to 0 if still negative.  start and end
   must be modifiable lvalues; len is evaluated more than once.

   The body is wrapped in do { } while (0) so the macro behaves as a single
   statement (safe under an unbraced if/else); use it with a trailing ';'. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00008338
Alexander Belopolsky40018472011-02-26 01:02:56 +00008339Py_ssize_t
8340PyUnicode_Count(PyObject *str,
8341 PyObject *substr,
8342 Py_ssize_t start,
8343 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008344{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008345 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008346 PyUnicodeObject* str_obj;
8347 PyUnicodeObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008348 int kind1, kind2, kind;
8349 void *buf1 = NULL, *buf2 = NULL;
8350 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00008351
Thomas Wouters477c8d52006-05-27 19:21:47 +00008352 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008353 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008354 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008355 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
Victor Stinnere9a29352011-10-01 02:14:59 +02008356 if (!sub_obj || PyUnicode_READY(sub_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008357 Py_DECREF(str_obj);
8358 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008359 }
Tim Petersced69f82003-09-16 20:30:58 +00008360
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008361 kind1 = PyUnicode_KIND(str_obj);
8362 kind2 = PyUnicode_KIND(sub_obj);
8363 kind = kind1 > kind2 ? kind1 : kind2;
8364 buf1 = PyUnicode_DATA(str_obj);
8365 if (kind1 != kind)
8366 buf1 = _PyUnicode_AsKind((PyObject*)str_obj, kind);
8367 if (!buf1)
8368 goto onError;
8369 buf2 = PyUnicode_DATA(sub_obj);
8370 if (kind2 != kind)
8371 buf2 = _PyUnicode_AsKind((PyObject*)sub_obj, kind);
8372 if (!buf2)
8373 goto onError;
8374 len1 = PyUnicode_GET_LENGTH(str_obj);
8375 len2 = PyUnicode_GET_LENGTH(sub_obj);
8376
8377 ADJUST_INDICES(start, end, len1);
8378 switch(kind) {
8379 case PyUnicode_1BYTE_KIND:
8380 result = ucs1lib_count(
8381 ((Py_UCS1*)buf1) + start, end - start,
8382 buf2, len2, PY_SSIZE_T_MAX
8383 );
8384 break;
8385 case PyUnicode_2BYTE_KIND:
8386 result = ucs2lib_count(
8387 ((Py_UCS2*)buf1) + start, end - start,
8388 buf2, len2, PY_SSIZE_T_MAX
8389 );
8390 break;
8391 case PyUnicode_4BYTE_KIND:
8392 result = ucs4lib_count(
8393 ((Py_UCS4*)buf1) + start, end - start,
8394 buf2, len2, PY_SSIZE_T_MAX
8395 );
8396 break;
8397 default:
8398 assert(0); result = 0;
8399 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00008400
8401 Py_DECREF(sub_obj);
8402 Py_DECREF(str_obj);
8403
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008404 if (kind1 != kind)
8405 PyMem_Free(buf1);
8406 if (kind2 != kind)
8407 PyMem_Free(buf2);
8408
Guido van Rossumd57fd912000-03-10 22:53:23 +00008409 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008410 onError:
8411 Py_DECREF(sub_obj);
8412 Py_DECREF(str_obj);
8413 if (kind1 != kind && buf1)
8414 PyMem_Free(buf1);
8415 if (kind2 != kind && buf2)
8416 PyMem_Free(buf2);
8417 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008418}
8419
Alexander Belopolsky40018472011-02-26 01:02:56 +00008420Py_ssize_t
8421PyUnicode_Find(PyObject *str,
8422 PyObject *sub,
8423 Py_ssize_t start,
8424 Py_ssize_t end,
8425 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008426{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008427 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008428
Guido van Rossumd57fd912000-03-10 22:53:23 +00008429 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008430 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008431 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008432 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008433 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008434 Py_DECREF(str);
8435 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008436 }
Tim Petersced69f82003-09-16 20:30:58 +00008437
Thomas Wouters477c8d52006-05-27 19:21:47 +00008438 if (direction > 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008439 result = any_find_slice(
8440 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
8441 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008442 );
8443 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008444 result = any_find_slice(
8445 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
8446 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008447 );
8448
Guido van Rossumd57fd912000-03-10 22:53:23 +00008449 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008450 Py_DECREF(sub);
8451
Guido van Rossumd57fd912000-03-10 22:53:23 +00008452 return result;
8453}
8454
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008455Py_ssize_t
8456PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
8457 Py_ssize_t start, Py_ssize_t end,
8458 int direction)
8459{
8460 char *result;
8461 int kind;
8462 if (PyUnicode_READY(str) == -1)
8463 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02008464 if (start < 0 || end < 0) {
8465 PyErr_SetString(PyExc_IndexError, "string index out of range");
8466 return -2;
8467 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008468 if (end > PyUnicode_GET_LENGTH(str))
8469 end = PyUnicode_GET_LENGTH(str);
8470 kind = PyUnicode_KIND(str);
8471 result = findchar(PyUnicode_1BYTE_DATA(str)
8472 + PyUnicode_KIND_SIZE(kind, start),
8473 kind,
8474 end-start, ch, direction);
8475 if (!result)
8476 return -1;
8477 return (result-(char*)PyUnicode_DATA(str)) >> (kind-1);
8478}
8479
Alexander Belopolsky40018472011-02-26 01:02:56 +00008480static int
8481tailmatch(PyUnicodeObject *self,
8482 PyUnicodeObject *substring,
8483 Py_ssize_t start,
8484 Py_ssize_t end,
8485 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008486{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008487 int kind_self;
8488 int kind_sub;
8489 void *data_self;
8490 void *data_sub;
8491 Py_ssize_t offset;
8492 Py_ssize_t i;
8493 Py_ssize_t end_sub;
8494
8495 if (PyUnicode_READY(self) == -1 ||
8496 PyUnicode_READY(substring) == -1)
8497 return 0;
8498
8499 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008500 return 1;
8501
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008502 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
8503 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008504 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00008505 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008506
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008507 kind_self = PyUnicode_KIND(self);
8508 data_self = PyUnicode_DATA(self);
8509 kind_sub = PyUnicode_KIND(substring);
8510 data_sub = PyUnicode_DATA(substring);
8511 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
8512
8513 if (direction > 0)
8514 offset = end;
8515 else
8516 offset = start;
8517
8518 if (PyUnicode_READ(kind_self, data_self, offset) ==
8519 PyUnicode_READ(kind_sub, data_sub, 0) &&
8520 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
8521 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
8522 /* If both are of the same kind, memcmp is sufficient */
8523 if (kind_self == kind_sub) {
8524 return ! memcmp((char *)data_self +
8525 (offset * PyUnicode_CHARACTER_SIZE(substring)),
8526 data_sub,
8527 PyUnicode_GET_LENGTH(substring) *
8528 PyUnicode_CHARACTER_SIZE(substring));
8529 }
8530 /* otherwise we have to compare each character by first accesing it */
8531 else {
8532 /* We do not need to compare 0 and len(substring)-1 because
8533 the if statement above ensured already that they are equal
8534 when we end up here. */
8535 // TODO: honor direction and do a forward or backwards search
8536 for (i = 1; i < end_sub; ++i) {
8537 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
8538 PyUnicode_READ(kind_sub, data_sub, i))
8539 return 0;
8540 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008541 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008542 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008543 }
8544
8545 return 0;
8546}
8547
Alexander Belopolsky40018472011-02-26 01:02:56 +00008548Py_ssize_t
8549PyUnicode_Tailmatch(PyObject *str,
8550 PyObject *substr,
8551 Py_ssize_t start,
8552 Py_ssize_t end,
8553 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008554{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008555 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008556
Guido van Rossumd57fd912000-03-10 22:53:23 +00008557 str = PyUnicode_FromObject(str);
8558 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008559 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008560 substr = PyUnicode_FromObject(substr);
8561 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008562 Py_DECREF(str);
8563 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008564 }
Tim Petersced69f82003-09-16 20:30:58 +00008565
Guido van Rossumd57fd912000-03-10 22:53:23 +00008566 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00008567 (PyUnicodeObject *)substr,
8568 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008569 Py_DECREF(str);
8570 Py_DECREF(substr);
8571 return result;
8572}
8573
Guido van Rossumd57fd912000-03-10 22:53:23 +00008574/* Apply fixfct filter to the Unicode object self and return a
8575 reference to the modified object */
8576
Alexander Belopolsky40018472011-02-26 01:02:56 +00008577static PyObject *
8578fixup(PyUnicodeObject *self,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008579 Py_UCS4 (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008580{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008581 PyObject *u;
8582 Py_UCS4 maxchar_old, maxchar_new = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008583
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008584 if (PyUnicode_READY(self) == -1)
8585 return NULL;
8586 maxchar_old = PyUnicode_MAX_CHAR_VALUE(self);
8587 u = PyUnicode_New(PyUnicode_GET_LENGTH(self),
8588 maxchar_old);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008589 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008590 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008591
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008592 Py_MEMCPY(PyUnicode_1BYTE_DATA(u), PyUnicode_1BYTE_DATA(self),
8593 PyUnicode_GET_LENGTH(u) * PyUnicode_CHARACTER_SIZE(u));
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008594
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008595 /* fix functions return the new maximum character in a string,
8596 if the kind of the resulting unicode object does not change,
8597 everything is fine. Otherwise we need to change the string kind
8598 and re-run the fix function. */
8599 maxchar_new = fixfct((PyUnicodeObject*)u);
8600 if (maxchar_new == 0)
8601 /* do nothing, keep maxchar_new at 0 which means no changes. */;
8602 else if (maxchar_new <= 127)
8603 maxchar_new = 127;
8604 else if (maxchar_new <= 255)
8605 maxchar_new = 255;
8606 else if (maxchar_new <= 65535)
8607 maxchar_new = 65535;
8608 else
8609 maxchar_new = 1114111; /* 0x10ffff */
8610
8611 if (!maxchar_new && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008612 /* fixfct should return TRUE if it modified the buffer. If
8613 FALSE, return a reference to the original buffer instead
8614 (to save space, not time) */
8615 Py_INCREF(self);
8616 Py_DECREF(u);
8617 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008618 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008619 else if (maxchar_new == maxchar_old) {
8620 return u;
8621 }
8622 else {
8623 /* In case the maximum character changed, we need to
8624 convert the string to the new category. */
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008625 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008626 if (v == NULL) {
8627 Py_DECREF(u);
8628 return NULL;
8629 }
8630 if (maxchar_new > maxchar_old) {
8631 /* If the maxchar increased so that the kind changed, not all
8632 characters are representable anymore and we need to fix the
8633 string again. This only happens in very few cases. */
Victor Stinner157f83f2011-09-28 21:41:31 +02008634 if (PyUnicode_CopyCharacters(v, 0,
8635 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008636 PyUnicode_GET_LENGTH(self)) < 0)
8637 {
8638 Py_DECREF(u);
8639 return NULL;
8640 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008641 maxchar_old = fixfct((PyUnicodeObject*)v);
8642 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
8643 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008644 else {
Victor Stinner157f83f2011-09-28 21:41:31 +02008645 if (PyUnicode_CopyCharacters(v, 0,
8646 u, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008647 PyUnicode_GET_LENGTH(self)) < 0)
8648 {
8649 Py_DECREF(u);
8650 return NULL;
8651 }
8652 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008653
8654 Py_DECREF(u);
8655 return v;
8656 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008657}
8658
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008659static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008660fixupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008661{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008662 /* No need to call PyUnicode_READY(self) because this function is only
8663 called as a callback from fixup() which does it already. */
8664 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8665 const int kind = PyUnicode_KIND(self);
8666 void *data = PyUnicode_DATA(self);
8667 int touched = 0;
8668 Py_UCS4 maxchar = 0;
8669 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008670
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008671 for (i = 0; i < len; ++i) {
8672 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8673 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
8674 if (up != ch) {
8675 if (up > maxchar)
8676 maxchar = up;
8677 PyUnicode_WRITE(kind, data, i, up);
8678 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008679 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008680 else if (ch > maxchar)
8681 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008682 }
8683
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008684 if (touched)
8685 return maxchar;
8686 else
8687 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008688}
8689
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008690static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008691fixlower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008692{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008693 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8694 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8695 const int kind = PyUnicode_KIND(self);
8696 void *data = PyUnicode_DATA(self);
8697 int touched = 0;
8698 Py_UCS4 maxchar = 0;
8699 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008700
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008701 for(i = 0; i < len; ++i) {
8702 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8703 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8704 if (lo != ch) {
8705 if (lo > maxchar)
8706 maxchar = lo;
8707 PyUnicode_WRITE(kind, data, i, lo);
8708 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008709 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008710 else if (ch > maxchar)
8711 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008712 }
8713
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008714 if (touched)
8715 return maxchar;
8716 else
8717 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008718}
8719
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008720static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008721fixswapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008722{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008723 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8724 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8725 const int kind = PyUnicode_KIND(self);
8726 void *data = PyUnicode_DATA(self);
8727 int touched = 0;
8728 Py_UCS4 maxchar = 0;
8729 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008730
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008731 for(i = 0; i < len; ++i) {
8732 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8733 Py_UCS4 nu = 0;
8734
8735 if (Py_UNICODE_ISUPPER(ch))
8736 nu = Py_UNICODE_TOLOWER(ch);
8737 else if (Py_UNICODE_ISLOWER(ch))
8738 nu = Py_UNICODE_TOUPPER(ch);
8739
8740 if (nu != 0) {
8741 if (nu > maxchar)
8742 maxchar = nu;
8743 PyUnicode_WRITE(kind, data, i, nu);
8744 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008745 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008746 else if (ch > maxchar)
8747 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008748 }
8749
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008750 if (touched)
8751 return maxchar;
8752 else
8753 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008754}
8755
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008756static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008757fixcapitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008758{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008759 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8760 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8761 const int kind = PyUnicode_KIND(self);
8762 void *data = PyUnicode_DATA(self);
8763 int touched = 0;
8764 Py_UCS4 maxchar = 0;
8765 Py_ssize_t i = 0;
8766 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00008767
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008768 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008769 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008770
8771 ch = PyUnicode_READ(kind, data, i);
8772 if (!Py_UNICODE_ISUPPER(ch)) {
8773 maxchar = Py_UNICODE_TOUPPER(ch);
8774 PyUnicode_WRITE(kind, data, i, maxchar);
8775 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008776 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008777 ++i;
8778 for(; i < len; ++i) {
8779 ch = PyUnicode_READ(kind, data, i);
8780 if (!Py_UNICODE_ISLOWER(ch)) {
8781 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8782 if (lo > maxchar)
8783 maxchar = lo;
8784 PyUnicode_WRITE(kind, data, i, lo);
8785 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008786 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008787 else if (ch > maxchar)
8788 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008789 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008790
8791 if (touched)
8792 return maxchar;
8793 else
8794 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008795}
8796
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008797static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008798fixtitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008799{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008800 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8801 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8802 const int kind = PyUnicode_KIND(self);
8803 void *data = PyUnicode_DATA(self);
8804 Py_UCS4 maxchar = 0;
8805 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008806 int previous_is_cased;
8807
8808 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008809 if (len == 1) {
8810 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8811 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
8812 if (ti != ch) {
8813 PyUnicode_WRITE(kind, data, i, ti);
8814 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00008815 }
8816 else
8817 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008818 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008819 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008820 for(; i < len; ++i) {
8821 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8822 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00008823
Benjamin Peterson29060642009-01-31 22:14:21 +00008824 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008825 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00008826 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008827 nu = Py_UNICODE_TOTITLE(ch);
8828
8829 if (nu > maxchar)
8830 maxchar = nu;
8831 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00008832
Benjamin Peterson29060642009-01-31 22:14:21 +00008833 if (Py_UNICODE_ISLOWER(ch) ||
8834 Py_UNICODE_ISUPPER(ch) ||
8835 Py_UNICODE_ISTITLE(ch))
8836 previous_is_cased = 1;
8837 else
8838 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008839 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008840 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008841}
8842
Tim Peters8ce9f162004-08-27 01:49:32 +00008843PyObject *
8844PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008845{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008846 PyObject *sep = NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008847 Py_ssize_t seplen = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008848 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00008849 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008850 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
8851 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00008852 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008853 Py_ssize_t sz, i, res_offset;
8854 Py_UCS4 maxchar = 0;
8855 Py_UCS4 item_maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008856
Tim Peters05eba1f2004-08-27 21:32:02 +00008857 fseq = PySequence_Fast(seq, "");
8858 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008859 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00008860 }
8861
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008862 /* NOTE: the following code can't call back into Python code,
8863 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00008864 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008865
Tim Peters05eba1f2004-08-27 21:32:02 +00008866 seqlen = PySequence_Fast_GET_SIZE(fseq);
8867 /* If empty sequence, return u"". */
8868 if (seqlen == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008869 res = PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008870 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00008871 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008872 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00008873 /* If singleton sequence with an exact Unicode, return that. */
8874 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008875 item = items[0];
8876 if (PyUnicode_CheckExact(item)) {
8877 Py_INCREF(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008878 res = item;
Benjamin Peterson29060642009-01-31 22:14:21 +00008879 goto Done;
8880 }
Tim Peters8ce9f162004-08-27 01:49:32 +00008881 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008882 else {
8883 /* Set up sep and seplen */
8884 if (separator == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008885 /* fall back to a blank space separator */
8886 sep = PyUnicode_FromOrdinal(' ');
Victor Stinnere9a29352011-10-01 02:14:59 +02008887 if (!sep)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008888 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00008889 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008890 else {
8891 if (!PyUnicode_Check(separator)) {
8892 PyErr_Format(PyExc_TypeError,
8893 "separator: expected str instance,"
8894 " %.80s found",
8895 Py_TYPE(separator)->tp_name);
8896 goto onError;
8897 }
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02008898 if (PyUnicode_READY(separator))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008899 goto onError;
8900 sep = separator;
8901 seplen = PyUnicode_GET_LENGTH(separator);
8902 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
8903 /* inc refcount to keep this code path symetric with the
8904 above case of a blank separator */
8905 Py_INCREF(sep);
Tim Peters05eba1f2004-08-27 21:32:02 +00008906 }
8907 }
8908
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008909 /* There are at least two things to join, or else we have a subclass
8910 * of str in the sequence.
8911 * Do a pre-pass to figure out the total amount of space we'll
8912 * need (sz), and see whether all argument are strings.
8913 */
8914 sz = 0;
8915 for (i = 0; i < seqlen; i++) {
8916 const Py_ssize_t old_sz = sz;
8917 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00008918 if (!PyUnicode_Check(item)) {
8919 PyErr_Format(PyExc_TypeError,
8920 "sequence item %zd: expected str instance,"
8921 " %.80s found",
8922 i, Py_TYPE(item)->tp_name);
8923 goto onError;
8924 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008925 if (PyUnicode_READY(item) == -1)
8926 goto onError;
8927 sz += PyUnicode_GET_LENGTH(item);
8928 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
8929 if (item_maxchar > maxchar)
8930 maxchar = item_maxchar;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008931 if (i != 0)
8932 sz += seplen;
8933 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
8934 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008935 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008936 goto onError;
8937 }
8938 }
Tim Petersced69f82003-09-16 20:30:58 +00008939
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008940 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008941 if (res == NULL)
8942 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00008943
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008944 /* Catenate everything. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008945 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinner9ce5a832011-10-03 23:36:02 +02008946 Py_ssize_t itemlen, copied;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008947 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00008948 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02008949 if (i && seplen != 0) {
8950 copied = PyUnicode_CopyCharacters(res, res_offset,
8951 sep, 0, seplen);
8952 if (copied < 0)
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008953 goto onError;
Victor Stinner9ce5a832011-10-03 23:36:02 +02008954#ifdef Py_DEBUG
8955 res_offset += copied;
8956#else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008957 res_offset += seplen;
Victor Stinner9ce5a832011-10-03 23:36:02 +02008958#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00008959 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02008960 itemlen = PyUnicode_GET_LENGTH(item);
8961 if (itemlen != 0) {
8962 copied = PyUnicode_CopyCharacters(res, res_offset,
8963 item, 0, itemlen);
8964 if (copied < 0)
8965 goto onError;
8966#ifdef Py_DEBUG
8967 res_offset += copied;
8968#else
8969 res_offset += itemlen;
8970#endif
8971 }
Tim Peters05eba1f2004-08-27 21:32:02 +00008972 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008973 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00008974
Benjamin Peterson29060642009-01-31 22:14:21 +00008975 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00008976 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008977 Py_XDECREF(sep);
8978 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008979
Benjamin Peterson29060642009-01-31 22:14:21 +00008980 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00008981 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008982 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00008983 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008984 return NULL;
8985}
8986
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008987#define FILL(kind, data, value, start, length) \
8988 do { \
8989 Py_ssize_t i_ = 0; \
8990 assert(kind != PyUnicode_WCHAR_KIND); \
8991 switch ((kind)) { \
8992 case PyUnicode_1BYTE_KIND: { \
8993 unsigned char * to_ = (unsigned char *)((data)) + (start); \
8994 memset(to_, (unsigned char)value, length); \
8995 break; \
8996 } \
8997 case PyUnicode_2BYTE_KIND: { \
8998 Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
8999 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
9000 break; \
9001 } \
9002 default: { \
9003 Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
9004 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
9005 break; \
9006 } \
9007 } \
9008 } while (0)
9009
Alexander Belopolsky40018472011-02-26 01:02:56 +00009010static PyUnicodeObject *
9011pad(PyUnicodeObject *self,
9012 Py_ssize_t left,
9013 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009014 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009015{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009016 PyObject *u;
9017 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009018 int kind;
9019 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009020
9021 if (left < 0)
9022 left = 0;
9023 if (right < 0)
9024 right = 0;
9025
Tim Peters7a29bd52001-09-12 03:03:31 +00009026 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009027 Py_INCREF(self);
9028 return self;
9029 }
9030
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009031 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9032 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009033 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9034 return NULL;
9035 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009036 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9037 if (fill > maxchar)
9038 maxchar = fill;
9039 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009040 if (!u)
9041 return NULL;
9042
9043 kind = PyUnicode_KIND(u);
9044 data = PyUnicode_DATA(u);
9045 if (left)
9046 FILL(kind, data, fill, 0, left);
9047 if (right)
9048 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinner157f83f2011-09-28 21:41:31 +02009049 if (PyUnicode_CopyCharacters(u, left,
9050 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009051 _PyUnicode_LENGTH(self)) < 0)
9052 {
9053 Py_DECREF(u);
9054 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009055 }
9056
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009057 return (PyUnicodeObject*)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009058}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009059#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00009060
Alexander Belopolsky40018472011-02-26 01:02:56 +00009061PyObject *
9062PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009063{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009064 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009065
9066 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009067 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009068 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009069
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009070 switch(PyUnicode_KIND(string)) {
9071 case PyUnicode_1BYTE_KIND:
9072 list = ucs1lib_splitlines(
9073 (PyObject*) string, PyUnicode_1BYTE_DATA(string),
9074 PyUnicode_GET_LENGTH(string), keepends);
9075 break;
9076 case PyUnicode_2BYTE_KIND:
9077 list = ucs2lib_splitlines(
9078 (PyObject*) string, PyUnicode_2BYTE_DATA(string),
9079 PyUnicode_GET_LENGTH(string), keepends);
9080 break;
9081 case PyUnicode_4BYTE_KIND:
9082 list = ucs4lib_splitlines(
9083 (PyObject*) string, PyUnicode_4BYTE_DATA(string),
9084 PyUnicode_GET_LENGTH(string), keepends);
9085 break;
9086 default:
9087 assert(0);
9088 list = 0;
9089 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009090 Py_DECREF(string);
9091 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009092}
9093
Alexander Belopolsky40018472011-02-26 01:02:56 +00009094static PyObject *
9095split(PyUnicodeObject *self,
9096 PyUnicodeObject *substring,
9097 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009098{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009099 int kind1, kind2, kind;
9100 void *buf1, *buf2;
9101 Py_ssize_t len1, len2;
9102 PyObject* out;
9103
Guido van Rossumd57fd912000-03-10 22:53:23 +00009104 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009105 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009106
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009107 if (PyUnicode_READY(self) == -1)
9108 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009109
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009110 if (substring == NULL)
9111 switch(PyUnicode_KIND(self)) {
9112 case PyUnicode_1BYTE_KIND:
9113 return ucs1lib_split_whitespace(
9114 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9115 PyUnicode_GET_LENGTH(self), maxcount
9116 );
9117 case PyUnicode_2BYTE_KIND:
9118 return ucs2lib_split_whitespace(
9119 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
9120 PyUnicode_GET_LENGTH(self), maxcount
9121 );
9122 case PyUnicode_4BYTE_KIND:
9123 return ucs4lib_split_whitespace(
9124 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
9125 PyUnicode_GET_LENGTH(self), maxcount
9126 );
9127 default:
9128 assert(0);
9129 return NULL;
9130 }
9131
9132 if (PyUnicode_READY(substring) == -1)
9133 return NULL;
9134
9135 kind1 = PyUnicode_KIND(self);
9136 kind2 = PyUnicode_KIND(substring);
9137 kind = kind1 > kind2 ? kind1 : kind2;
9138 buf1 = PyUnicode_DATA(self);
9139 buf2 = PyUnicode_DATA(substring);
9140 if (kind1 != kind)
9141 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9142 if (!buf1)
9143 return NULL;
9144 if (kind2 != kind)
9145 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9146 if (!buf2) {
9147 if (kind1 != kind) PyMem_Free(buf1);
9148 return NULL;
9149 }
9150 len1 = PyUnicode_GET_LENGTH(self);
9151 len2 = PyUnicode_GET_LENGTH(substring);
9152
9153 switch(kind) {
9154 case PyUnicode_1BYTE_KIND:
9155 out = ucs1lib_split(
9156 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9157 break;
9158 case PyUnicode_2BYTE_KIND:
9159 out = ucs2lib_split(
9160 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9161 break;
9162 case PyUnicode_4BYTE_KIND:
9163 out = ucs4lib_split(
9164 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9165 break;
9166 default:
9167 out = NULL;
9168 }
9169 if (kind1 != kind)
9170 PyMem_Free(buf1);
9171 if (kind2 != kind)
9172 PyMem_Free(buf2);
9173 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009174}
9175
Alexander Belopolsky40018472011-02-26 01:02:56 +00009176static PyObject *
9177rsplit(PyUnicodeObject *self,
9178 PyUnicodeObject *substring,
9179 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009180{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009181 int kind1, kind2, kind;
9182 void *buf1, *buf2;
9183 Py_ssize_t len1, len2;
9184 PyObject* out;
9185
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009186 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009187 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009188
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009189 if (PyUnicode_READY(self) == -1)
9190 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009191
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009192 if (substring == NULL)
9193 switch(PyUnicode_KIND(self)) {
9194 case PyUnicode_1BYTE_KIND:
9195 return ucs1lib_rsplit_whitespace(
9196 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9197 PyUnicode_GET_LENGTH(self), maxcount
9198 );
9199 case PyUnicode_2BYTE_KIND:
9200 return ucs2lib_rsplit_whitespace(
9201 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
9202 PyUnicode_GET_LENGTH(self), maxcount
9203 );
9204 case PyUnicode_4BYTE_KIND:
9205 return ucs4lib_rsplit_whitespace(
9206 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
9207 PyUnicode_GET_LENGTH(self), maxcount
9208 );
9209 default:
9210 assert(0);
9211 return NULL;
9212 }
9213
9214 if (PyUnicode_READY(substring) == -1)
9215 return NULL;
9216
9217 kind1 = PyUnicode_KIND(self);
9218 kind2 = PyUnicode_KIND(substring);
9219 kind = kind1 > kind2 ? kind1 : kind2;
9220 buf1 = PyUnicode_DATA(self);
9221 buf2 = PyUnicode_DATA(substring);
9222 if (kind1 != kind)
9223 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9224 if (!buf1)
9225 return NULL;
9226 if (kind2 != kind)
9227 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9228 if (!buf2) {
9229 if (kind1 != kind) PyMem_Free(buf1);
9230 return NULL;
9231 }
9232 len1 = PyUnicode_GET_LENGTH(self);
9233 len2 = PyUnicode_GET_LENGTH(substring);
9234
9235 switch(kind) {
9236 case PyUnicode_1BYTE_KIND:
9237 out = ucs1lib_rsplit(
9238 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9239 break;
9240 case PyUnicode_2BYTE_KIND:
9241 out = ucs2lib_rsplit(
9242 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9243 break;
9244 case PyUnicode_4BYTE_KIND:
9245 out = ucs4lib_rsplit(
9246 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9247 break;
9248 default:
9249 out = NULL;
9250 }
9251 if (kind1 != kind)
9252 PyMem_Free(buf1);
9253 if (kind2 != kind)
9254 PyMem_Free(buf2);
9255 return out;
9256}
9257
9258static Py_ssize_t
9259anylib_find(int kind, void *buf1, Py_ssize_t len1,
9260 void *buf2, Py_ssize_t len2, Py_ssize_t offset)
9261{
9262 switch(kind) {
9263 case PyUnicode_1BYTE_KIND:
9264 return ucs1lib_find(buf1, len1, buf2, len2, offset);
9265 case PyUnicode_2BYTE_KIND:
9266 return ucs2lib_find(buf1, len1, buf2, len2, offset);
9267 case PyUnicode_4BYTE_KIND:
9268 return ucs4lib_find(buf1, len1, buf2, len2, offset);
9269 }
9270 assert(0);
9271 return -1;
9272}
9273
9274static Py_ssize_t
9275anylib_count(int kind, void* sbuf, Py_ssize_t slen,
9276 void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
9277{
9278 switch(kind) {
9279 case PyUnicode_1BYTE_KIND:
9280 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
9281 case PyUnicode_2BYTE_KIND:
9282 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
9283 case PyUnicode_4BYTE_KIND:
9284 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
9285 }
9286 assert(0);
9287 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009288}
9289
Alexander Belopolsky40018472011-02-26 01:02:56 +00009290static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009291replace(PyObject *self, PyObject *str1,
9292 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009293{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009294 PyObject *u;
9295 char *sbuf = PyUnicode_DATA(self);
9296 char *buf1 = PyUnicode_DATA(str1);
9297 char *buf2 = PyUnicode_DATA(str2);
9298 int srelease = 0, release1 = 0, release2 = 0;
9299 int skind = PyUnicode_KIND(self);
9300 int kind1 = PyUnicode_KIND(str1);
9301 int kind2 = PyUnicode_KIND(str2);
9302 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
9303 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
9304 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009305
9306 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009307 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009308 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009309 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009310
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009311 if (skind < kind1)
9312 /* substring too wide to be present */
9313 goto nothing;
9314
9315 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00009316 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009317 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009318 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009319 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009320 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009321 /* replace characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009322 Py_UCS4 u1, u2, maxchar;
9323 int mayshrink, rkind;
9324 u1 = PyUnicode_READ_CHAR(str1, 0);
9325 if (!findchar(sbuf, PyUnicode_KIND(self),
9326 slen, u1, 1))
Thomas Wouters477c8d52006-05-27 19:21:47 +00009327 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009328 u2 = PyUnicode_READ_CHAR(str2, 0);
9329 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9330 /* Replacing u1 with u2 may cause a maxchar reduction in the
9331 result string. */
9332 mayshrink = maxchar > 127;
9333 if (u2 > maxchar) {
9334 maxchar = u2;
9335 mayshrink = 0;
9336 }
9337 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009338 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009339 goto error;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009340 if (PyUnicode_CopyCharacters(u, 0,
9341 (PyObject*)self, 0, slen) < 0)
9342 {
9343 Py_DECREF(u);
9344 return NULL;
9345 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009346 rkind = PyUnicode_KIND(u);
9347 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
9348 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009349 if (--maxcount < 0)
9350 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009351 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009352 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009353 if (mayshrink) {
9354 PyObject *tmp = u;
9355 u = PyUnicode_FromKindAndData(rkind, PyUnicode_DATA(tmp),
9356 PyUnicode_GET_LENGTH(tmp));
9357 Py_DECREF(tmp);
9358 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009359 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009360 int rkind = skind;
9361 char *res;
9362 if (kind1 < rkind) {
9363 /* widen substring */
9364 buf1 = _PyUnicode_AsKind(str1, rkind);
9365 if (!buf1) goto error;
9366 release1 = 1;
9367 }
9368 i = anylib_find(rkind, sbuf, slen, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009369 if (i < 0)
9370 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009371 if (rkind > kind2) {
9372 /* widen replacement */
9373 buf2 = _PyUnicode_AsKind(str2, rkind);
9374 if (!buf2) goto error;
9375 release2 = 1;
9376 }
9377 else if (rkind < kind2) {
9378 /* widen self and buf1 */
9379 rkind = kind2;
9380 if (release1) PyMem_Free(buf1);
9381 sbuf = _PyUnicode_AsKind(self, rkind);
9382 if (!sbuf) goto error;
9383 srelease = 1;
9384 buf1 = _PyUnicode_AsKind(str1, rkind);
9385 if (!buf1) goto error;
9386 release1 = 1;
9387 }
9388 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, slen));
9389 if (!res) {
9390 PyErr_NoMemory();
9391 goto error;
9392 }
9393 memcpy(res, sbuf, PyUnicode_KIND_SIZE(rkind, slen));
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009394 /* change everything in-place, starting with this one */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009395 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
9396 buf2,
9397 PyUnicode_KIND_SIZE(rkind, len2));
9398 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009399
9400 while ( --maxcount > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009401 i = anylib_find(rkind, sbuf+PyUnicode_KIND_SIZE(rkind, i),
9402 slen-i,
9403 buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009404 if (i == -1)
9405 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009406 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
9407 buf2,
9408 PyUnicode_KIND_SIZE(rkind, len2));
9409 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009410 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009411
9412 u = PyUnicode_FromKindAndData(rkind, res, slen);
9413 PyMem_Free(res);
9414 if (!u) goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009415 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009416 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009417
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009418 Py_ssize_t n, i, j, ires;
9419 Py_ssize_t product, new_size;
9420 int rkind = skind;
9421 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009422
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009423 if (kind1 < rkind) {
9424 buf1 = _PyUnicode_AsKind(str1, rkind);
9425 if (!buf1) goto error;
9426 release1 = 1;
9427 }
9428 n = anylib_count(rkind, sbuf, slen, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009429 if (n == 0)
9430 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009431 if (kind2 < rkind) {
9432 buf2 = _PyUnicode_AsKind(str2, rkind);
9433 if (!buf2) goto error;
9434 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009435 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009436 else if (kind2 > rkind) {
9437 rkind = kind2;
9438 sbuf = _PyUnicode_AsKind(self, rkind);
9439 if (!sbuf) goto error;
9440 srelease = 1;
9441 if (release1) PyMem_Free(buf1);
9442 buf1 = _PyUnicode_AsKind(str1, rkind);
9443 if (!buf1) goto error;
9444 release1 = 1;
9445 }
9446 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
9447 PyUnicode_GET_LENGTH(str1))); */
9448 product = n * (len2-len1);
9449 if ((product / (len2-len1)) != n) {
9450 PyErr_SetString(PyExc_OverflowError,
9451 "replace string is too long");
9452 goto error;
9453 }
9454 new_size = slen + product;
9455 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
9456 PyErr_SetString(PyExc_OverflowError,
9457 "replace string is too long");
9458 goto error;
9459 }
9460 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, new_size));
9461 if (!res)
9462 goto error;
9463 ires = i = 0;
9464 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009465 while (n-- > 0) {
9466 /* look for next match */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009467 j = anylib_find(rkind,
9468 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9469 slen-i, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009470 if (j == -1)
9471 break;
9472 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009473 /* copy unchanged part [i:j] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009474 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9475 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9476 PyUnicode_KIND_SIZE(rkind, j-i));
9477 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009478 }
9479 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009480 if (len2 > 0) {
9481 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9482 buf2,
9483 PyUnicode_KIND_SIZE(rkind, len2));
9484 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009485 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009486 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009487 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009488 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009489 /* copy tail [i:] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009490 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9491 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9492 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009493 } else {
9494 /* interleave */
9495 while (n > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009496 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9497 buf2,
9498 PyUnicode_KIND_SIZE(rkind, len2));
9499 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009500 if (--n <= 0)
9501 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009502 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9503 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9504 PyUnicode_KIND_SIZE(rkind, 1));
9505 ires++;
9506 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009507 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009508 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9509 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9510 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009511 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009512 u = PyUnicode_FromKindAndData(rkind, res, new_size);
Martin v. Löwis0b1d3482011-10-01 16:35:40 +02009513 PyMem_Free(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009514 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009515 if (srelease)
9516 PyMem_FREE(sbuf);
9517 if (release1)
9518 PyMem_FREE(buf1);
9519 if (release2)
9520 PyMem_FREE(buf2);
9521 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009522
Benjamin Peterson29060642009-01-31 22:14:21 +00009523 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00009524 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009525 if (srelease)
9526 PyMem_FREE(sbuf);
9527 if (release1)
9528 PyMem_FREE(buf1);
9529 if (release2)
9530 PyMem_FREE(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009531 if (PyUnicode_CheckExact(self)) {
9532 Py_INCREF(self);
9533 return (PyObject *) self;
9534 }
Victor Stinner034f6cf2011-09-30 02:26:44 +02009535 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009536 error:
9537 if (srelease && sbuf)
9538 PyMem_FREE(sbuf);
9539 if (release1 && buf1)
9540 PyMem_FREE(buf1);
9541 if (release2 && buf2)
9542 PyMem_FREE(buf2);
9543 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009544}
9545
9546/* --- Unicode Object Methods --------------------------------------------- */
9547
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009548PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009549 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009550\n\
9551Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009552characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009553
9554static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009555unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009556{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009557 return fixup(self, fixtitle);
9558}
9559
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009560PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009561 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009562\n\
9563Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00009564have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009565
9566static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009567unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009568{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009569 return fixup(self, fixcapitalize);
9570}
9571
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled implementation of S.capwords(): split `self` on whitespace,
   run every word through fixup() with fixcapitalize, then join the words
   with the default separator of PyUnicode_Join().  Returns NULL if any
   step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t idx = 0;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping it into the list in place */
    while (idx < PyList_GET_SIZE(words)) {
        PyObject *capitalized = fixup(
            (PyUnicodeObject *)PyList_GET_ITEM(words, idx),
            fixcapitalize);
        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capitalized);
        idx++;
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
9609
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009610/* Argument converter. Coerces to a single unicode character */
9611
9612static int
9613convert_uc(PyObject *obj, void *addr)
9614{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009615 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009616 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009617
Benjamin Peterson14339b62009-01-31 16:36:08 +00009618 uniobj = PyUnicode_FromObject(obj);
9619 if (uniobj == NULL) {
9620 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009621 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009622 return 0;
9623 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009624 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009625 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009626 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009627 Py_DECREF(uniobj);
9628 return 0;
9629 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009630 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009631 Py_DECREF(uniobj);
9632 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009633}
9634
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009635PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009636 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009637\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009638Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009639done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009640
9641static PyObject *
9642unicode_center(PyUnicodeObject *self, PyObject *args)
9643{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009644 Py_ssize_t marg, left;
9645 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009646 Py_UCS4 fillchar = ' ';
9647
Victor Stinnere9a29352011-10-01 02:14:59 +02009648 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009649 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009650
Victor Stinnere9a29352011-10-01 02:14:59 +02009651 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009652 return NULL;
9653
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009654 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009655 Py_INCREF(self);
9656 return (PyObject*) self;
9657 }
9658
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009659 marg = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009660 left = marg / 2 + (marg & width & 1);
9661
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009662 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009663}
9664
Marc-André Lemburge5034372000-08-08 08:04:29 +00009665#if 0
9666
9667/* This code should go into some future Unicode collation support
9668 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00009669 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00009670
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009671/* speedy UTF-16 code point order comparison */
9672/* gleaned from: */
9673/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
9674
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009675static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009676{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009677 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00009678 0, 0, 0, 0, 0, 0, 0, 0,
9679 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009680 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009681};
9682
Guido van Rossumd57fd912000-03-10 22:53:23 +00009683static int
9684unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9685{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009686 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009687
Guido van Rossumd57fd912000-03-10 22:53:23 +00009688 Py_UNICODE *s1 = str1->str;
9689 Py_UNICODE *s2 = str2->str;
9690
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009691 len1 = str1->_base._base.length;
9692 len2 = str2->_base._base.length;
Tim Petersced69f82003-09-16 20:30:58 +00009693
Guido van Rossumd57fd912000-03-10 22:53:23 +00009694 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00009695 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009696
9697 c1 = *s1++;
9698 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00009699
Benjamin Peterson29060642009-01-31 22:14:21 +00009700 if (c1 > (1<<11) * 26)
9701 c1 += utf16Fixup[c1>>11];
9702 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009703 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009704 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00009705
9706 if (c1 != c2)
9707 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00009708
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009709 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009710 }
9711
9712 return (len1 < len2) ? -1 : (len1 != len2);
9713}
9714
Marc-André Lemburge5034372000-08-08 08:04:29 +00009715#else
9716
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009717/* This function assumes that str1 and str2 are readied by the caller. */
9718
Marc-André Lemburge5034372000-08-08 08:04:29 +00009719static int
9720unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9721{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009722 int kind1, kind2;
9723 void *data1, *data2;
9724 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009725
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009726 kind1 = PyUnicode_KIND(str1);
9727 kind2 = PyUnicode_KIND(str2);
9728 data1 = PyUnicode_DATA(str1);
9729 data2 = PyUnicode_DATA(str2);
9730 len1 = PyUnicode_GET_LENGTH(str1);
9731 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +00009732
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009733 for (i = 0; i < len1 && i < len2; ++i) {
9734 Py_UCS4 c1, c2;
9735 c1 = PyUnicode_READ(kind1, data1, i);
9736 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +00009737
9738 if (c1 != c2)
9739 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009740 }
9741
9742 return (len1 < len2) ? -1 : (len1 != len2);
9743}
9744
9745#endif
9746
Alexander Belopolsky40018472011-02-26 01:02:56 +00009747int
9748PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009749{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009750 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9751 if (PyUnicode_READY(left) == -1 ||
9752 PyUnicode_READY(right) == -1)
9753 return -1;
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009754 return unicode_compare((PyUnicodeObject *)left,
9755 (PyUnicodeObject *)right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009756 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009757 PyErr_Format(PyExc_TypeError,
9758 "Can't compare %.100s and %.100s",
9759 left->ob_type->tp_name,
9760 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009761 return -1;
9762}
9763
Martin v. Löwis5b222132007-06-10 09:51:05 +00009764int
9765PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
9766{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009767 Py_ssize_t i;
9768 int kind;
9769 void *data;
9770 Py_UCS4 chr;
9771
Victor Stinner910337b2011-10-03 03:20:16 +02009772 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009773 if (PyUnicode_READY(uni) == -1)
9774 return -1;
9775 kind = PyUnicode_KIND(uni);
9776 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +00009777 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009778 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
9779 if (chr != str[i])
9780 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00009781 /* This check keeps Python strings that end in '\0' from comparing equal
9782 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009783 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +00009784 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009785 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00009786 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009787 return 0;
9788}
9789
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009790
Benjamin Peterson29060642009-01-31 22:14:21 +00009791#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00009792 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009793
Alexander Belopolsky40018472011-02-26 01:02:56 +00009794PyObject *
9795PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009796{
9797 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009798
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009799 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9800 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009801 if (PyUnicode_READY(left) == -1 ||
9802 PyUnicode_READY(right) == -1)
9803 return NULL;
9804 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
9805 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009806 if (op == Py_EQ) {
9807 Py_INCREF(Py_False);
9808 return Py_False;
9809 }
9810 if (op == Py_NE) {
9811 Py_INCREF(Py_True);
9812 return Py_True;
9813 }
9814 }
9815 if (left == right)
9816 result = 0;
9817 else
9818 result = unicode_compare((PyUnicodeObject *)left,
9819 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009820
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009821 /* Convert the return value to a Boolean */
9822 switch (op) {
9823 case Py_EQ:
9824 v = TEST_COND(result == 0);
9825 break;
9826 case Py_NE:
9827 v = TEST_COND(result != 0);
9828 break;
9829 case Py_LE:
9830 v = TEST_COND(result <= 0);
9831 break;
9832 case Py_GE:
9833 v = TEST_COND(result >= 0);
9834 break;
9835 case Py_LT:
9836 v = TEST_COND(result == -1);
9837 break;
9838 case Py_GT:
9839 v = TEST_COND(result == 1);
9840 break;
9841 default:
9842 PyErr_BadArgument();
9843 return NULL;
9844 }
9845 Py_INCREF(v);
9846 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009847 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00009848
Brian Curtindfc80e32011-08-10 20:28:54 -05009849 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009850}
9851
Alexander Belopolsky40018472011-02-26 01:02:56 +00009852int
9853PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00009854{
Thomas Wouters477c8d52006-05-27 19:21:47 +00009855 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009856 int kind1, kind2, kind;
9857 void *buf1, *buf2;
9858 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009859 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009860
9861 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00009862 sub = PyUnicode_FromObject(element);
9863 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009864 PyErr_Format(PyExc_TypeError,
9865 "'in <string>' requires string as left operand, not %s",
9866 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009867 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009868 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009869 if (PyUnicode_READY(sub) == -1)
9870 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009871
Thomas Wouters477c8d52006-05-27 19:21:47 +00009872 str = PyUnicode_FromObject(container);
Victor Stinnere9a29352011-10-01 02:14:59 +02009873 if (!str || PyUnicode_READY(str) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009874 Py_DECREF(sub);
9875 return -1;
9876 }
9877
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009878 kind1 = PyUnicode_KIND(str);
9879 kind2 = PyUnicode_KIND(sub);
9880 kind = kind1 > kind2 ? kind1 : kind2;
9881 buf1 = PyUnicode_DATA(str);
9882 buf2 = PyUnicode_DATA(sub);
9883 if (kind1 != kind)
9884 buf1 = _PyUnicode_AsKind((PyObject*)str, kind);
9885 if (!buf1) {
9886 Py_DECREF(sub);
9887 return -1;
9888 }
9889 if (kind2 != kind)
9890 buf2 = _PyUnicode_AsKind((PyObject*)sub, kind);
9891 if (!buf2) {
9892 Py_DECREF(sub);
9893 if (kind1 != kind) PyMem_Free(buf1);
9894 return -1;
9895 }
9896 len1 = PyUnicode_GET_LENGTH(str);
9897 len2 = PyUnicode_GET_LENGTH(sub);
9898
9899 switch(kind) {
9900 case PyUnicode_1BYTE_KIND:
9901 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
9902 break;
9903 case PyUnicode_2BYTE_KIND:
9904 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
9905 break;
9906 case PyUnicode_4BYTE_KIND:
9907 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
9908 break;
9909 default:
9910 result = -1;
9911 assert(0);
9912 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009913
9914 Py_DECREF(str);
9915 Py_DECREF(sub);
9916
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009917 if (kind1 != kind)
9918 PyMem_Free(buf1);
9919 if (kind2 != kind)
9920 PyMem_Free(buf2);
9921
Guido van Rossum403d68b2000-03-13 15:55:09 +00009922 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009923}
9924
Guido van Rossumd57fd912000-03-10 22:53:23 +00009925/* Concat to string or Unicode object giving a new Unicode object. */
9926
Alexander Belopolsky40018472011-02-26 01:02:56 +00009927PyObject *
9928PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009929{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009930 PyObject *u = NULL, *v = NULL, *w;
9931 Py_UCS4 maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009932
9933 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009934 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009935 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009936 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009937 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009938 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009939 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009940
9941 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +02009942 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009943 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009944 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009945 }
Victor Stinnera464fc12011-10-02 20:39:30 +02009946 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009947 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009948 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009949 }
9950
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009951 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinnerff9e50f2011-09-28 22:17:19 +02009952 maxchar = Py_MAX(maxchar, PyUnicode_MAX_CHAR_VALUE(v));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009953
Guido van Rossumd57fd912000-03-10 22:53:23 +00009954 /* Concat the two Unicode strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009955 w = PyUnicode_New(
9956 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
9957 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009958 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009959 goto onError;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009960 if (PyUnicode_CopyCharacters(w, 0, u, 0, PyUnicode_GET_LENGTH(u)) < 0)
9961 goto onError;
Victor Stinner157f83f2011-09-28 21:41:31 +02009962 if (PyUnicode_CopyCharacters(w, PyUnicode_GET_LENGTH(u),
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009963 v, 0,
9964 PyUnicode_GET_LENGTH(v)) < 0)
9965 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009966 Py_DECREF(u);
9967 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009968 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009969
Benjamin Peterson29060642009-01-31 22:14:21 +00009970 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009971 Py_XDECREF(u);
9972 Py_XDECREF(v);
9973 return NULL;
9974}
9975
Victor Stinnerb0923652011-10-04 01:17:31 +02009976static void
9977unicode_append_inplace(PyObject **p_left, PyObject *right)
9978{
9979 Py_ssize_t left_len, right_len, new_len;
9980#ifdef Py_DEBUG
9981 Py_ssize_t copied;
9982#endif
9983
9984 assert(PyUnicode_IS_READY(*p_left));
9985 assert(PyUnicode_IS_READY(right));
9986
9987 left_len = PyUnicode_GET_LENGTH(*p_left);
9988 right_len = PyUnicode_GET_LENGTH(right);
9989 if (left_len > PY_SSIZE_T_MAX - right_len) {
9990 PyErr_SetString(PyExc_OverflowError,
9991 "strings are too large to concat");
9992 goto error;
9993 }
9994 new_len = left_len + right_len;
9995
9996 /* Now we own the last reference to 'left', so we can resize it
9997 * in-place.
9998 */
9999 if (unicode_resize(p_left, new_len) != 0) {
10000 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10001 * deallocated so it cannot be put back into
10002 * 'variable'. The MemoryError is raised when there
10003 * is no value in 'variable', which might (very
10004 * remotely) be a cause of incompatibilities.
10005 */
10006 goto error;
10007 }
10008 /* copy 'right' into the newly allocated area of 'left' */
10009#ifdef Py_DEBUG
10010 copied = PyUnicode_CopyCharacters(*p_left, left_len,
10011 right, 0,
10012 right_len);
10013 assert(0 <= copied);
10014#else
10015 PyUnicode_CopyCharacters(*p_left, left_len, right, 0, right_len);
10016#endif
10017 return;
10018
10019error:
10020 Py_DECREF(*p_left);
10021 *p_left = NULL;
10022}
10023
Walter Dörwald1ab83302007-05-18 17:15:44 +000010024void
Victor Stinner23e56682011-10-03 03:54:37 +020010025PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010026{
Victor Stinner23e56682011-10-03 03:54:37 +020010027 PyObject *left, *res;
10028
10029 if (p_left == NULL) {
10030 if (!PyErr_Occurred())
10031 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010032 return;
10033 }
Victor Stinner23e56682011-10-03 03:54:37 +020010034 left = *p_left;
10035 if (right == NULL || !PyUnicode_Check(left)) {
10036 if (!PyErr_Occurred())
10037 PyErr_BadInternalCall();
10038 goto error;
10039 }
10040
10041 if (PyUnicode_CheckExact(left) && left != unicode_empty
10042 && PyUnicode_CheckExact(right) && right != unicode_empty
10043 && unicode_resizable(left)
10044 && (_PyUnicode_KIND(right) <= _PyUnicode_KIND(left)
10045 || _PyUnicode_WSTR(left) != NULL))
10046 {
Victor Stinner23e56682011-10-03 03:54:37 +020010047 if (PyUnicode_READY(left))
10048 goto error;
10049 if (PyUnicode_READY(right))
10050 goto error;
10051
Victor Stinnerb0923652011-10-04 01:17:31 +020010052 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10053 to change the structure size, but characters are stored just after
10054 the structure, and so it requires to move all charactres which is
10055 not so different than duplicating the string. */
10056 if (!(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
Victor Stinner23e56682011-10-03 03:54:37 +020010057 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010058 unicode_append_inplace(p_left, right);
Victor Stinner23e56682011-10-03 03:54:37 +020010059 return;
10060 }
10061 }
10062
10063 res = PyUnicode_Concat(left, right);
10064 if (res == NULL)
10065 goto error;
10066 Py_DECREF(left);
10067 *p_left = res;
10068 return;
10069
10070error:
10071 Py_DECREF(*p_left);
10072 *p_left = NULL;
Walter Dörwald1ab83302007-05-18 17:15:44 +000010073}
10074
10075void
10076PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10077{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010078 PyUnicode_Append(pleft, right);
10079 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010080}
10081
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010082PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010083 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010084\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010085Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010086string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010087interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010088
10089static PyObject *
10090unicode_count(PyUnicodeObject *self, PyObject *args)
10091{
10092 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010093 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010094 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010095 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010096 int kind1, kind2, kind;
10097 void *buf1, *buf2;
10098 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010099
Jesus Ceaac451502011-04-20 17:09:23 +020010100 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10101 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010102 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010103
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010104 kind1 = PyUnicode_KIND(self);
10105 kind2 = PyUnicode_KIND(substring);
10106 kind = kind1 > kind2 ? kind1 : kind2;
10107 buf1 = PyUnicode_DATA(self);
10108 buf2 = PyUnicode_DATA(substring);
10109 if (kind1 != kind)
10110 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
10111 if (!buf1) {
10112 Py_DECREF(substring);
10113 return NULL;
10114 }
10115 if (kind2 != kind)
10116 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
10117 if (!buf2) {
10118 Py_DECREF(substring);
10119 if (kind1 != kind) PyMem_Free(buf1);
10120 return NULL;
10121 }
10122 len1 = PyUnicode_GET_LENGTH(self);
10123 len2 = PyUnicode_GET_LENGTH(substring);
10124
10125 ADJUST_INDICES(start, end, len1);
10126 switch(kind) {
10127 case PyUnicode_1BYTE_KIND:
10128 iresult = ucs1lib_count(
10129 ((Py_UCS1*)buf1) + start, end - start,
10130 buf2, len2, PY_SSIZE_T_MAX
10131 );
10132 break;
10133 case PyUnicode_2BYTE_KIND:
10134 iresult = ucs2lib_count(
10135 ((Py_UCS2*)buf1) + start, end - start,
10136 buf2, len2, PY_SSIZE_T_MAX
10137 );
10138 break;
10139 case PyUnicode_4BYTE_KIND:
10140 iresult = ucs4lib_count(
10141 ((Py_UCS4*)buf1) + start, end - start,
10142 buf2, len2, PY_SSIZE_T_MAX
10143 );
10144 break;
10145 default:
10146 assert(0); iresult = 0;
10147 }
10148
10149 result = PyLong_FromSsize_t(iresult);
10150
10151 if (kind1 != kind)
10152 PyMem_Free(buf1);
10153 if (kind2 != kind)
10154 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010155
10156 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010157
Guido van Rossumd57fd912000-03-10 22:53:23 +000010158 return result;
10159}
10160
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010161PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010162 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010163\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010164Encode S using the codec registered for encoding. Default encoding\n\
10165is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010166handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010167a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10168'xmlcharrefreplace' as well as any other name registered with\n\
10169codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010170
10171static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +000010172unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010173{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010174 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010175 char *encoding = NULL;
10176 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010177
Benjamin Peterson308d6372009-09-18 21:42:35 +000010178 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10179 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010180 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +000010181 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010182}
10183
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010184PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010185 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010186\n\
10187Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010188If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010189
10190static PyObject*
10191unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
10192{
Antoine Pitroue71d5742011-10-04 15:55:09 +020010193 Py_ssize_t i, j, line_pos, src_len, incr;
10194 Py_UCS4 ch;
10195 PyObject *u;
10196 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010197 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010198 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010199
10200 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010201 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010202
Thomas Wouters7e474022000-07-16 12:04:32 +000010203 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010204 src_len = PyUnicode_GET_LENGTH(self);
10205 i = j = line_pos = 0;
10206 kind = PyUnicode_KIND(self);
10207 src_data = PyUnicode_DATA(self);
10208 for (; i < src_len; i++) {
10209 ch = PyUnicode_READ(kind, src_data, i);
10210 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010211 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010212 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000010213 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010214 goto overflow;
10215 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010216 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010217 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010218 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010219 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000010220 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010221 goto overflow;
10222 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010223 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010224 if (ch == '\n' || ch == '\r')
10225 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010226 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010227 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +000010228
Guido van Rossumd57fd912000-03-10 22:53:23 +000010229 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010230 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010231 if (!u)
10232 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010233 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010234
Antoine Pitroue71d5742011-10-04 15:55:09 +020010235 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010236
Antoine Pitroue71d5742011-10-04 15:55:09 +020010237 for (; i < src_len; i++) {
10238 ch = PyUnicode_READ(kind, src_data, i);
10239 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010240 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010241 incr = tabsize - (line_pos % tabsize);
10242 line_pos += incr;
10243 while (incr--) {
10244 PyUnicode_WRITE(kind, dest_data, j, ' ');
10245 j++;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010246 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010247 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010248 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010249 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010250 line_pos++;
10251 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010252 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010253 if (ch == '\n' || ch == '\r')
10254 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010255 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010256 }
10257 assert (j == PyUnicode_GET_LENGTH(u));
10258 if (PyUnicode_READY(u)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010259 Py_DECREF(u);
10260 return NULL;
10261 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010262 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010263
Antoine Pitroue71d5742011-10-04 15:55:09 +020010264 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010265 PyErr_SetString(PyExc_OverflowError, "new string is too long");
10266 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010267}
10268
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010269PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010270 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010271\n\
10272Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010273such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010274arguments start and end are interpreted as in slice notation.\n\
10275\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010276Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010277
10278static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010279unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010280{
Jesus Ceaac451502011-04-20 17:09:23 +020010281 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010282 Py_ssize_t start;
10283 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010284 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010285
Jesus Ceaac451502011-04-20 17:09:23 +020010286 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
10287 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010288 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010289
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010290 if (PyUnicode_READY(self) == -1)
10291 return NULL;
10292 if (PyUnicode_READY(substring) == -1)
10293 return NULL;
10294
10295 result = any_find_slice(
10296 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
10297 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010298 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010299
10300 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010301
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010302 if (result == -2)
10303 return NULL;
10304
Christian Heimes217cfd12007-12-02 14:31:20 +000010305 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010306}
10307
10308static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010309unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010310{
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010311 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
10312 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010313 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010314 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010315}
10316
Guido van Rossumc2504932007-09-18 19:42:40 +000010317/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010010318 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000010319static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +000010320unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010321{
Guido van Rossumc2504932007-09-18 19:42:40 +000010322 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010010323 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010324
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010325 if (_PyUnicode_HASH(self) != -1)
10326 return _PyUnicode_HASH(self);
10327 if (PyUnicode_READY(self) == -1)
10328 return -1;
10329 len = PyUnicode_GET_LENGTH(self);
10330
10331 /* The hash function as a macro, gets expanded three times below. */
10332#define HASH(P) \
10333 x = (Py_uhash_t)*P << 7; \
10334 while (--len >= 0) \
10335 x = (1000003*x) ^ (Py_uhash_t)*P++;
10336
10337 switch (PyUnicode_KIND(self)) {
10338 case PyUnicode_1BYTE_KIND: {
10339 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
10340 HASH(c);
10341 break;
10342 }
10343 case PyUnicode_2BYTE_KIND: {
10344 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
10345 HASH(s);
10346 break;
10347 }
10348 default: {
10349 Py_UCS4 *l;
10350 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
10351 "Impossible switch case in unicode_hash");
10352 l = PyUnicode_4BYTE_DATA(self);
10353 HASH(l);
10354 break;
10355 }
10356 }
10357 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
10358
Guido van Rossumc2504932007-09-18 19:42:40 +000010359 if (x == -1)
10360 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010361 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010362 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010363}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010364#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000010365
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010366PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010367 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010368\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010369Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010370
10371static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010372unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010373{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010374 Py_ssize_t result;
Jesus Ceaac451502011-04-20 17:09:23 +020010375 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010376 Py_ssize_t start;
10377 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010378
Jesus Ceaac451502011-04-20 17:09:23 +020010379 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
10380 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010381 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010382
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010383 if (PyUnicode_READY(self) == -1)
10384 return NULL;
10385 if (PyUnicode_READY(substring) == -1)
10386 return NULL;
10387
10388 result = any_find_slice(
10389 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
10390 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010391 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010392
10393 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010394
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010395 if (result == -2)
10396 return NULL;
10397
Guido van Rossumd57fd912000-03-10 22:53:23 +000010398 if (result < 0) {
10399 PyErr_SetString(PyExc_ValueError, "substring not found");
10400 return NULL;
10401 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010402
Christian Heimes217cfd12007-12-02 14:31:20 +000010403 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010404}
10405
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010406PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010407 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010408\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010409Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010410at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010411
10412static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010413unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010414{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010415 Py_ssize_t i, length;
10416 int kind;
10417 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010418 int cased;
10419
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010420 if (PyUnicode_READY(self) == -1)
10421 return NULL;
10422 length = PyUnicode_GET_LENGTH(self);
10423 kind = PyUnicode_KIND(self);
10424 data = PyUnicode_DATA(self);
10425
Guido van Rossumd57fd912000-03-10 22:53:23 +000010426 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010427 if (length == 1)
10428 return PyBool_FromLong(
10429 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010430
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010431 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010432 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010433 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010434
Guido van Rossumd57fd912000-03-10 22:53:23 +000010435 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010436 for (i = 0; i < length; i++) {
10437 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010438
Benjamin Peterson29060642009-01-31 22:14:21 +000010439 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
10440 return PyBool_FromLong(0);
10441 else if (!cased && Py_UNICODE_ISLOWER(ch))
10442 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010443 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010444 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010445}
10446
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010447PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010448 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010449\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010450Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010451at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010452
10453static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010454unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010455{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010456 Py_ssize_t i, length;
10457 int kind;
10458 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010459 int cased;
10460
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010461 if (PyUnicode_READY(self) == -1)
10462 return NULL;
10463 length = PyUnicode_GET_LENGTH(self);
10464 kind = PyUnicode_KIND(self);
10465 data = PyUnicode_DATA(self);
10466
Guido van Rossumd57fd912000-03-10 22:53:23 +000010467 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010468 if (length == 1)
10469 return PyBool_FromLong(
10470 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010471
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010472 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010473 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010474 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010475
Guido van Rossumd57fd912000-03-10 22:53:23 +000010476 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010477 for (i = 0; i < length; i++) {
10478 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010479
Benjamin Peterson29060642009-01-31 22:14:21 +000010480 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
10481 return PyBool_FromLong(0);
10482 else if (!cased && Py_UNICODE_ISUPPER(ch))
10483 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010484 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010485 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010486}
10487
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010488PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010489 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010490\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010491Return True if S is a titlecased string and there is at least one\n\
10492character in S, i.e. upper- and titlecase characters may only\n\
10493follow uncased characters and lowercase characters only cased ones.\n\
10494Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010495
10496static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010497unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010498{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010499 Py_ssize_t i, length;
10500 int kind;
10501 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010502 int cased, previous_is_cased;
10503
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010504 if (PyUnicode_READY(self) == -1)
10505 return NULL;
10506 length = PyUnicode_GET_LENGTH(self);
10507 kind = PyUnicode_KIND(self);
10508 data = PyUnicode_DATA(self);
10509
Guido van Rossumd57fd912000-03-10 22:53:23 +000010510 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010511 if (length == 1) {
10512 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10513 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
10514 (Py_UNICODE_ISUPPER(ch) != 0));
10515 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010516
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010517 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010518 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010519 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010520
Guido van Rossumd57fd912000-03-10 22:53:23 +000010521 cased = 0;
10522 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010523 for (i = 0; i < length; i++) {
10524 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010525
Benjamin Peterson29060642009-01-31 22:14:21 +000010526 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
10527 if (previous_is_cased)
10528 return PyBool_FromLong(0);
10529 previous_is_cased = 1;
10530 cased = 1;
10531 }
10532 else if (Py_UNICODE_ISLOWER(ch)) {
10533 if (!previous_is_cased)
10534 return PyBool_FromLong(0);
10535 previous_is_cased = 1;
10536 cased = 1;
10537 }
10538 else
10539 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010540 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010541 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010542}
10543
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010544PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010545 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010546\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010547Return True if all characters in S are whitespace\n\
10548and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010549
10550static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010551unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010552{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010553 Py_ssize_t i, length;
10554 int kind;
10555 void *data;
10556
10557 if (PyUnicode_READY(self) == -1)
10558 return NULL;
10559 length = PyUnicode_GET_LENGTH(self);
10560 kind = PyUnicode_KIND(self);
10561 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010562
Guido van Rossumd57fd912000-03-10 22:53:23 +000010563 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010564 if (length == 1)
10565 return PyBool_FromLong(
10566 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010567
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010568 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010569 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010570 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010571
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010572 for (i = 0; i < length; i++) {
10573 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010574 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010575 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010576 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010577 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010578}
10579
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010580PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010581 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010582\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010583Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010584and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010585
10586static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010587unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010588{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010589 Py_ssize_t i, length;
10590 int kind;
10591 void *data;
10592
10593 if (PyUnicode_READY(self) == -1)
10594 return NULL;
10595 length = PyUnicode_GET_LENGTH(self);
10596 kind = PyUnicode_KIND(self);
10597 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010598
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010599 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010600 if (length == 1)
10601 return PyBool_FromLong(
10602 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010603
10604 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010605 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010606 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010607
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010608 for (i = 0; i < length; i++) {
10609 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010610 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010611 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010612 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010613}
10614
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010615PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010616 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010617\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010618Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010619and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010620
10621static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010622unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010623{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010624 int kind;
10625 void *data;
10626 Py_ssize_t len, i;
10627
10628 if (PyUnicode_READY(self) == -1)
10629 return NULL;
10630
10631 kind = PyUnicode_KIND(self);
10632 data = PyUnicode_DATA(self);
10633 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010634
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010635 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010636 if (len == 1) {
10637 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10638 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
10639 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010640
10641 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010642 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010643 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010644
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010645 for (i = 0; i < len; i++) {
10646 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010647 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010648 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010649 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010650 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010651}
10652
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010653PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010654 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010655\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010656Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010657False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010658
10659static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010660unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010661{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010662 Py_ssize_t i, length;
10663 int kind;
10664 void *data;
10665
10666 if (PyUnicode_READY(self) == -1)
10667 return NULL;
10668 length = PyUnicode_GET_LENGTH(self);
10669 kind = PyUnicode_KIND(self);
10670 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010671
Guido van Rossumd57fd912000-03-10 22:53:23 +000010672 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010673 if (length == 1)
10674 return PyBool_FromLong(
10675 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010676
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010677 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010678 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010679 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010680
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010681 for (i = 0; i < length; i++) {
10682 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010683 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010684 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010685 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010686}
10687
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010688PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010689 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010690\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010691Return True if all characters in S are digits\n\
10692and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010693
10694static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010695unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010696{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010697 Py_ssize_t i, length;
10698 int kind;
10699 void *data;
10700
10701 if (PyUnicode_READY(self) == -1)
10702 return NULL;
10703 length = PyUnicode_GET_LENGTH(self);
10704 kind = PyUnicode_KIND(self);
10705 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010706
Guido van Rossumd57fd912000-03-10 22:53:23 +000010707 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010708 if (length == 1) {
10709 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10710 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
10711 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010712
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010713 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010714 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010715 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010716
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010717 for (i = 0; i < length; i++) {
10718 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010719 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010720 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010721 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010722}
10723
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010724PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010725 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010726\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010727Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010728False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010729
10730static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010731unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010732{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010733 Py_ssize_t i, length;
10734 int kind;
10735 void *data;
10736
10737 if (PyUnicode_READY(self) == -1)
10738 return NULL;
10739 length = PyUnicode_GET_LENGTH(self);
10740 kind = PyUnicode_KIND(self);
10741 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010742
Guido van Rossumd57fd912000-03-10 22:53:23 +000010743 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010744 if (length == 1)
10745 return PyBool_FromLong(
10746 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010747
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010748 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010749 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010750 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010751
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010752 for (i = 0; i < length; i++) {
10753 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010754 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010755 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010756 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010757}
10758
Martin v. Löwis47383402007-08-15 07:32:56 +000010759int
10760PyUnicode_IsIdentifier(PyObject *self)
10761{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010762 int kind;
10763 void *data;
10764 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010765 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000010766
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010767 if (PyUnicode_READY(self) == -1) {
10768 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000010769 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010770 }
10771
10772 /* Special case for empty strings */
10773 if (PyUnicode_GET_LENGTH(self) == 0)
10774 return 0;
10775 kind = PyUnicode_KIND(self);
10776 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000010777
10778 /* PEP 3131 says that the first character must be in
10779 XID_Start and subsequent characters in XID_Continue,
10780 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000010781 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000010782 letters, digits, underscore). However, given the current
10783 definition of XID_Start and XID_Continue, it is sufficient
10784 to check just for these, except that _ must be allowed
10785 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010786 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050010787 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000010788 return 0;
10789
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040010790 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010791 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010792 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000010793 return 1;
10794}
10795
10796PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010797 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000010798\n\
10799Return True if S is a valid identifier according\n\
10800to the language definition.");
10801
10802static PyObject*
10803unicode_isidentifier(PyObject *self)
10804{
10805 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
10806}
10807
Georg Brandl559e5d72008-06-11 18:37:52 +000010808PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010809 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000010810\n\
10811Return True if all characters in S are considered\n\
10812printable in repr() or S is empty, False otherwise.");
10813
10814static PyObject*
10815unicode_isprintable(PyObject *self)
10816{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010817 Py_ssize_t i, length;
10818 int kind;
10819 void *data;
10820
10821 if (PyUnicode_READY(self) == -1)
10822 return NULL;
10823 length = PyUnicode_GET_LENGTH(self);
10824 kind = PyUnicode_KIND(self);
10825 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000010826
10827 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010828 if (length == 1)
10829 return PyBool_FromLong(
10830 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000010831
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010832 for (i = 0; i < length; i++) {
10833 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000010834 Py_RETURN_FALSE;
10835 }
10836 }
10837 Py_RETURN_TRUE;
10838}
10839
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010840PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000010841 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010842\n\
10843Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000010844iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010845
10846static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010847unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010848{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010849 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010850}
10851
Martin v. Löwis18e16552006-02-15 17:27:45 +000010852static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +000010853unicode_length(PyUnicodeObject *self)
10854{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010855 if (PyUnicode_READY(self) == -1)
10856 return -1;
10857 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010858}
10859
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010860PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010861 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010862\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000010863Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010864done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010865
10866static PyObject *
10867unicode_ljust(PyUnicodeObject *self, PyObject *args)
10868{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010869 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010870 Py_UCS4 fillchar = ' ';
10871
10872 if (PyUnicode_READY(self) == -1)
10873 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010874
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010875 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010876 return NULL;
10877
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010878 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010879 Py_INCREF(self);
10880 return (PyObject*) self;
10881 }
10882
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010883 return (PyObject*) pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010884}
10885
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010886PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010887 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010888\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010889Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010890
10891static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010892unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010893{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010894 return fixup(self, fixlower);
10895}
10896
/* Strip modes.  Each value doubles as the index of the matching entry
   in stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Per-mode format strings, indexed by the strip mode constants above. */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Bare method name for strip mode i: the format string with its
   three-character "|O:" prefix skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
10905
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010906/* externally visible for str.strip(unicode) */
10907PyObject *
10908_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
10909{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010910 void *data;
10911 int kind;
10912 Py_ssize_t i, j, len;
10913 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010914
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010915 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
10916 return NULL;
10917
10918 kind = PyUnicode_KIND(self);
10919 data = PyUnicode_DATA(self);
10920 len = PyUnicode_GET_LENGTH(self);
10921 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
10922 PyUnicode_DATA(sepobj),
10923 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010924
Benjamin Peterson14339b62009-01-31 16:36:08 +000010925 i = 0;
10926 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010927 while (i < len &&
10928 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010929 i++;
10930 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010931 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010932
Benjamin Peterson14339b62009-01-31 16:36:08 +000010933 j = len;
10934 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010935 do {
10936 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010937 } while (j >= i &&
10938 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000010939 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010940 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010941
Victor Stinner12bab6d2011-10-01 01:53:49 +020010942 return PyUnicode_Substring((PyObject*)self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010943}
10944
10945PyObject*
10946PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
10947{
10948 unsigned char *data;
10949 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020010950 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010951
Victor Stinnerde636f32011-10-01 03:55:54 +020010952 if (PyUnicode_READY(self) == -1)
10953 return NULL;
10954
10955 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
10956
Victor Stinner12bab6d2011-10-01 01:53:49 +020010957 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010958 {
Victor Stinner12bab6d2011-10-01 01:53:49 +020010959 if (PyUnicode_CheckExact(self)) {
10960 Py_INCREF(self);
10961 return self;
10962 }
10963 else
10964 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010965 }
10966
Victor Stinner12bab6d2011-10-01 01:53:49 +020010967 length = end - start;
10968 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010969 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010970
Victor Stinnerde636f32011-10-01 03:55:54 +020010971 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020010972 PyErr_SetString(PyExc_IndexError, "string index out of range");
10973 return NULL;
10974 }
10975
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010976 kind = PyUnicode_KIND(self);
10977 data = PyUnicode_1BYTE_DATA(self);
Victor Stinner034f6cf2011-09-30 02:26:44 +020010978 return PyUnicode_FromKindAndData(kind,
10979 data + PyUnicode_KIND_SIZE(kind, start),
Victor Stinner12bab6d2011-10-01 01:53:49 +020010980 length);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010981}
Guido van Rossumd57fd912000-03-10 22:53:23 +000010982
10983static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010984do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010985{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010986 int kind;
10987 void *data;
10988 Py_ssize_t len, i, j;
10989
10990 if (PyUnicode_READY(self) == -1)
10991 return NULL;
10992
10993 kind = PyUnicode_KIND(self);
10994 data = PyUnicode_DATA(self);
10995 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010996
Benjamin Peterson14339b62009-01-31 16:36:08 +000010997 i = 0;
10998 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010999 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011000 i++;
11001 }
11002 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011003
Benjamin Peterson14339b62009-01-31 16:36:08 +000011004 j = len;
11005 if (striptype != LEFTSTRIP) {
11006 do {
11007 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011008 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011009 j++;
11010 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011011
Victor Stinner12bab6d2011-10-01 01:53:49 +020011012 return PyUnicode_Substring((PyObject*)self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011013}
11014
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011015
11016static PyObject *
11017do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
11018{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011019 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011020
Benjamin Peterson14339b62009-01-31 16:36:08 +000011021 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11022 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011023
Benjamin Peterson14339b62009-01-31 16:36:08 +000011024 if (sep != NULL && sep != Py_None) {
11025 if (PyUnicode_Check(sep))
11026 return _PyUnicode_XStrip(self, striptype, sep);
11027 else {
11028 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011029 "%s arg must be None or str",
11030 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011031 return NULL;
11032 }
11033 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011034
Benjamin Peterson14339b62009-01-31 16:36:08 +000011035 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011036}
11037
11038
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011039PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011040 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011041\n\
11042Return a copy of the string S with leading and trailing\n\
11043whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011044If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011045
11046static PyObject *
11047unicode_strip(PyUnicodeObject *self, PyObject *args)
11048{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011049 if (PyTuple_GET_SIZE(args) == 0)
11050 return do_strip(self, BOTHSTRIP); /* Common case */
11051 else
11052 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011053}
11054
11055
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011056PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011057 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011058\n\
11059Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011060If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011061
11062static PyObject *
11063unicode_lstrip(PyUnicodeObject *self, PyObject *args)
11064{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011065 if (PyTuple_GET_SIZE(args) == 0)
11066 return do_strip(self, LEFTSTRIP); /* Common case */
11067 else
11068 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011069}
11070
11071
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011072PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011073 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011074\n\
11075Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011076If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011077
11078static PyObject *
11079unicode_rstrip(PyUnicodeObject *self, PyObject *args)
11080{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011081 if (PyTuple_GET_SIZE(args) == 0)
11082 return do_strip(self, RIGHTSTRIP); /* Common case */
11083 else
11084 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011085}
11086
11087
Guido van Rossumd57fd912000-03-10 22:53:23 +000011088static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +000011089unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011090{
11091 PyUnicodeObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011092 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011093
Georg Brandl222de0f2009-04-12 12:01:50 +000011094 if (len < 1) {
11095 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020011096 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000011097 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011098
Tim Peters7a29bd52001-09-12 03:03:31 +000011099 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011100 /* no repeat, return original string */
11101 Py_INCREF(str);
11102 return (PyObject*) str;
11103 }
Tim Peters8f422462000-09-09 06:13:41 +000011104
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011105 if (PyUnicode_READY(str) == -1)
11106 return NULL;
11107
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011108 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011109 PyErr_SetString(PyExc_OverflowError,
11110 "repeated string is too long");
11111 return NULL;
11112 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011113 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011114
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011115 u = (PyUnicodeObject *)PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011116 if (!u)
11117 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011118 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011119
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011120 if (PyUnicode_GET_LENGTH(str) == 1) {
11121 const int kind = PyUnicode_KIND(str);
11122 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
11123 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011124 if (kind == PyUnicode_1BYTE_KIND)
11125 memset(to, (unsigned char)fill_char, len);
11126 else {
11127 for (n = 0; n < len; ++n)
11128 PyUnicode_WRITE(kind, to, n, fill_char);
11129 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011130 }
11131 else {
11132 /* number of characters copied this far */
11133 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
11134 const Py_ssize_t char_size = PyUnicode_CHARACTER_SIZE(str);
11135 char *to = (char *) PyUnicode_DATA(u);
11136 Py_MEMCPY(to, PyUnicode_DATA(str),
11137 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011138 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011139 n = (done <= nchars-done) ? done : nchars-done;
11140 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011141 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011142 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011143 }
11144
11145 return (PyObject*) u;
11146}
11147
Alexander Belopolsky40018472011-02-26 01:02:56 +000011148PyObject *
11149PyUnicode_Replace(PyObject *obj,
11150 PyObject *subobj,
11151 PyObject *replobj,
11152 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011153{
11154 PyObject *self;
11155 PyObject *str1;
11156 PyObject *str2;
11157 PyObject *result;
11158
11159 self = PyUnicode_FromObject(obj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011160 if (self == NULL || PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011161 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011162 str1 = PyUnicode_FromObject(subobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011163 if (str1 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011164 Py_DECREF(self);
11165 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011166 }
11167 str2 = PyUnicode_FromObject(replobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011168 if (str2 == NULL || PyUnicode_READY(str2)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011169 Py_DECREF(self);
11170 Py_DECREF(str1);
11171 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011172 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011173 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011174 Py_DECREF(self);
11175 Py_DECREF(str1);
11176 Py_DECREF(str2);
11177 return result;
11178}
11179
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011180PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011181 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011182\n\
11183Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011184old replaced by new. If the optional argument count is\n\
11185given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011186
11187static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011188unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011189{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011190 PyObject *str1;
11191 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011192 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011193 PyObject *result;
11194
Martin v. Löwis18e16552006-02-15 17:27:45 +000011195 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011196 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011197 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011198 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011199 str1 = PyUnicode_FromObject(str1);
11200 if (str1 == NULL || PyUnicode_READY(str1) == -1)
11201 return NULL;
11202 str2 = PyUnicode_FromObject(str2);
Victor Stinnere9a29352011-10-01 02:14:59 +020011203 if (str2 == NULL || PyUnicode_READY(str2) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011204 Py_DECREF(str1);
11205 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011206 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011207
11208 result = replace(self, str1, str2, maxcount);
11209
11210 Py_DECREF(str1);
11211 Py_DECREF(str2);
11212 return result;
11213}
11214
Alexander Belopolsky40018472011-02-26 01:02:56 +000011215static PyObject *
11216unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011217{
Walter Dörwald79e913e2007-05-12 11:08:06 +000011218 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011219 Py_ssize_t isize;
11220 Py_ssize_t osize, squote, dquote, i, o;
11221 Py_UCS4 max, quote;
11222 int ikind, okind;
11223 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000011224
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011225 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000011226 return NULL;
11227
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011228 isize = PyUnicode_GET_LENGTH(unicode);
11229 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011230
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011231 /* Compute length of output, quote characters, and
11232 maximum character */
11233 osize = 2; /* quotes */
11234 max = 127;
11235 squote = dquote = 0;
11236 ikind = PyUnicode_KIND(unicode);
11237 for (i = 0; i < isize; i++) {
11238 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11239 switch (ch) {
11240 case '\'': squote++; osize++; break;
11241 case '"': dquote++; osize++; break;
11242 case '\\': case '\t': case '\r': case '\n':
11243 osize += 2; break;
11244 default:
11245 /* Fast-path ASCII */
11246 if (ch < ' ' || ch == 0x7f)
11247 osize += 4; /* \xHH */
11248 else if (ch < 0x7f)
11249 osize++;
11250 else if (Py_UNICODE_ISPRINTABLE(ch)) {
11251 osize++;
11252 max = ch > max ? ch : max;
11253 }
11254 else if (ch < 0x100)
11255 osize += 4; /* \xHH */
11256 else if (ch < 0x10000)
11257 osize += 6; /* \uHHHH */
11258 else
11259 osize += 10; /* \uHHHHHHHH */
11260 }
11261 }
11262
11263 quote = '\'';
11264 if (squote) {
11265 if (dquote)
11266 /* Both squote and dquote present. Use squote,
11267 and escape them */
11268 osize += squote;
11269 else
11270 quote = '"';
11271 }
11272
11273 repr = PyUnicode_New(osize, max);
11274 if (repr == NULL)
11275 return NULL;
11276 okind = PyUnicode_KIND(repr);
11277 odata = PyUnicode_DATA(repr);
11278
11279 PyUnicode_WRITE(okind, odata, 0, quote);
11280 PyUnicode_WRITE(okind, odata, osize-1, quote);
11281
11282 for (i = 0, o = 1; i < isize; i++) {
11283 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011284
11285 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011286 if ((ch == quote) || (ch == '\\')) {
11287 PyUnicode_WRITE(okind, odata, o++, '\\');
11288 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011289 continue;
11290 }
11291
Benjamin Peterson29060642009-01-31 22:14:21 +000011292 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011293 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011294 PyUnicode_WRITE(okind, odata, o++, '\\');
11295 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011296 }
11297 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011298 PyUnicode_WRITE(okind, odata, o++, '\\');
11299 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011300 }
11301 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011302 PyUnicode_WRITE(okind, odata, o++, '\\');
11303 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011304 }
11305
11306 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011307 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011308 PyUnicode_WRITE(okind, odata, o++, '\\');
11309 PyUnicode_WRITE(okind, odata, o++, 'x');
11310 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
11311 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011312 }
11313
Georg Brandl559e5d72008-06-11 18:37:52 +000011314 /* Copy ASCII characters as-is */
11315 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011316 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011317 }
11318
Benjamin Peterson29060642009-01-31 22:14:21 +000011319 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000011320 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011321 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000011322 (categories Z* and C* except ASCII space)
11323 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011324 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011325 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011326 if (ch <= 0xff) {
11327 PyUnicode_WRITE(okind, odata, o++, '\\');
11328 PyUnicode_WRITE(okind, odata, o++, 'x');
11329 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
11330 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011331 }
11332 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011333 else if (ch >= 0x10000) {
11334 PyUnicode_WRITE(okind, odata, o++, '\\');
11335 PyUnicode_WRITE(okind, odata, o++, 'U');
11336 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 28) & 0xF]);
11337 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 24) & 0xF]);
11338 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 20) & 0xF]);
11339 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 16) & 0xF]);
11340 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
11341 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
11342 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
11343 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011344 }
11345 /* Map 16-bit characters to '\uxxxx' */
11346 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011347 PyUnicode_WRITE(okind, odata, o++, '\\');
11348 PyUnicode_WRITE(okind, odata, o++, 'u');
11349 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
11350 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
11351 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
11352 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011353 }
11354 }
11355 /* Copy characters as-is */
11356 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011357 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011358 }
11359 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000011360 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011361 /* Closing quote already added at the beginning */
Walter Dörwald79e913e2007-05-12 11:08:06 +000011362 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011363}
11364
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011365PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011366 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011367\n\
11368Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011369such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011370arguments start and end are interpreted as in slice notation.\n\
11371\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011372Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011373
11374static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011375unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011376{
Jesus Ceaac451502011-04-20 17:09:23 +020011377 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011378 Py_ssize_t start;
11379 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011380 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011381
Jesus Ceaac451502011-04-20 17:09:23 +020011382 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
11383 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011384 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011385
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011386 if (PyUnicode_READY(self) == -1)
11387 return NULL;
11388 if (PyUnicode_READY(substring) == -1)
11389 return NULL;
11390
11391 result = any_find_slice(
11392 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
11393 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011394 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011395
11396 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011397
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011398 if (result == -2)
11399 return NULL;
11400
Christian Heimes217cfd12007-12-02 14:31:20 +000011401 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011402}
11403
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011404PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011405 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011406\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011407Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011408
11409static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011410unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011411{
Jesus Ceaac451502011-04-20 17:09:23 +020011412 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011413 Py_ssize_t start;
11414 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011415 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011416
Jesus Ceaac451502011-04-20 17:09:23 +020011417 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
11418 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011419 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011420
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011421 if (PyUnicode_READY(self) == -1)
11422 return NULL;
11423 if (PyUnicode_READY(substring) == -1)
11424 return NULL;
11425
11426 result = any_find_slice(
11427 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
11428 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011429 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011430
11431 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011432
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011433 if (result == -2)
11434 return NULL;
11435
Guido van Rossumd57fd912000-03-10 22:53:23 +000011436 if (result < 0) {
11437 PyErr_SetString(PyExc_ValueError, "substring not found");
11438 return NULL;
11439 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011440
Christian Heimes217cfd12007-12-02 14:31:20 +000011441 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011442}
11443
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011444PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011445 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011446\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011447Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011448done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011449
11450static PyObject *
11451unicode_rjust(PyUnicodeObject *self, PyObject *args)
11452{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011453 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011454 Py_UCS4 fillchar = ' ';
11455
Victor Stinnere9a29352011-10-01 02:14:59 +020011456 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011457 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011458
Victor Stinnere9a29352011-10-01 02:14:59 +020011459 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011460 return NULL;
11461
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011462 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011463 Py_INCREF(self);
11464 return (PyObject*) self;
11465 }
11466
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011467 return (PyObject*) pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011468}
11469
Alexander Belopolsky40018472011-02-26 01:02:56 +000011470PyObject *
11471PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011472{
11473 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000011474
Guido van Rossumd57fd912000-03-10 22:53:23 +000011475 s = PyUnicode_FromObject(s);
11476 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011477 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011478 if (sep != NULL) {
11479 sep = PyUnicode_FromObject(sep);
11480 if (sep == NULL) {
11481 Py_DECREF(s);
11482 return NULL;
11483 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011484 }
11485
11486 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
11487
11488 Py_DECREF(s);
11489 Py_XDECREF(sep);
11490 return result;
11491}
11492
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011493PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011494 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011495\n\
11496Return a list of the words in S, using sep as the\n\
11497delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000011498splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000011499whitespace string is a separator and empty strings are\n\
11500removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011501
11502static PyObject*
11503unicode_split(PyUnicodeObject *self, PyObject *args)
11504{
11505 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011506 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011507
Martin v. Löwis18e16552006-02-15 17:27:45 +000011508 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011509 return NULL;
11510
11511 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011512 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011513 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000011514 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011515 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011516 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011517}
11518
Thomas Wouters477c8d52006-05-27 19:21:47 +000011519PyObject *
11520PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
11521{
11522 PyObject* str_obj;
11523 PyObject* sep_obj;
11524 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011525 int kind1, kind2, kind;
11526 void *buf1 = NULL, *buf2 = NULL;
11527 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011528
11529 str_obj = PyUnicode_FromObject(str_in);
Victor Stinnere9a29352011-10-01 02:14:59 +020011530 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011531 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011532 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011533 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000011534 Py_DECREF(str_obj);
11535 return NULL;
11536 }
11537
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011538 kind1 = PyUnicode_KIND(str_in);
11539 kind2 = PyUnicode_KIND(sep_obj);
11540 kind = kind1 > kind2 ? kind1 : kind2;
11541 buf1 = PyUnicode_DATA(str_in);
11542 if (kind1 != kind)
11543 buf1 = _PyUnicode_AsKind(str_in, kind);
11544 if (!buf1)
11545 goto onError;
11546 buf2 = PyUnicode_DATA(sep_obj);
11547 if (kind2 != kind)
11548 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11549 if (!buf2)
11550 goto onError;
11551 len1 = PyUnicode_GET_LENGTH(str_obj);
11552 len2 = PyUnicode_GET_LENGTH(sep_obj);
11553
11554 switch(PyUnicode_KIND(str_in)) {
11555 case PyUnicode_1BYTE_KIND:
11556 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11557 break;
11558 case PyUnicode_2BYTE_KIND:
11559 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11560 break;
11561 case PyUnicode_4BYTE_KIND:
11562 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11563 break;
11564 default:
11565 assert(0);
11566 out = 0;
11567 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011568
11569 Py_DECREF(sep_obj);
11570 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011571 if (kind1 != kind)
11572 PyMem_Free(buf1);
11573 if (kind2 != kind)
11574 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011575
11576 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011577 onError:
11578 Py_DECREF(sep_obj);
11579 Py_DECREF(str_obj);
11580 if (kind1 != kind && buf1)
11581 PyMem_Free(buf1);
11582 if (kind2 != kind && buf2)
11583 PyMem_Free(buf2);
11584 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011585}
11586
11587
11588PyObject *
11589PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
11590{
11591 PyObject* str_obj;
11592 PyObject* sep_obj;
11593 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011594 int kind1, kind2, kind;
11595 void *buf1 = NULL, *buf2 = NULL;
11596 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011597
11598 str_obj = PyUnicode_FromObject(str_in);
11599 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000011600 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011601 sep_obj = PyUnicode_FromObject(sep_in);
11602 if (!sep_obj) {
11603 Py_DECREF(str_obj);
11604 return NULL;
11605 }
11606
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011607 kind1 = PyUnicode_KIND(str_in);
11608 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020011609 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011610 buf1 = PyUnicode_DATA(str_in);
11611 if (kind1 != kind)
11612 buf1 = _PyUnicode_AsKind(str_in, kind);
11613 if (!buf1)
11614 goto onError;
11615 buf2 = PyUnicode_DATA(sep_obj);
11616 if (kind2 != kind)
11617 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11618 if (!buf2)
11619 goto onError;
11620 len1 = PyUnicode_GET_LENGTH(str_obj);
11621 len2 = PyUnicode_GET_LENGTH(sep_obj);
11622
11623 switch(PyUnicode_KIND(str_in)) {
11624 case PyUnicode_1BYTE_KIND:
11625 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11626 break;
11627 case PyUnicode_2BYTE_KIND:
11628 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11629 break;
11630 case PyUnicode_4BYTE_KIND:
11631 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11632 break;
11633 default:
11634 assert(0);
11635 out = 0;
11636 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011637
11638 Py_DECREF(sep_obj);
11639 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011640 if (kind1 != kind)
11641 PyMem_Free(buf1);
11642 if (kind2 != kind)
11643 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011644
11645 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011646 onError:
11647 Py_DECREF(sep_obj);
11648 Py_DECREF(str_obj);
11649 if (kind1 != kind && buf1)
11650 PyMem_Free(buf1);
11651 if (kind2 != kind && buf2)
11652 PyMem_Free(buf2);
11653 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011654}
11655
11656PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011657 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011658\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011659Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011660the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011661found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011662
11663static PyObject*
11664unicode_partition(PyUnicodeObject *self, PyObject *separator)
11665{
11666 return PyUnicode_Partition((PyObject *)self, separator);
11667}
11668
11669PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000011670 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011671\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011672Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011673the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011674separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011675
11676static PyObject*
11677unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
11678{
11679 return PyUnicode_RPartition((PyObject *)self, separator);
11680}
11681
Alexander Belopolsky40018472011-02-26 01:02:56 +000011682PyObject *
11683PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011684{
11685 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011686
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011687 s = PyUnicode_FromObject(s);
11688 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011689 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011690 if (sep != NULL) {
11691 sep = PyUnicode_FromObject(sep);
11692 if (sep == NULL) {
11693 Py_DECREF(s);
11694 return NULL;
11695 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011696 }
11697
11698 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
11699
11700 Py_DECREF(s);
11701 Py_XDECREF(sep);
11702 return result;
11703}
11704
11705PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011706 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011707\n\
11708Return a list of the words in S, using sep as the\n\
11709delimiter string, starting at the end of the string and\n\
11710working to the front. If maxsplit is given, at most maxsplit\n\
11711splits are done. If sep is not specified, any whitespace string\n\
11712is a separator.");
11713
11714static PyObject*
11715unicode_rsplit(PyUnicodeObject *self, PyObject *args)
11716{
11717 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011718 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011719
Martin v. Löwis18e16552006-02-15 17:27:45 +000011720 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011721 return NULL;
11722
11723 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011724 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011725 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000011726 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011727 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011728 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011729}
11730
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011731PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011732 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011733\n\
11734Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000011735Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011736is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011737
11738static PyObject*
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011739unicode_splitlines(PyUnicodeObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011740{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011741 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000011742 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011743
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011744 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
11745 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011746 return NULL;
11747
Guido van Rossum86662912000-04-11 15:38:46 +000011748 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011749}
11750
11751static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000011752PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011753{
Walter Dörwald346737f2007-05-31 10:44:43 +000011754 if (PyUnicode_CheckExact(self)) {
11755 Py_INCREF(self);
11756 return self;
11757 } else
11758 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinner034f6cf2011-09-30 02:26:44 +020011759 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011760}
11761
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011762PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011763 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011764\n\
11765Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011766and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011767
11768static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011769unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011770{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011771 return fixup(self, fixswapcase);
11772}
11773
Georg Brandlceee0772007-11-27 23:48:05 +000011774PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011775 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011776\n\
11777Return a translation table usable for str.translate().\n\
11778If there is only one argument, it must be a dictionary mapping Unicode\n\
11779ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011780Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011781If there are two arguments, they must be strings of equal length, and\n\
11782in the resulting dictionary, each character in x will be mapped to the\n\
11783character at the same position in y. If there is a third argument, it\n\
11784must be a string, whose characters will be mapped to None in the result.");
11785
11786static PyObject*
11787unicode_maketrans(PyUnicodeObject *null, PyObject *args)
11788{
11789 PyObject *x, *y = NULL, *z = NULL;
11790 PyObject *new = NULL, *key, *value;
11791 Py_ssize_t i = 0;
11792 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011793
Georg Brandlceee0772007-11-27 23:48:05 +000011794 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
11795 return NULL;
11796 new = PyDict_New();
11797 if (!new)
11798 return NULL;
11799 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011800 int x_kind, y_kind, z_kind;
11801 void *x_data, *y_data, *z_data;
11802
Georg Brandlceee0772007-11-27 23:48:05 +000011803 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000011804 if (!PyUnicode_Check(x)) {
11805 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
11806 "be a string if there is a second argument");
11807 goto err;
11808 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011809 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011810 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
11811 "arguments must have equal length");
11812 goto err;
11813 }
11814 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011815 x_kind = PyUnicode_KIND(x);
11816 y_kind = PyUnicode_KIND(y);
11817 x_data = PyUnicode_DATA(x);
11818 y_data = PyUnicode_DATA(y);
11819 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
11820 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
11821 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011822 if (!key || !value)
11823 goto err;
11824 res = PyDict_SetItem(new, key, value);
11825 Py_DECREF(key);
11826 Py_DECREF(value);
11827 if (res < 0)
11828 goto err;
11829 }
11830 /* create entries for deleting chars in z */
11831 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011832 z_kind = PyUnicode_KIND(z);
11833 z_data = PyUnicode_DATA(z);
Georg Brandlceee0772007-11-27 23:48:05 +000011834 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011835 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011836 if (!key)
11837 goto err;
11838 res = PyDict_SetItem(new, key, Py_None);
11839 Py_DECREF(key);
11840 if (res < 0)
11841 goto err;
11842 }
11843 }
11844 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011845 int kind;
11846 void *data;
11847
Georg Brandlceee0772007-11-27 23:48:05 +000011848 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000011849 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011850 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
11851 "to maketrans it must be a dict");
11852 goto err;
11853 }
11854 /* copy entries into the new dict, converting string keys to int keys */
11855 while (PyDict_Next(x, &i, &key, &value)) {
11856 if (PyUnicode_Check(key)) {
11857 /* convert string keys to integer keys */
11858 PyObject *newkey;
11859 if (PyUnicode_GET_SIZE(key) != 1) {
11860 PyErr_SetString(PyExc_ValueError, "string keys in translate "
11861 "table must be of length 1");
11862 goto err;
11863 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011864 kind = PyUnicode_KIND(key);
11865 data = PyUnicode_DATA(key);
11866 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000011867 if (!newkey)
11868 goto err;
11869 res = PyDict_SetItem(new, newkey, value);
11870 Py_DECREF(newkey);
11871 if (res < 0)
11872 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000011873 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011874 /* just keep integer keys */
11875 if (PyDict_SetItem(new, key, value) < 0)
11876 goto err;
11877 } else {
11878 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
11879 "be strings or integers");
11880 goto err;
11881 }
11882 }
11883 }
11884 return new;
11885 err:
11886 Py_DECREF(new);
11887 return NULL;
11888}
11889
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011890PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011891 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011892\n\
11893Return a copy of the string S, where all characters have been mapped\n\
11894through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011895Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000011896Unmapped characters are left untouched. Characters mapped to None\n\
11897are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011898
11899static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011900unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011901{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011902 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011903}
11904
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011905PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011906 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011907\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011908Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011909
11910static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011911unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011912{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011913 return fixup(self, fixupper);
11914}
11915
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011916PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011917 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011918\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000011919Pad a numeric string S with zeros on the left, to fill a field\n\
11920of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011921
11922static PyObject *
11923unicode_zfill(PyUnicodeObject *self, PyObject *args)
11924{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011925 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011926 PyUnicodeObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011927 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011928 int kind;
11929 void *data;
11930 Py_UCS4 chr;
11931
11932 if (PyUnicode_READY(self) == -1)
11933 return NULL;
11934
Martin v. Löwis18e16552006-02-15 17:27:45 +000011935 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011936 return NULL;
11937
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011938 if (PyUnicode_GET_LENGTH(self) >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +000011939 if (PyUnicode_CheckExact(self)) {
11940 Py_INCREF(self);
11941 return (PyObject*) self;
11942 }
11943 else
Victor Stinner2219e0a2011-10-01 01:16:59 +020011944 return PyUnicode_Copy((PyObject*)self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011945 }
11946
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011947 fill = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011948
11949 u = pad(self, fill, 0, '0');
11950
Walter Dörwald068325e2002-04-15 13:36:47 +000011951 if (u == NULL)
11952 return NULL;
11953
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011954 kind = PyUnicode_KIND(u);
11955 data = PyUnicode_DATA(u);
11956 chr = PyUnicode_READ(kind, data, fill);
11957
11958 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011959 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011960 PyUnicode_WRITE(kind, data, 0, chr);
11961 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000011962 }
11963
11964 return (PyObject*) u;
11965}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011966
#if 0
/* Compiled out: wrapper around
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *ascii = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return ascii;
}
#endif
11974
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011975PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011976 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011977\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000011978Return True if S starts with the specified prefix, False otherwise.\n\
11979With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011980With optional end, stop comparing S at that position.\n\
11981prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011982
11983static PyObject *
11984unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000011985 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011986{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011987 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011988 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011989 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011990 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011991 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011992
Jesus Ceaac451502011-04-20 17:09:23 +020011993 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011994 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011995 if (PyTuple_Check(subobj)) {
11996 Py_ssize_t i;
11997 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
11998 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000011999 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012000 if (substring == NULL)
12001 return NULL;
12002 result = tailmatch(self, substring, start, end, -1);
12003 Py_DECREF(substring);
12004 if (result) {
12005 Py_RETURN_TRUE;
12006 }
12007 }
12008 /* nothing matched */
12009 Py_RETURN_FALSE;
12010 }
12011 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012012 if (substring == NULL) {
12013 if (PyErr_ExceptionMatches(PyExc_TypeError))
12014 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12015 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012016 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012017 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012018 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012019 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012020 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012021}
12022
12023
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012024PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012025 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012026\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012027Return True if S ends with the specified suffix, False otherwise.\n\
12028With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012029With optional end, stop comparing S at that position.\n\
12030suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012031
12032static PyObject *
12033unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012034 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012035{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012036 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012037 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012038 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012039 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012040 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012041
Jesus Ceaac451502011-04-20 17:09:23 +020012042 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012043 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012044 if (PyTuple_Check(subobj)) {
12045 Py_ssize_t i;
12046 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12047 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012048 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012049 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012050 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012051 result = tailmatch(self, substring, start, end, +1);
12052 Py_DECREF(substring);
12053 if (result) {
12054 Py_RETURN_TRUE;
12055 }
12056 }
12057 Py_RETURN_FALSE;
12058 }
12059 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012060 if (substring == NULL) {
12061 if (PyErr_ExceptionMatches(PyExc_TypeError))
12062 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12063 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012064 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012065 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012066 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012067 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012068 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012069}
12070
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012071#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000012072
12073PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012074 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012075\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012076Return a formatted version of S, using substitutions from args and kwargs.\n\
12077The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000012078
Eric Smith27bbca62010-11-04 17:06:58 +000012079PyDoc_STRVAR(format_map__doc__,
12080 "S.format_map(mapping) -> str\n\
12081\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012082Return a formatted version of S, using substitutions from mapping.\n\
12083The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000012084
Eric Smith4a7d76d2008-05-30 18:10:19 +000012085static PyObject *
12086unicode__format__(PyObject* self, PyObject* args)
12087{
12088 PyObject *format_spec;
12089
12090 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12091 return NULL;
12092
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012093 return _PyUnicode_FormatAdvanced(self, format_spec, 0,
12094 PyUnicode_GET_LENGTH(format_spec));
Eric Smith4a7d76d2008-05-30 18:10:19 +000012095}
12096
Eric Smith8c663262007-08-25 02:26:07 +000012097PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012098 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012099\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012100Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000012101
12102static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012103unicode__sizeof__(PyUnicodeObject *v)
12104{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012105 Py_ssize_t size;
12106
12107 /* If it's a compact object, account for base structure +
12108 character data. */
12109 if (PyUnicode_IS_COMPACT_ASCII(v))
12110 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12111 else if (PyUnicode_IS_COMPACT(v))
12112 size = sizeof(PyCompactUnicodeObject) +
12113 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_CHARACTER_SIZE(v);
12114 else {
12115 /* If it is a two-block object, account for base object, and
12116 for character block if present. */
12117 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020012118 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012119 size += (PyUnicode_GET_LENGTH(v) + 1) *
12120 PyUnicode_CHARACTER_SIZE(v);
12121 }
12122 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020012123 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020012124 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012125 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020012126 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020012127 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012128
12129 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012130}
12131
12132PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012133 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012134
12135static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020012136unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012137{
Victor Stinner034f6cf2011-09-30 02:26:44 +020012138 PyObject *copy = PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012139 if (!copy)
12140 return NULL;
12141 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012142}
12143
Guido van Rossumd57fd912000-03-10 22:53:23 +000012144static PyMethodDef unicode_methods[] = {
12145
12146 /* Order is according to common usage: often used methods should
12147 appear first, since lookup is done sequentially. */
12148
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000012149 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012150 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
12151 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012152 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012153 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
12154 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
12155 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
12156 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
12157 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
12158 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
12159 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012160 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012161 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
12162 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
12163 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012164 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012165 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
12166 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
12167 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012168 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012169 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012170 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012171 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012172 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
12173 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
12174 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
12175 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
12176 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
12177 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
12178 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
12179 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
12180 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
12181 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
12182 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
12183 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
12184 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
12185 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000012186 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000012187 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012188 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000012189 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000012190 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000012191 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000012192 {"maketrans", (PyCFunction) unicode_maketrans,
12193 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012194 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000012195#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012196 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012197#endif
12198
12199#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012200 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012201 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012202#endif
12203
Benjamin Peterson14339b62009-01-31 16:36:08 +000012204 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012205 {NULL, NULL}
12206};
12207
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012208static PyObject *
12209unicode_mod(PyObject *v, PyObject *w)
12210{
Brian Curtindfc80e32011-08-10 20:28:54 -050012211 if (!PyUnicode_Check(v))
12212 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000012213 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012214}
12215
12216static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012217 0, /*nb_add*/
12218 0, /*nb_subtract*/
12219 0, /*nb_multiply*/
12220 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012221};
12222
Guido van Rossumd57fd912000-03-10 22:53:23 +000012223static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012224 (lenfunc) unicode_length, /* sq_length */
12225 PyUnicode_Concat, /* sq_concat */
12226 (ssizeargfunc) unicode_repeat, /* sq_repeat */
12227 (ssizeargfunc) unicode_getitem, /* sq_item */
12228 0, /* sq_slice */
12229 0, /* sq_ass_item */
12230 0, /* sq_ass_slice */
12231 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012232};
12233
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012234static PyObject*
12235unicode_subscript(PyUnicodeObject* self, PyObject* item)
12236{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012237 if (PyUnicode_READY(self) == -1)
12238 return NULL;
12239
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000012240 if (PyIndex_Check(item)) {
12241 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012242 if (i == -1 && PyErr_Occurred())
12243 return NULL;
12244 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012245 i += PyUnicode_GET_LENGTH(self);
Victor Stinner2fe5ced2011-10-02 00:25:40 +020012246 return unicode_getitem((PyObject*)self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012247 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000012248 Py_ssize_t start, stop, step, slicelength, cur, i;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012249 const Py_UNICODE* source_buf;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012250 Py_UNICODE* result_buf;
12251 PyObject* result;
12252
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012253 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000012254 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012255 return NULL;
12256 }
12257
12258 if (slicelength <= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012259 return PyUnicode_New(0, 0);
12260 } else if (start == 0 && step == 1 &&
12261 slicelength == PyUnicode_GET_LENGTH(self) &&
Thomas Woutersed03b412007-08-28 21:37:11 +000012262 PyUnicode_CheckExact(self)) {
12263 Py_INCREF(self);
12264 return (PyObject *)self;
12265 } else if (step == 1) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012266 return PyUnicode_Substring((PyObject*)self,
12267 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012268 } else {
12269 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +000012270 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
12271 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012272
Benjamin Peterson29060642009-01-31 22:14:21 +000012273 if (result_buf == NULL)
12274 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012275
12276 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
12277 result_buf[i] = source_buf[cur];
12278 }
Tim Petersced69f82003-09-16 20:30:58 +000012279
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012280 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +000012281 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012282 return result;
12283 }
12284 } else {
12285 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
12286 return NULL;
12287 }
12288}
12289
12290static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012291 (lenfunc)unicode_length, /* mp_length */
12292 (binaryfunc)unicode_subscript, /* mp_subscript */
12293 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012294};
12295
Guido van Rossumd57fd912000-03-10 22:53:23 +000012296
Guido van Rossumd57fd912000-03-10 22:53:23 +000012297/* Helpers for PyUnicode_Format() */
12298
12299static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000012300getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012301{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012302 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012303 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012304 (*p_argidx)++;
12305 if (arglen < 0)
12306 return args;
12307 else
12308 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012309 }
12310 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012311 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012312 return NULL;
12313}
12314
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012315/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012316
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012317static PyObject *
12318formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012319{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012320 char *p;
12321 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012322 double x;
Tim Petersced69f82003-09-16 20:30:58 +000012323
Guido van Rossumd57fd912000-03-10 22:53:23 +000012324 x = PyFloat_AsDouble(v);
12325 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012326 return NULL;
12327
Guido van Rossumd57fd912000-03-10 22:53:23 +000012328 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012329 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000012330
Eric Smith0923d1d2009-04-16 20:16:10 +000012331 p = PyOS_double_to_string(x, type, prec,
12332 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012333 if (p == NULL)
12334 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012335 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000012336 PyMem_Free(p);
12337 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012338}
12339
Tim Peters38fd5b62000-09-21 05:43:11 +000012340static PyObject*
12341formatlong(PyObject *val, int flags, int prec, int type)
12342{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012343 char *buf;
12344 int len;
12345 PyObject *str; /* temporary string object. */
12346 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000012347
Benjamin Peterson14339b62009-01-31 16:36:08 +000012348 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
12349 if (!str)
12350 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012351 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012352 Py_DECREF(str);
12353 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000012354}
12355
Guido van Rossumd57fd912000-03-10 22:53:23 +000012356static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012357formatchar(Py_UCS4 *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012358 size_t buflen,
12359 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012360{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000012361 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012362 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012363 if (PyUnicode_GET_LENGTH(v) == 1) {
12364 buf[0] = PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000012365 buf[1] = '\0';
12366 return 1;
12367 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012368 goto onError;
12369 }
12370 else {
12371 /* Integer input truncated to a character */
12372 long x;
12373 x = PyLong_AsLong(v);
12374 if (x == -1 && PyErr_Occurred())
12375 goto onError;
12376
12377 if (x < 0 || x > 0x10ffff) {
12378 PyErr_SetString(PyExc_OverflowError,
12379 "%c arg not in range(0x110000)");
12380 return -1;
12381 }
12382
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012383 buf[0] = (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012384 buf[1] = '\0';
12385 return 1;
12386 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000012387
Benjamin Peterson29060642009-01-31 22:14:21 +000012388 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012389 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012390 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012391 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012392}
12393
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012394/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012395 FORMATBUFLEN is the length of the buffer in which chars are formatted.
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012396*/
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012397#define FORMATBUFLEN (size_t)10
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012398
Alexander Belopolsky40018472011-02-26 01:02:56 +000012399PyObject *
12400PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012401{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012402 void *fmt;
12403 int fmtkind;
12404 PyObject *result;
12405 Py_UCS4 *res, *res0;
12406 Py_UCS4 max;
12407 int kind;
12408 Py_ssize_t fmtcnt, fmtpos, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012409 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012410 PyObject *dict = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012411 PyUnicodeObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +000012412
Guido van Rossumd57fd912000-03-10 22:53:23 +000012413 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012414 PyErr_BadInternalCall();
12415 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012416 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012417 uformat = (PyUnicodeObject*)PyUnicode_FromObject(format);
12418 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012419 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012420 fmt = PyUnicode_DATA(uformat);
12421 fmtkind = PyUnicode_KIND(uformat);
12422 fmtcnt = PyUnicode_GET_LENGTH(uformat);
12423 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012424
12425 reslen = rescnt = fmtcnt + 100;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012426 res = res0 = PyMem_Malloc(reslen * sizeof(Py_UCS4));
12427 if (res0 == NULL) {
12428 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012429 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012430 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012431
12432 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012433 arglen = PyTuple_Size(args);
12434 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012435 }
12436 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012437 arglen = -1;
12438 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012439 }
Christian Heimes90aa7642007-12-19 02:45:37 +000012440 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000012441 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000012442 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012443
12444 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012445 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012446 if (--rescnt < 0) {
12447 rescnt = fmtcnt + 100;
12448 reslen += rescnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012449 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12450 if (res0 == NULL){
12451 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012452 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012453 }
12454 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000012455 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012456 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012457 *res++ = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012458 }
12459 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012460 /* Got a format specifier */
12461 int flags = 0;
12462 Py_ssize_t width = -1;
12463 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012464 Py_UCS4 c = '\0';
12465 Py_UCS4 fill;
Benjamin Peterson29060642009-01-31 22:14:21 +000012466 int isnumok;
12467 PyObject *v = NULL;
12468 PyObject *temp = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012469 void *pbuf;
12470 Py_ssize_t pindex;
Benjamin Peterson29060642009-01-31 22:14:21 +000012471 Py_UNICODE sign;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012472 Py_ssize_t len, len1;
12473 Py_UCS4 formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012474
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012475 fmtpos++;
12476 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
12477 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000012478 Py_ssize_t keylen;
12479 PyObject *key;
12480 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000012481
Benjamin Peterson29060642009-01-31 22:14:21 +000012482 if (dict == NULL) {
12483 PyErr_SetString(PyExc_TypeError,
12484 "format requires a mapping");
12485 goto onError;
12486 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012487 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012488 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012489 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012490 /* Skip over balanced parentheses */
12491 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012492 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000012493 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012494 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000012495 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012496 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000012497 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012498 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000012499 if (fmtcnt < 0 || pcount > 0) {
12500 PyErr_SetString(PyExc_ValueError,
12501 "incomplete format key");
12502 goto onError;
12503 }
Victor Stinner12bab6d2011-10-01 01:53:49 +020012504 key = PyUnicode_Substring((PyObject*)uformat,
12505 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000012506 if (key == NULL)
12507 goto onError;
12508 if (args_owned) {
12509 Py_DECREF(args);
12510 args_owned = 0;
12511 }
12512 args = PyObject_GetItem(dict, key);
12513 Py_DECREF(key);
12514 if (args == NULL) {
12515 goto onError;
12516 }
12517 args_owned = 1;
12518 arglen = -1;
12519 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012520 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012521 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012522 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012523 case '-': flags |= F_LJUST; continue;
12524 case '+': flags |= F_SIGN; continue;
12525 case ' ': flags |= F_BLANK; continue;
12526 case '#': flags |= F_ALT; continue;
12527 case '0': flags |= F_ZERO; continue;
12528 }
12529 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012530 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012531 if (c == '*') {
12532 v = getnextarg(args, arglen, &argidx);
12533 if (v == NULL)
12534 goto onError;
12535 if (!PyLong_Check(v)) {
12536 PyErr_SetString(PyExc_TypeError,
12537 "* wants int");
12538 goto onError;
12539 }
12540 width = PyLong_AsLong(v);
12541 if (width == -1 && PyErr_Occurred())
12542 goto onError;
12543 if (width < 0) {
12544 flags |= F_LJUST;
12545 width = -width;
12546 }
12547 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012548 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012549 }
12550 else if (c >= '0' && c <= '9') {
12551 width = c - '0';
12552 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012553 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012554 if (c < '0' || c > '9')
12555 break;
12556 if ((width*10) / 10 != width) {
12557 PyErr_SetString(PyExc_ValueError,
12558 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000012559 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000012560 }
12561 width = width*10 + (c - '0');
12562 }
12563 }
12564 if (c == '.') {
12565 prec = 0;
12566 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012567 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012568 if (c == '*') {
12569 v = getnextarg(args, arglen, &argidx);
12570 if (v == NULL)
12571 goto onError;
12572 if (!PyLong_Check(v)) {
12573 PyErr_SetString(PyExc_TypeError,
12574 "* wants int");
12575 goto onError;
12576 }
12577 prec = PyLong_AsLong(v);
12578 if (prec == -1 && PyErr_Occurred())
12579 goto onError;
12580 if (prec < 0)
12581 prec = 0;
12582 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012583 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012584 }
12585 else if (c >= '0' && c <= '9') {
12586 prec = c - '0';
12587 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012588 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012589 if (c < '0' || c > '9')
12590 break;
12591 if ((prec*10) / 10 != prec) {
12592 PyErr_SetString(PyExc_ValueError,
12593 "prec too big");
12594 goto onError;
12595 }
12596 prec = prec*10 + (c - '0');
12597 }
12598 }
12599 } /* prec */
12600 if (fmtcnt >= 0) {
12601 if (c == 'h' || c == 'l' || c == 'L') {
12602 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012603 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012604 }
12605 }
12606 if (fmtcnt < 0) {
12607 PyErr_SetString(PyExc_ValueError,
12608 "incomplete format");
12609 goto onError;
12610 }
12611 if (c != '%') {
12612 v = getnextarg(args, arglen, &argidx);
12613 if (v == NULL)
12614 goto onError;
12615 }
12616 sign = 0;
12617 fill = ' ';
12618 switch (c) {
12619
12620 case '%':
12621 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012622 kind = PyUnicode_4BYTE_KIND;
Benjamin Peterson29060642009-01-31 22:14:21 +000012623 /* presume that buffer length is at least 1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012624 PyUnicode_WRITE(kind, pbuf, 0, '%');
Benjamin Peterson29060642009-01-31 22:14:21 +000012625 len = 1;
12626 break;
12627
12628 case 's':
12629 case 'r':
12630 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000012631 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012632 temp = v;
12633 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012634 }
12635 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012636 if (c == 's')
12637 temp = PyObject_Str(v);
12638 else if (c == 'r')
12639 temp = PyObject_Repr(v);
12640 else
12641 temp = PyObject_ASCII(v);
12642 if (temp == NULL)
12643 goto onError;
12644 if (PyUnicode_Check(temp))
12645 /* nothing to do */;
12646 else {
12647 Py_DECREF(temp);
12648 PyErr_SetString(PyExc_TypeError,
12649 "%s argument has non-string str()");
12650 goto onError;
12651 }
12652 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012653 if (PyUnicode_READY(temp) == -1) {
12654 Py_CLEAR(temp);
12655 goto onError;
12656 }
12657 pbuf = PyUnicode_DATA(temp);
12658 kind = PyUnicode_KIND(temp);
12659 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012660 if (prec >= 0 && len > prec)
12661 len = prec;
12662 break;
12663
12664 case 'i':
12665 case 'd':
12666 case 'u':
12667 case 'o':
12668 case 'x':
12669 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000012670 isnumok = 0;
12671 if (PyNumber_Check(v)) {
12672 PyObject *iobj=NULL;
12673
12674 if (PyLong_Check(v)) {
12675 iobj = v;
12676 Py_INCREF(iobj);
12677 }
12678 else {
12679 iobj = PyNumber_Long(v);
12680 }
12681 if (iobj!=NULL) {
12682 if (PyLong_Check(iobj)) {
12683 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070012684 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000012685 Py_DECREF(iobj);
12686 if (!temp)
12687 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012688 if (PyUnicode_READY(temp) == -1) {
12689 Py_CLEAR(temp);
12690 goto onError;
12691 }
12692 pbuf = PyUnicode_DATA(temp);
12693 kind = PyUnicode_KIND(temp);
12694 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012695 sign = 1;
12696 }
12697 else {
12698 Py_DECREF(iobj);
12699 }
12700 }
12701 }
12702 if (!isnumok) {
12703 PyErr_Format(PyExc_TypeError,
12704 "%%%c format: a number is required, "
12705 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
12706 goto onError;
12707 }
12708 if (flags & F_ZERO)
12709 fill = '0';
12710 break;
12711
12712 case 'e':
12713 case 'E':
12714 case 'f':
12715 case 'F':
12716 case 'g':
12717 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012718 temp = formatfloat(v, flags, prec, c);
12719 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000012720 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012721 if (PyUnicode_READY(temp) == -1) {
12722 Py_CLEAR(temp);
12723 goto onError;
12724 }
12725 pbuf = PyUnicode_DATA(temp);
12726 kind = PyUnicode_KIND(temp);
12727 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012728 sign = 1;
12729 if (flags & F_ZERO)
12730 fill = '0';
12731 break;
12732
12733 case 'c':
12734 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012735 kind = PyUnicode_4BYTE_KIND;
Victor Stinnerb9dcffb2011-09-29 00:39:24 +020012736 len = formatchar(pbuf, Py_ARRAY_LENGTH(formatbuf), v);
Benjamin Peterson29060642009-01-31 22:14:21 +000012737 if (len < 0)
12738 goto onError;
12739 break;
12740
12741 default:
12742 PyErr_Format(PyExc_ValueError,
12743 "unsupported format character '%c' (0x%x) "
12744 "at index %zd",
12745 (31<=c && c<=126) ? (char)c : '?',
12746 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012747 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000012748 goto onError;
12749 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012750 /* pbuf is initialized here. */
12751 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000012752 if (sign) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012753 if (PyUnicode_READ(kind, pbuf, pindex) == '-' ||
12754 PyUnicode_READ(kind, pbuf, pindex) == '+') {
12755 sign = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012756 len--;
12757 }
12758 else if (flags & F_SIGN)
12759 sign = '+';
12760 else if (flags & F_BLANK)
12761 sign = ' ';
12762 else
12763 sign = 0;
12764 }
12765 if (width < len)
12766 width = len;
12767 if (rescnt - (sign != 0) < width) {
12768 reslen -= rescnt;
12769 rescnt = width + fmtcnt + 100;
12770 reslen += rescnt;
12771 if (reslen < 0) {
12772 Py_XDECREF(temp);
12773 PyErr_NoMemory();
12774 goto onError;
12775 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012776 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12777 if (res0 == 0) {
12778 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012779 Py_XDECREF(temp);
12780 goto onError;
12781 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012782 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000012783 }
12784 if (sign) {
12785 if (fill != ' ')
12786 *res++ = sign;
12787 rescnt--;
12788 if (width > len)
12789 width--;
12790 }
12791 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012792 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12793 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000012794 if (fill != ' ') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012795 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12796 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012797 }
12798 rescnt -= 2;
12799 width -= 2;
12800 if (width < 0)
12801 width = 0;
12802 len -= 2;
12803 }
12804 if (width > len && !(flags & F_LJUST)) {
12805 do {
12806 --rescnt;
12807 *res++ = fill;
12808 } while (--width > len);
12809 }
12810 if (fill == ' ') {
12811 if (sign)
12812 *res++ = sign;
12813 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012814 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12815 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
12816 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12817 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012818 }
12819 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012820 /* Copy all characters, preserving len */
12821 len1 = len;
12822 while (len1--) {
12823 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12824 rescnt--;
12825 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012826 while (--width >= len) {
12827 --rescnt;
12828 *res++ = ' ';
12829 }
12830 if (dict && (argidx < arglen) && c != '%') {
12831 PyErr_SetString(PyExc_TypeError,
12832 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +000012833 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012834 goto onError;
12835 }
12836 Py_XDECREF(temp);
12837 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012838 } /* until end */
12839 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012840 PyErr_SetString(PyExc_TypeError,
12841 "not all arguments converted during string formatting");
12842 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012843 }
12844
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012845
12846 for (max=0, res = res0; res < res0+reslen-rescnt; res++)
12847 if (*res > max)
12848 max = *res;
12849 result = PyUnicode_New(reslen - rescnt, max);
12850 if (!result)
Benjamin Peterson29060642009-01-31 22:14:21 +000012851 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012852 kind = PyUnicode_KIND(result);
12853 for (res = res0; res < res0+reslen-rescnt; res++)
12854 PyUnicode_WRITE(kind, PyUnicode_DATA(result), res-res0, *res);
12855 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012856 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012857 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012858 }
12859 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012860 return (PyObject *)result;
12861
Benjamin Peterson29060642009-01-31 22:14:21 +000012862 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012863 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012864 Py_DECREF(uformat);
12865 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012866 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012867 }
12868 return NULL;
12869}
12870
Jeremy Hylton938ace62002-07-17 16:30:39 +000012871static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000012872unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
12873
Tim Peters6d6c1a32001-08-02 04:15:00 +000012874static PyObject *
12875unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12876{
Benjamin Peterson29060642009-01-31 22:14:21 +000012877 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012878 static char *kwlist[] = {"object", "encoding", "errors", 0};
12879 char *encoding = NULL;
12880 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000012881
Benjamin Peterson14339b62009-01-31 16:36:08 +000012882 if (type != &PyUnicode_Type)
12883 return unicode_subtype_new(type, args, kwds);
12884 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000012885 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012886 return NULL;
12887 if (x == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012888 return (PyObject *)PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012889 if (encoding == NULL && errors == NULL)
12890 return PyObject_Str(x);
12891 else
Benjamin Peterson29060642009-01-31 22:14:21 +000012892 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000012893}
12894
Guido van Rossume023fe02001-08-30 03:12:59 +000012895static PyObject *
12896unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12897{
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012898 PyUnicodeObject *unicode, *self;
12899 Py_ssize_t length, char_size;
12900 int share_wstr, share_utf8;
12901 unsigned int kind;
12902 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000012903
Benjamin Peterson14339b62009-01-31 16:36:08 +000012904 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012905
12906 unicode = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
12907 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012908 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020012909 assert(_PyUnicode_CHECK(unicode));
Victor Stinner1b4f9ce2011-10-03 13:28:14 +020012910 if (_PyUnicode_READY_REPLACE(&unicode))
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012911 return NULL;
12912
12913 self = (PyUnicodeObject *) type->tp_alloc(type, 0);
12914 if (self == NULL) {
12915 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012916 return NULL;
12917 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012918 kind = PyUnicode_KIND(unicode);
12919 length = PyUnicode_GET_LENGTH(unicode);
12920
12921 _PyUnicode_LENGTH(self) = length;
12922 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
12923 _PyUnicode_STATE(self).interned = 0;
12924 _PyUnicode_STATE(self).kind = kind;
12925 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020012926 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012927 _PyUnicode_STATE(self).ready = 1;
12928 _PyUnicode_WSTR(self) = NULL;
12929 _PyUnicode_UTF8_LENGTH(self) = 0;
12930 _PyUnicode_UTF8(self) = NULL;
12931 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020012932 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012933
12934 share_utf8 = 0;
12935 share_wstr = 0;
12936 if (kind == PyUnicode_1BYTE_KIND) {
12937 char_size = 1;
12938 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
12939 share_utf8 = 1;
12940 }
12941 else if (kind == PyUnicode_2BYTE_KIND) {
12942 char_size = 2;
12943 if (sizeof(wchar_t) == 2)
12944 share_wstr = 1;
12945 }
12946 else {
12947 assert(kind == PyUnicode_4BYTE_KIND);
12948 char_size = 4;
12949 if (sizeof(wchar_t) == 4)
12950 share_wstr = 1;
12951 }
12952
12953 /* Ensure we won't overflow the length. */
12954 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
12955 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012956 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012957 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012958 data = PyObject_MALLOC((length + 1) * char_size);
12959 if (data == NULL) {
12960 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012961 goto onError;
12962 }
12963
Victor Stinnerc3c74152011-10-02 20:39:55 +020012964 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012965 if (share_utf8) {
12966 _PyUnicode_UTF8_LENGTH(self) = length;
12967 _PyUnicode_UTF8(self) = data;
12968 }
12969 if (share_wstr) {
12970 _PyUnicode_WSTR_LENGTH(self) = length;
12971 _PyUnicode_WSTR(self) = (wchar_t *)data;
12972 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012973
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012974 Py_MEMCPY(data, PyUnicode_DATA(unicode),
12975 PyUnicode_KIND_SIZE(kind, length + 1));
12976 Py_DECREF(unicode);
12977 return (PyObject *)self;
12978
12979onError:
12980 Py_DECREF(unicode);
12981 Py_DECREF(self);
12982 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000012983}
12984
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012985PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000012986 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000012987\n\
Collin Winterd474ce82007-08-07 19:42:11 +000012988Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000012989encoding defaults to the current default string encoding.\n\
12990errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000012991
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012992static PyObject *unicode_iter(PyObject *seq);
12993
Guido van Rossumd57fd912000-03-10 22:53:23 +000012994PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000012995 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012996 "str", /* tp_name */
12997 sizeof(PyUnicodeObject), /* tp_size */
12998 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012999 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013000 (destructor)unicode_dealloc, /* tp_dealloc */
13001 0, /* tp_print */
13002 0, /* tp_getattr */
13003 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000013004 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013005 unicode_repr, /* tp_repr */
13006 &unicode_as_number, /* tp_as_number */
13007 &unicode_as_sequence, /* tp_as_sequence */
13008 &unicode_as_mapping, /* tp_as_mapping */
13009 (hashfunc) unicode_hash, /* tp_hash*/
13010 0, /* tp_call*/
13011 (reprfunc) unicode_str, /* tp_str */
13012 PyObject_GenericGetAttr, /* tp_getattro */
13013 0, /* tp_setattro */
13014 0, /* tp_as_buffer */
13015 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000013016 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013017 unicode_doc, /* tp_doc */
13018 0, /* tp_traverse */
13019 0, /* tp_clear */
13020 PyUnicode_RichCompare, /* tp_richcompare */
13021 0, /* tp_weaklistoffset */
13022 unicode_iter, /* tp_iter */
13023 0, /* tp_iternext */
13024 unicode_methods, /* tp_methods */
13025 0, /* tp_members */
13026 0, /* tp_getset */
13027 &PyBaseObject_Type, /* tp_base */
13028 0, /* tp_dict */
13029 0, /* tp_descr_get */
13030 0, /* tp_descr_set */
13031 0, /* tp_dictoffset */
13032 0, /* tp_init */
13033 0, /* tp_alloc */
13034 unicode_new, /* tp_new */
13035 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013036};
13037
13038/* Initialize the Unicode implementation */
13039
Thomas Wouters78890102000-07-22 19:25:51 +000013040void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013041{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013042 int i;
13043
Thomas Wouters477c8d52006-05-27 19:21:47 +000013044 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013045 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000013046 0x000A, /* LINE FEED */
13047 0x000D, /* CARRIAGE RETURN */
13048 0x001C, /* FILE SEPARATOR */
13049 0x001D, /* GROUP SEPARATOR */
13050 0x001E, /* RECORD SEPARATOR */
13051 0x0085, /* NEXT LINE */
13052 0x2028, /* LINE SEPARATOR */
13053 0x2029, /* PARAGRAPH SEPARATOR */
13054 };
13055
Fred Drakee4315f52000-05-09 19:53:39 +000013056 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020013057 unicode_empty = PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013058 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013059 Py_FatalError("Can't create empty string");
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013060
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013061 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000013062 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000013063 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013064 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000013065
13066 /* initialize the linebreak bloom filter */
13067 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013068 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020013069 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013070
13071 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013072}
13073
13074/* Finalize the Unicode implementation */
13075
/* This implementation does no work and always returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    return 0;
}
13081
Guido van Rossumd57fd912000-03-10 22:53:23 +000013082void
Thomas Wouters78890102000-07-22 19:25:51 +000013083_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013084{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013085 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013086
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000013087 Py_XDECREF(unicode_empty);
13088 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000013089
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013090 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013091 if (unicode_latin1[i]) {
13092 Py_DECREF(unicode_latin1[i]);
13093 unicode_latin1[i] = NULL;
13094 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013095 }
Christian Heimesa156e092008-02-16 07:38:31 +000013096 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000013097}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000013098
Walter Dörwald16807132007-05-25 13:52:07 +000013099void
13100PyUnicode_InternInPlace(PyObject **p)
13101{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013102 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
13103 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020013104#ifdef Py_DEBUG
13105 assert(s != NULL);
13106 assert(_PyUnicode_CHECK(s));
13107#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000013108 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020013109 return;
13110#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000013111 /* If it's a subclass, we don't really know what putting
13112 it in the interned dict might do. */
13113 if (!PyUnicode_CheckExact(s))
13114 return;
13115 if (PyUnicode_CHECK_INTERNED(s))
13116 return;
Victor Stinner1b4f9ce2011-10-03 13:28:14 +020013117 if (_PyUnicode_READY_REPLACE(p)) {
Victor Stinner4fae54c2011-10-03 02:01:52 +020013118 assert(0 && "PyUnicode_READY fail in PyUnicode_InternInPlace");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013119 return;
13120 }
Victor Stinner1b4f9ce2011-10-03 13:28:14 +020013121 s = (PyUnicodeObject *)(*p);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013122 if (interned == NULL) {
13123 interned = PyDict_New();
13124 if (interned == NULL) {
13125 PyErr_Clear(); /* Don't leave an exception */
13126 return;
13127 }
13128 }
13129 /* It might be that the GetItem call fails even
13130 though the key is present in the dictionary,
13131 namely when this happens during a stack overflow. */
13132 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +000013133 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013134 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000013135
Benjamin Peterson29060642009-01-31 22:14:21 +000013136 if (t) {
13137 Py_INCREF(t);
13138 Py_DECREF(*p);
13139 *p = t;
13140 return;
13141 }
Walter Dörwald16807132007-05-25 13:52:07 +000013142
Benjamin Peterson14339b62009-01-31 16:36:08 +000013143 PyThreadState_GET()->recursion_critical = 1;
13144 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
13145 PyErr_Clear();
13146 PyThreadState_GET()->recursion_critical = 0;
13147 return;
13148 }
13149 PyThreadState_GET()->recursion_critical = 0;
13150 /* The two references in interned are not counted by refcnt.
13151 The deallocator will take care of this */
13152 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013153 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000013154}
13155
13156void
13157PyUnicode_InternImmortal(PyObject **p)
13158{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013159 PyUnicodeObject *u = (PyUnicodeObject *)*p;
13160
Benjamin Peterson14339b62009-01-31 16:36:08 +000013161 PyUnicode_InternInPlace(p);
13162 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013163 _PyUnicode_STATE(u).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013164 Py_INCREF(*p);
13165 }
Walter Dörwald16807132007-05-25 13:52:07 +000013166}
13167
13168PyObject *
13169PyUnicode_InternFromString(const char *cp)
13170{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013171 PyObject *s = PyUnicode_FromString(cp);
13172 if (s == NULL)
13173 return NULL;
13174 PyUnicode_InternInPlace(&s);
13175 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000013176}
13177
Alexander Belopolsky40018472011-02-26 01:02:56 +000013178void
13179_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000013180{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013181 PyObject *keys;
13182 PyUnicodeObject *s;
13183 Py_ssize_t i, n;
13184 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000013185
Benjamin Peterson14339b62009-01-31 16:36:08 +000013186 if (interned == NULL || !PyDict_Check(interned))
13187 return;
13188 keys = PyDict_Keys(interned);
13189 if (keys == NULL || !PyList_Check(keys)) {
13190 PyErr_Clear();
13191 return;
13192 }
Walter Dörwald16807132007-05-25 13:52:07 +000013193
Benjamin Peterson14339b62009-01-31 16:36:08 +000013194 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
13195 detector, interned unicode strings are not forcibly deallocated;
13196 rather, we give them their stolen references back, and then clear
13197 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000013198
Benjamin Peterson14339b62009-01-31 16:36:08 +000013199 n = PyList_GET_SIZE(keys);
13200 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000013201 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013202 for (i = 0; i < n; i++) {
13203 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013204 if (PyUnicode_READY(s) == -1)
13205 fprintf(stderr, "could not ready string\n");
13206 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013207 case SSTATE_NOT_INTERNED:
13208 /* XXX Shouldn't happen */
13209 break;
13210 case SSTATE_INTERNED_IMMORTAL:
13211 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013212 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013213 break;
13214 case SSTATE_INTERNED_MORTAL:
13215 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013216 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013217 break;
13218 default:
13219 Py_FatalError("Inconsistent interned string state.");
13220 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013221 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013222 }
13223 fprintf(stderr, "total size of all interned strings: "
13224 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
13225 "mortal/immortal\n", mortal_size, immortal_size);
13226 Py_DECREF(keys);
13227 PyDict_Clear(interned);
13228 Py_DECREF(interned);
13229 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000013230}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013231
13232
13233/********************* Unicode Iterator **************************/
13234
13235typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013236 PyObject_HEAD
13237 Py_ssize_t it_index;
13238 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013239} unicodeiterobject;
13240
13241static void
13242unicodeiter_dealloc(unicodeiterobject *it)
13243{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013244 _PyObject_GC_UNTRACK(it);
13245 Py_XDECREF(it->it_seq);
13246 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013247}
13248
13249static int
13250unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
13251{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013252 Py_VISIT(it->it_seq);
13253 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013254}
13255
13256static PyObject *
13257unicodeiter_next(unicodeiterobject *it)
13258{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013259 PyUnicodeObject *seq;
13260 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013261
Benjamin Peterson14339b62009-01-31 16:36:08 +000013262 assert(it != NULL);
13263 seq = it->it_seq;
13264 if (seq == NULL)
13265 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013266 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013267
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013268 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
13269 int kind = PyUnicode_KIND(seq);
13270 void *data = PyUnicode_DATA(seq);
13271 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
13272 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013273 if (item != NULL)
13274 ++it->it_index;
13275 return item;
13276 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013277
Benjamin Peterson14339b62009-01-31 16:36:08 +000013278 Py_DECREF(seq);
13279 it->it_seq = NULL;
13280 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013281}
13282
13283static PyObject *
13284unicodeiter_len(unicodeiterobject *it)
13285{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013286 Py_ssize_t len = 0;
13287 if (it->it_seq)
13288 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
13289 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013290}
13291
13292PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
13293
13294static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013295 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000013296 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000013297 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013298};
13299
13300PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013301 PyVarObject_HEAD_INIT(&PyType_Type, 0)
13302 "str_iterator", /* tp_name */
13303 sizeof(unicodeiterobject), /* tp_basicsize */
13304 0, /* tp_itemsize */
13305 /* methods */
13306 (destructor)unicodeiter_dealloc, /* tp_dealloc */
13307 0, /* tp_print */
13308 0, /* tp_getattr */
13309 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000013310 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013311 0, /* tp_repr */
13312 0, /* tp_as_number */
13313 0, /* tp_as_sequence */
13314 0, /* tp_as_mapping */
13315 0, /* tp_hash */
13316 0, /* tp_call */
13317 0, /* tp_str */
13318 PyObject_GenericGetAttr, /* tp_getattro */
13319 0, /* tp_setattro */
13320 0, /* tp_as_buffer */
13321 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
13322 0, /* tp_doc */
13323 (traverseproc)unicodeiter_traverse, /* tp_traverse */
13324 0, /* tp_clear */
13325 0, /* tp_richcompare */
13326 0, /* tp_weaklistoffset */
13327 PyObject_SelfIter, /* tp_iter */
13328 (iternextfunc)unicodeiter_next, /* tp_iternext */
13329 unicodeiter_methods, /* tp_methods */
13330 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013331};
13332
13333static PyObject *
13334unicode_iter(PyObject *seq)
13335{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013336 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013337
Benjamin Peterson14339b62009-01-31 16:36:08 +000013338 if (!PyUnicode_Check(seq)) {
13339 PyErr_BadInternalCall();
13340 return NULL;
13341 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013342 if (PyUnicode_READY(seq) == -1)
13343 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013344 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
13345 if (it == NULL)
13346 return NULL;
13347 it->it_index = 0;
13348 Py_INCREF(seq);
13349 it->it_seq = (PyUnicodeObject *)seq;
13350 _PyObject_GC_TRACK(it);
13351 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013352}
13353
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013354#define UNIOP(x) Py_UNICODE_##x
13355#define UNIOP_t Py_UNICODE
13356#include "uniops.h"
13357#undef UNIOP
13358#undef UNIOP_t
13359#define UNIOP(x) Py_UCS4_##x
13360#define UNIOP_t Py_UCS4
13361#include "uniops.h"
13362#undef UNIOP
13363#undef UNIOP_t
Victor Stinner331ea922010-08-10 16:37:20 +000013364
Victor Stinner71133ff2010-09-01 23:43:53 +000013365Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000013366PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000013367{
13368 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
13369 Py_UNICODE *copy;
13370 Py_ssize_t size;
13371
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013372 if (!PyUnicode_Check(unicode)) {
13373 PyErr_BadArgument();
13374 return NULL;
13375 }
Victor Stinner71133ff2010-09-01 23:43:53 +000013376 /* Ensure we won't overflow the size. */
13377 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
13378 PyErr_NoMemory();
13379 return NULL;
13380 }
13381 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
13382 size *= sizeof(Py_UNICODE);
13383 copy = PyMem_Malloc(size);
13384 if (copy == NULL) {
13385 PyErr_NoMemory();
13386 return NULL;
13387 }
13388 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
13389 return copy;
13390}
Martin v. Löwis5b222132007-06-10 09:51:05 +000013391
Georg Brandl66c221e2010-10-14 07:04:07 +000013392/* A _string module, to export formatter_parser and formatter_field_name_split
13393 to the string.Formatter class implemented in Python. */
13394
13395static PyMethodDef _string_methods[] = {
13396 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
13397 METH_O, PyDoc_STR("split the argument as a field name")},
13398 {"formatter_parser", (PyCFunction) formatter_parser,
13399 METH_O, PyDoc_STR("parse the argument as a format string")},
13400 {NULL, NULL}
13401};
13402
13403static struct PyModuleDef _string_module = {
13404 PyModuleDef_HEAD_INIT,
13405 "_string",
13406 PyDoc_STR("string helper module"),
13407 0,
13408 _string_methods,
13409 NULL,
13410 NULL,
13411 NULL,
13412 NULL
13413};
13414
13415PyMODINIT_FUNC
13416PyInit__string(void)
13417{
13418 return PyModule_Create(&_string_module);
13419}
13420
13421
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013422#ifdef __cplusplus
13423}
13424#endif