blob: 028cada2c132f9fe031af5a218ab13367c318e29 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000044
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000045#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000046#include <windows.h>
47#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000048
/* Upper bound on the number of Unicode objects kept on the free list. */
#define PyUnicode_MAXFREELIST       1024

/* Size threshold for the free list "stay alive" optimization.

   Objects parked on the free list keep their allocated Unicode memory
   intact when their size is below this limit, which saves malloc()
   overhead for small Unicode objects.

   In the worst case this leaves

       PyUnicode_MAXFREELIST *
       (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT + malloc()-overhead)

   bytes of unused garbage around.

   A limit of 0 effectively turns the feature off.

   Note: this is an experimental feature.  If you get core dumps when
   using Unicode objects, turn it off.
*/
#define KEEPALIVE_SIZE_LIMIT        9

/* Byte order selection: little endian unless WORDS_BIGENDIAN is defined. */
#ifndef WORDS_BIGENDIAN
# define BYTEORDER_IS_LITTLE_ENDIAN
#else
# define BYTEORDER_IS_BIG_ENDIAN
#endif
79
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000080/* --- Globals ------------------------------------------------------------
81
82 The globals are initialized by the _PyUnicode_Init() API and should
83 not be used before calling that API.
84
85*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000086
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000087
88#ifdef __cplusplus
89extern "C" {
90#endif
91
/* Argument check used by the macros in this file: debug builds run
   _PyUnicode_CheckConsistency(), other builds only run PyUnicode_Check(). */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op)
#endif

/* --- Raw field accessors (no checking, usable as lvalues) --- */

#define _PyUnicode_UTF8(op) \
    (((PyCompactUnicodeObject*)(op))->utf8)
#define _PyUnicode_UTF8_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
#define _PyUnicode_WSTR(op) \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op) \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) \
    (((PyASCIIObject *)(op))->hash)
#define _PyUnicode_DATA_ANY(op) \
    (((PyUnicodeObject*)(op))->data.any)

/* --- Checked accessors --- */

/* UTF-8 buffer of a ready string: for compact ASCII objects this is the
   memory right after the PyASCIIObject header, otherwise the utf8 field. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))

/* Length matching PyUnicode_UTF8(): the length field for compact ASCII
   objects, otherwise the utf8_length field. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))

#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)

#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200132
/* File-local replacement for PyUnicode_READY() that additionally runs
   _PyUnicode_CHECK() on its argument.  Evaluates to 0 when op is already
   ready, otherwise to the result of _PyUnicode_Ready(). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready((PyObject *)(op))))

/* Same idea for a pointer to an object pointer: evaluates to 0 when
   *p_obj is already ready, otherwise to the result of
   _PyUnicode_ReadyReplace().  p_obj is parenthesized before being
   dereferenced so that arguments such as "array + 1" expand correctly. */
#define _PyUnicode_READY_REPLACE(p_obj)                 \
    (assert(_PyUnicode_CHECK(*(p_obj))),                \
     (PyUnicode_IS_READY(*(p_obj)) ?                    \
      0 : _PyUnicode_ReadyReplace((PyObject **)(p_obj))))
144
/* true if the utf8 pointer of the (non compact-ASCII) object is the very
   same pointer as its character data */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* true if the wstr pointer of the object is the very same pointer as its
   character data.  Only the macro argument is referenced, so the macro
   no longer depends on the caller's variable being named "unicode". */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
152
/* Non-zero when the object has its own UTF-8 memory block: it is not
   compact ASCII, its utf8 pointer is set, and that pointer is not the
   character data itself. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                          \
    (assert(_PyUnicode_CHECK(op)),                              \
     (!PyUnicode_IS_COMPACT_ASCII(op) &&                        \
      _PyUnicode_UTF8(op) &&                                    \
      _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* Non-zero when the object has its own wstr memory block: its wstr
   pointer is set and either the object is not ready yet or wstr is not
   the character data itself. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                          \
    (assert(_PyUnicode_CHECK(op)),                              \
     (_PyUnicode_WSTR(op) &&                                    \
      (!PyUnicode_IS_READY(op) ||                               \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
168
/* Generic character conversion helper.

   from_type and to_type must be valid type names.  begin and end delimit
   the source characters and should be of type "from_type *".  to points
   to the destination buffer; it is cast to "to_type *".  Every source
   character in [begin, end) is cast to to_type and stored, in order. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to)   \
    do {                                                                \
        const from_type *cvt_src_ = (begin);                            \
        to_type *cvt_dst_ = (to_type *)(to);                            \
        while (cvt_src_ < (end)) {                                      \
            *cvt_dst_ = (to_type)*cvt_src_;                             \
            ++cvt_src_;                                                 \
            ++cvt_dst_;                                                 \
        }                                                               \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200183
Victor Stinnerb15d4d82011-09-28 23:59:20 +0200184/* The Unicode string has been modified: reset the hash */
185#define _PyUnicode_DIRTY(op) do { _PyUnicode_HASH(op) = -1; } while (0)
186
Walter Dörwald16807132007-05-25 13:52:07 +0000187/* This dictionary holds all interned unicode strings. Note that references
188 to strings in this dictionary are *not* counted in the string's ob_refcnt.
189 When the interned string reaches a refcnt of 0 the string deallocation
190 function will delete the reference from this dictionary.
191
192 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000193 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000194*/
195static PyObject *interned;
196
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000197/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200198static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000199
200/* Single character Unicode strings in the Latin-1 range are being
201 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200202static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000203
/* Fast detection of the most frequent whitespace characters: a lookup
   table indexed by the ASCII code point (0x00 - 0x7F), 1 for whitespace. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F
       0x0009: CHARACTER TABULATION
       0x000A: LINE FEED
       0x000B: LINE TABULATION
       0x000C: FORM FEED
       0x000D: CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F
       0x001C: FILE SEPARATOR
       0x001D: GROUP SEPARATOR
       0x001E: RECORD SEPARATOR
       0x001F: UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x2F
       0x0020: SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
234
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200235/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200236static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200237static PyObject* get_latin1_char(unsigned char ch);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200238
Alexander Belopolsky40018472011-02-26 01:02:56 +0000239static PyObject *
240unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000241 PyObject **errorHandler,const char *encoding, const char *reason,
242 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
243 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
244
Alexander Belopolsky40018472011-02-26 01:02:56 +0000245static void
246raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300247 const char *encoding,
248 const Py_UNICODE *unicode, Py_ssize_t size,
249 Py_ssize_t startpos, Py_ssize_t endpos,
250 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000251
/* Same for linebreaks: a read-only lookup table indexed by the ASCII code
   point (0x00 - 0x7F), 1 for characters treated as line breaks.  The
   table is never written to, hence const. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F
       0x000A: LINE FEED
       0x000B: LINE TABULATION
       0x000C: FORM FEED
       0x000D: CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F
       0x001C: FILE SEPARATOR
       0x001D: GROUP SEPARATOR
       0x001E: RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
279
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300280/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
281 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000282Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000283PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000284{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000285#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000286 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000287#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000288 /* This is actually an illegal character, so it should
289 not be passed to unichr. */
290 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000291#endif
292}
293
#ifndef Py_DEBUG
/* Without Py_DEBUG nothing is checked: always returns 1. */
static int
_PyUnicode_CheckConsistency(void *op)
{
    return 1;
}
#else
/* Debug helper: assert that the state flags, lengths and buffer pointers
   of the Unicode object op agree with each other.  Every check is an
   assert(); the function itself always returns 1. */
static int
_PyUnicode_CheckConsistency(void *op)
{
    PyASCIIObject *hdr;
    PyCompactUnicodeObject *ext;
    unsigned int kind;
    void *payload;

    assert(PyUnicode_Check(op));

    hdr = (PyASCIIObject *)op;
    ext = (PyCompactUnicodeObject *)op;
    kind = hdr->state.kind;

    /* compact ASCII: only the kind and the ready flag are checked */
    if (hdr->state.ascii == 1 && hdr->state.compact == 1) {
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(hdr->state.ready == 1);
        return 1;
    }

    if (hdr->state.compact == 1) {
        /* compact, non-ASCII: characters follow the compact header */
        payload = ext + 1;
        assert(kind == PyUnicode_1BYTE_KIND
               || kind == PyUnicode_2BYTE_KIND
               || kind == PyUnicode_4BYTE_KIND);
        assert(hdr->state.ascii == 0);
        assert(hdr->state.ready == 1);
        assert(ext->utf8 != payload);
    }
    else {
        payload = ((PyUnicodeObject *)op)->data.any;
        if (kind == PyUnicode_WCHAR_KIND) {
            /* not ready yet: only the wstr representation exists */
            assert(hdr->state.compact == 0);
            assert(hdr->state.ascii == 0);
            assert(hdr->state.ready == 0);
            assert(hdr->wstr != NULL);
            assert(payload == NULL);
            assert(ext->utf8 == NULL);
            assert(hdr->state.interned == SSTATE_NOT_INTERNED);
        }
        else {
            /* ready, with a separately referenced character buffer */
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(hdr->state.compact == 0);
            assert(hdr->state.ready == 1);
            assert(payload != NULL);
            if (hdr->state.ascii) {
                assert(ext->utf8 == payload);
                assert(ext->utf8_length == hdr->length);
            }
            else {
                assert(ext->utf8 != payload);
            }
        }
    }

    if (kind != PyUnicode_WCHAR_KIND) {
        /* wstr must alias the characters exactly when the kind has the
           width of wchar_t */
#if SIZEOF_WCHAR_T == 2
        const int wstr_aliases_data = (kind == PyUnicode_2BYTE_KIND);
#else
        const int wstr_aliases_data = (kind == PyUnicode_4BYTE_KIND);
#endif
        if (wstr_aliases_data) {
            assert(hdr->wstr == payload);
            assert(ext->wstr_length == hdr->length);
        }
        else {
            assert(hdr->wstr != payload);
        }
    }

    /* a missing buffer must come with a zero length */
    assert(ext->utf8 != NULL || ext->utf8_length == 0);
    assert(hdr->wstr != NULL || ext->wstr_length == 0);
    return 1;
}
#endif
379
Thomas Wouters477c8d52006-05-27 19:21:47 +0000380/* --- Bloom Filters ----------------------------------------------------- */
381
382/* stuff to implement simple "bloom filters" for Unicode characters.
383 to keep things simple, we use a single bitmask, using the least 5
384 bits from each unicode characters as the bit index. */
385
386/* the linebreak mask is set up by Unicode_Init below */
387
Antoine Pitrouf068f942010-01-13 14:19:12 +0000388#if LONG_BIT >= 128
389#define BLOOM_WIDTH 128
390#elif LONG_BIT >= 64
391#define BLOOM_WIDTH 64
392#elif LONG_BIT >= 32
393#define BLOOM_WIDTH 32
394#else
395#error "LONG_BIT is smaller than 32"
396#endif
397
/* Type of a bloom bitmask. */
#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* BLOOM_ADD sets the bit selected by ch in mask (mask must be an lvalue);
   BLOOM evaluates to non-zero when that bit is set in mask.  The mask
   argument is parenthesized so that compound expressions such as
   BLOOM(a | b, ch) are tested as a whole. */
#define BLOOM_ADD(mask, ch) (((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
#define BLOOM(mask, ch)     (((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))

/* Line break test: characters below 128 are looked up in ascii_linebreak;
   other characters must pass the bloom_linebreak filter before
   Py_UNICODE_ISLINEBREAK() is consulted. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000408
Alexander Belopolsky40018472011-02-26 01:02:56 +0000409Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200410make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000411{
412 /* calculate simple bloom-style bitmask for a given unicode string */
413
Antoine Pitrouf068f942010-01-13 14:19:12 +0000414 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000415 Py_ssize_t i;
416
417 mask = 0;
418 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200419 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000420
421 return mask;
422}
423
/* Membership test: the bloom mask is checked first, and only on a hit is
   str searched for chr with PyUnicode_FindChar(). */
#define BLOOM_MEMBER(mask, chr, str)                                    \
    (BLOOM(mask, chr) &&                                                \
     PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000427
Guido van Rossumd57fd912000-03-10 22:53:23 +0000428/* --- Unicode Object ----------------------------------------------------- */
429
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200430static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200431fixup(PyUnicodeObject *self, Py_UCS4 (*fixfct)(PyUnicodeObject *s));
432
433Py_LOCAL_INLINE(char *) findchar(void *s, int kind,
434 Py_ssize_t size, Py_UCS4 ch,
435 int direction)
436{
437 /* like wcschr, but doesn't stop at NULL characters */
438 Py_ssize_t i;
439 if (direction == 1) {
440 for(i = 0; i < size; i++)
441 if (PyUnicode_READ(kind, s, i) == ch)
442 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
443 }
444 else {
445 for(i = size-1; i >= 0; i--)
446 if (PyUnicode_READ(kind, s, i) == ch)
447 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
448 }
449 return NULL;
450}
451
Victor Stinnerfe226c02011-10-03 03:52:20 +0200452static PyObject*
453resize_compact(PyObject *unicode, Py_ssize_t length)
454{
455 Py_ssize_t char_size;
456 Py_ssize_t struct_size;
457 Py_ssize_t new_size;
458 int share_wstr;
459
460 assert(PyUnicode_IS_READY(unicode));
461 char_size = PyUnicode_CHARACTER_SIZE(unicode);
462 if (PyUnicode_IS_COMPACT_ASCII(unicode))
463 struct_size = sizeof(PyASCIIObject);
464 else
465 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200466 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200467
468 _Py_DEC_REFTOTAL;
469 _Py_ForgetReference(unicode);
470
471 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
472 PyErr_NoMemory();
473 return NULL;
474 }
475 new_size = (struct_size + (length + 1) * char_size);
476
477 unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
478 if (unicode == NULL) {
479 PyObject_Del(unicode);
480 PyErr_NoMemory();
481 return NULL;
482 }
483 _Py_NewReference(unicode);
484 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200485 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200486 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200487 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
488 _PyUnicode_WSTR_LENGTH(unicode) = length;
489 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200490 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
491 length, 0);
492 return unicode;
493}
494
Alexander Belopolsky40018472011-02-26 01:02:56 +0000495static int
Victor Stinner95663112011-10-04 01:03:50 +0200496resize_inplace(PyUnicodeObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000497{
Victor Stinner95663112011-10-04 01:03:50 +0200498 wchar_t *wstr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200499 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200500 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000501
Victor Stinner95663112011-10-04 01:03:50 +0200502 _PyUnicode_DIRTY(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200503
504 if (PyUnicode_IS_READY(unicode)) {
505 Py_ssize_t char_size;
506 Py_ssize_t new_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200507 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200508 void *data;
509
510 data = _PyUnicode_DATA_ANY(unicode);
511 assert(data != NULL);
512 char_size = PyUnicode_CHARACTER_SIZE(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200513 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
514 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinner95663112011-10-04 01:03:50 +0200515 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
516 {
517 PyObject_DEL(_PyUnicode_UTF8(unicode));
518 _PyUnicode_UTF8(unicode) = NULL;
519 _PyUnicode_UTF8_LENGTH(unicode) = 0;
520 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200521
522 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
523 PyErr_NoMemory();
524 return -1;
525 }
526 new_size = (length + 1) * char_size;
527
528 data = (PyObject *)PyObject_REALLOC(data, new_size);
529 if (data == NULL) {
530 PyErr_NoMemory();
531 return -1;
532 }
533 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200534 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200535 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200536 _PyUnicode_WSTR_LENGTH(unicode) = length;
537 }
538 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200539 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200540 _PyUnicode_UTF8_LENGTH(unicode) = length;
541 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200542 _PyUnicode_LENGTH(unicode) = length;
543 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinner95663112011-10-04 01:03:50 +0200544 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400545 _PyUnicode_CheckConsistency(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200546 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200547 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200548 }
Victor Stinner95663112011-10-04 01:03:50 +0200549 assert(_PyUnicode_WSTR(unicode) != NULL);
550
551 /* check for integer overflow */
552 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
553 PyErr_NoMemory();
554 return -1;
555 }
556 wstr = _PyUnicode_WSTR(unicode);
557 wstr = PyObject_REALLOC(wstr, sizeof(wchar_t) * (length + 1));
558 if (!wstr) {
559 PyErr_NoMemory();
560 return -1;
561 }
562 _PyUnicode_WSTR(unicode) = wstr;
563 _PyUnicode_WSTR(unicode)[length] = 0;
564 _PyUnicode_WSTR_LENGTH(unicode) = length;
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400565 _PyUnicode_CheckConsistency(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000566 return 0;
567}
568
Victor Stinnerfe226c02011-10-03 03:52:20 +0200569static PyObject*
570resize_copy(PyObject *unicode, Py_ssize_t length)
571{
572 Py_ssize_t copy_length;
573 if (PyUnicode_IS_COMPACT(unicode)) {
574 PyObject *copy;
575 assert(PyUnicode_IS_READY(unicode));
576
577 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
578 if (copy == NULL)
579 return NULL;
580
581 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
582 if (PyUnicode_CopyCharacters(copy, 0,
583 unicode, 0,
584 copy_length) < 0)
585 {
586 Py_DECREF(copy);
587 return NULL;
588 }
589 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200590 }
591 else {
Victor Stinner2fd82272011-10-03 04:06:05 +0200592 PyUnicodeObject *w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200593 assert(_PyUnicode_WSTR(unicode) != NULL);
594 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinner2fd82272011-10-03 04:06:05 +0200595 w = _PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200596 if (w == NULL)
597 return NULL;
598 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
599 copy_length = Py_MIN(copy_length, length);
600 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
601 copy_length);
602 return (PyObject*)w;
603 }
604}
605
Guido van Rossumd57fd912000-03-10 22:53:23 +0000606/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000607 Ux0000 terminated; some code (e.g. new_identifier)
608 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000609
610 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000611 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000612
613*/
614
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200615#ifdef Py_DEBUG
616int unicode_old_new_calls = 0;
617#endif
618
Alexander Belopolsky40018472011-02-26 01:02:56 +0000619static PyUnicodeObject *
620_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000621{
622 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200623 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000624
Thomas Wouters477c8d52006-05-27 19:21:47 +0000625 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000626 if (length == 0 && unicode_empty != NULL) {
627 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200628 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000629 }
630
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000631 /* Ensure we won't overflow the size. */
632 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
633 return (PyUnicodeObject *)PyErr_NoMemory();
634 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200635 if (length < 0) {
636 PyErr_SetString(PyExc_SystemError,
637 "Negative size passed to _PyUnicode_New");
638 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000639 }
640
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200641#ifdef Py_DEBUG
642 ++unicode_old_new_calls;
643#endif
644
645 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
646 if (unicode == NULL)
647 return NULL;
648 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
649 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
650 if (!_PyUnicode_WSTR(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000651 PyErr_NoMemory();
652 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000653 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200654
Jeremy Hyltond8082792003-09-16 19:41:39 +0000655 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000656 * the caller fails before initializing str -- unicode_resize()
657 * reads str[0], and the Keep-Alive optimization can keep memory
658 * allocated for str alive across a call to unicode_dealloc(unicode).
659 * We don't want unicode_resize to read uninitialized memory in
660 * that case.
661 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200662 _PyUnicode_WSTR(unicode)[0] = 0;
663 _PyUnicode_WSTR(unicode)[length] = 0;
664 _PyUnicode_WSTR_LENGTH(unicode) = length;
665 _PyUnicode_HASH(unicode) = -1;
666 _PyUnicode_STATE(unicode).interned = 0;
667 _PyUnicode_STATE(unicode).kind = 0;
668 _PyUnicode_STATE(unicode).compact = 0;
669 _PyUnicode_STATE(unicode).ready = 0;
670 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200671 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200672 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200673 _PyUnicode_UTF8(unicode) = NULL;
674 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000675 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000676
Benjamin Peterson29060642009-01-31 22:14:21 +0000677 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000678 /* XXX UNREF/NEWREF interface should be more symmetrical */
679 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000680 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000681 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000682 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000683}
684
Victor Stinnerf42dc442011-10-02 23:33:16 +0200685static const char*
686unicode_kind_name(PyObject *unicode)
687{
Victor Stinner42dfd712011-10-03 14:41:45 +0200688 /* don't check consistency: unicode_kind_name() is called from
689 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200690 if (!PyUnicode_IS_COMPACT(unicode))
691 {
692 if (!PyUnicode_IS_READY(unicode))
693 return "wstr";
694 switch(PyUnicode_KIND(unicode))
695 {
696 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200697 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200698 return "legacy ascii";
699 else
700 return "legacy latin1";
701 case PyUnicode_2BYTE_KIND:
702 return "legacy UCS2";
703 case PyUnicode_4BYTE_KIND:
704 return "legacy UCS4";
705 default:
706 return "<legacy invalid kind>";
707 }
708 }
709 assert(PyUnicode_IS_READY(unicode));
710 switch(PyUnicode_KIND(unicode))
711 {
712 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200713 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200714 return "ascii";
715 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200716 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200717 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200718 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200719 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200720 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200721 default:
722 return "<invalid compact kind>";
723 }
724}
725
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200726#ifdef Py_DEBUG
727int unicode_new_new_calls = 0;
728
729/* Functions wrapping macros for use in debugger */
/* Function form of the PyUnicode_UTF8() macro, callable from a debugger. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);

    return utf8;
}
733
/* Function form of the _PyUnicode_COMPACT_DATA() macro, callable from a
   debugger. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);

    return data;
}
737void *_PyUnicode_data(void *unicode){
738 printf("obj %p\n", unicode);
739 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
740 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
741 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
742 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
743 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
744 return PyUnicode_DATA(unicode);
745}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200746
747void
748_PyUnicode_Dump(PyObject *op)
749{
750 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200751 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
752 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
753 void *data;
754 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
755 if (ascii->state.compact)
756 data = (compact + 1);
757 else
758 data = unicode->data.any;
759 if (ascii->wstr == data)
760 printf("shared ");
761 printf("wstr=%p", ascii->wstr);
Victor Stinnera3b334d2011-10-03 13:53:37 +0200762 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +0200763 printf(" (%zu), ", compact->wstr_length);
764 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
765 printf("shared ");
766 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200767 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200768 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200769}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200770#endif
771
772PyObject *
773PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
774{
775 PyObject *obj;
776 PyCompactUnicodeObject *unicode;
777 void *data;
778 int kind_state;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200779 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200780 Py_ssize_t char_size;
781 Py_ssize_t struct_size;
782
783 /* Optimization for empty strings */
784 if (size == 0 && unicode_empty != NULL) {
785 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200786 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200787 }
788
789#ifdef Py_DEBUG
790 ++unicode_new_new_calls;
791#endif
792
Victor Stinner9e9d6892011-10-04 01:02:02 +0200793 is_ascii = 0;
794 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200795 struct_size = sizeof(PyCompactUnicodeObject);
796 if (maxchar < 128) {
797 kind_state = PyUnicode_1BYTE_KIND;
798 char_size = 1;
799 is_ascii = 1;
800 struct_size = sizeof(PyASCIIObject);
801 }
802 else if (maxchar < 256) {
803 kind_state = PyUnicode_1BYTE_KIND;
804 char_size = 1;
805 }
806 else if (maxchar < 65536) {
807 kind_state = PyUnicode_2BYTE_KIND;
808 char_size = 2;
809 if (sizeof(wchar_t) == 2)
810 is_sharing = 1;
811 }
812 else {
813 kind_state = PyUnicode_4BYTE_KIND;
814 char_size = 4;
815 if (sizeof(wchar_t) == 4)
816 is_sharing = 1;
817 }
818
819 /* Ensure we won't overflow the size. */
820 if (size < 0) {
821 PyErr_SetString(PyExc_SystemError,
822 "Negative size passed to PyUnicode_New");
823 return NULL;
824 }
825 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
826 return PyErr_NoMemory();
827
828 /* Duplicated allocation code from _PyObject_New() instead of a call to
829 * PyObject_New() so we are able to allocate space for the object and
830 * it's data buffer.
831 */
832 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
833 if (obj == NULL)
834 return PyErr_NoMemory();
835 obj = PyObject_INIT(obj, &PyUnicode_Type);
836 if (obj == NULL)
837 return NULL;
838
839 unicode = (PyCompactUnicodeObject *)obj;
840 if (is_ascii)
841 data = ((PyASCIIObject*)obj) + 1;
842 else
843 data = unicode + 1;
844 _PyUnicode_LENGTH(unicode) = size;
845 _PyUnicode_HASH(unicode) = -1;
846 _PyUnicode_STATE(unicode).interned = 0;
847 _PyUnicode_STATE(unicode).kind = kind_state;
848 _PyUnicode_STATE(unicode).compact = 1;
849 _PyUnicode_STATE(unicode).ready = 1;
850 _PyUnicode_STATE(unicode).ascii = is_ascii;
851 if (is_ascii) {
852 ((char*)data)[size] = 0;
853 _PyUnicode_WSTR(unicode) = NULL;
854 }
855 else if (kind_state == PyUnicode_1BYTE_KIND) {
856 ((char*)data)[size] = 0;
857 _PyUnicode_WSTR(unicode) = NULL;
858 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200859 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200860 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200861 }
862 else {
863 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200864 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200865 if (kind_state == PyUnicode_2BYTE_KIND)
866 ((Py_UCS2*)data)[size] = 0;
867 else /* kind_state == PyUnicode_4BYTE_KIND */
868 ((Py_UCS4*)data)[size] = 0;
869 if (is_sharing) {
870 _PyUnicode_WSTR_LENGTH(unicode) = size;
871 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
872 }
873 else {
874 _PyUnicode_WSTR_LENGTH(unicode) = 0;
875 _PyUnicode_WSTR(unicode) = NULL;
876 }
877 }
878 return obj;
879}
880
#if SIZEOF_WCHAR_T == 2
/* Helper converting a 16-bit wchar_t buffer [begin, end) to UCS4, written
   into the 4-byte data of `unicode`.  A high surrogate immediately followed
   by a low surrogate is decoded into one code point; every other unit,
   including a lone surrogate, is copied unchanged.  The other conversions
   are implemented as macros for efficiency.

   `unicode` must already be a 4-byte kind string whose length equals the
   number of decoded code points (checked by assertions only).  The original
   note on this function adds that the caller must also have reserved room
   for a terminating null character; no terminator is written here. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyUnicodeObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        int is_pair;

        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        is_pair = (src[0] >= 0xD800 && src[0] <= 0xDBFF
                   && (src + 1) < end
                   && src[1] >= 0xDC00 && src[1] <= 0xDFFF);
        if (!is_pair) {
            *dst++ = *src++;
            continue;
        }
        /* Combine the low 10 bits of each half of the pair. */
        *dst++ = (((src[0] & 0x3FF)<<10) | (src[1] & 0x3FF)) + 0x10000;
        src += 2;
    }
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
919
Victor Stinnercd9950f2011-10-02 00:34:53 +0200920static int
921_PyUnicode_Dirty(PyObject *unicode)
922{
Victor Stinner910337b2011-10-03 03:20:16 +0200923 assert(_PyUnicode_CHECK(unicode));
Victor Stinnercd9950f2011-10-02 00:34:53 +0200924 if (Py_REFCNT(unicode) != 1) {
Victor Stinner01698042011-10-04 00:04:26 +0200925 PyErr_SetString(PyExc_SystemError,
Victor Stinnercd9950f2011-10-02 00:34:53 +0200926 "Cannot modify a string having more than 1 reference");
927 return -1;
928 }
929 _PyUnicode_DIRTY(unicode);
930 return 0;
931}
932
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200933Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200934PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
935 PyObject *from, Py_ssize_t from_start,
936 Py_ssize_t how_many)
937{
Victor Stinnera0702ab2011-09-29 14:14:38 +0200938 unsigned int from_kind, to_kind;
939 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200940
Victor Stinnerb1536152011-09-30 02:26:10 +0200941 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
942 PyErr_BadInternalCall();
943 return -1;
944 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200945
946 if (PyUnicode_READY(from))
947 return -1;
948 if (PyUnicode_READY(to))
949 return -1;
950
Victor Stinnerff9e50f2011-09-28 22:17:19 +0200951 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200952 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
Victor Stinner01698042011-10-04 00:04:26 +0200953 PyErr_Format(PyExc_SystemError,
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200954 "Cannot write %zi characters at %zi "
955 "in a string of %zi characters",
956 how_many, to_start, PyUnicode_GET_LENGTH(to));
957 return -1;
958 }
Victor Stinnerf5ca1a22011-09-28 23:54:59 +0200959 if (how_many == 0)
960 return 0;
961
Victor Stinnercd9950f2011-10-02 00:34:53 +0200962 if (_PyUnicode_Dirty(to))
Victor Stinnerf5ca1a22011-09-28 23:54:59 +0200963 return -1;
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200964
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200965 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200966 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200967 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200968 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200969
Victor Stinnerf42dc442011-10-02 23:33:16 +0200970 if (from_kind == to_kind
971 /* deny latin1 => ascii */
Victor Stinnerb9275c12011-10-05 14:01:42 +0200972 && !(!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200973 {
Victor Stinnera0702ab2011-09-29 14:14:38 +0200974 Py_MEMCPY((char*)to_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200975 + PyUnicode_KIND_SIZE(to_kind, to_start),
Victor Stinnera0702ab2011-09-29 14:14:38 +0200976 (char*)from_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200977 + PyUnicode_KIND_SIZE(from_kind, from_start),
978 PyUnicode_KIND_SIZE(to_kind, how_many));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200979 }
Victor Stinnera0702ab2011-09-29 14:14:38 +0200980 else if (from_kind == PyUnicode_1BYTE_KIND
981 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200982 {
983 _PyUnicode_CONVERT_BYTES(
984 Py_UCS1, Py_UCS2,
985 PyUnicode_1BYTE_DATA(from) + from_start,
986 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
987 PyUnicode_2BYTE_DATA(to) + to_start
988 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200989 }
Victor Stinner157f83f2011-09-28 21:41:31 +0200990 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200991 && to_kind == PyUnicode_4BYTE_KIND)
992 {
993 _PyUnicode_CONVERT_BYTES(
994 Py_UCS1, Py_UCS4,
995 PyUnicode_1BYTE_DATA(from) + from_start,
996 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
997 PyUnicode_4BYTE_DATA(to) + to_start
998 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200999 }
1000 else if (from_kind == PyUnicode_2BYTE_KIND
1001 && to_kind == PyUnicode_4BYTE_KIND)
1002 {
1003 _PyUnicode_CONVERT_BYTES(
1004 Py_UCS2, Py_UCS4,
1005 PyUnicode_2BYTE_DATA(from) + from_start,
1006 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1007 PyUnicode_4BYTE_DATA(to) + to_start
1008 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001009 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001010 else {
1011 int invalid_kinds;
Victor Stinnerf42dc442011-10-02 23:33:16 +02001012
1013 /* check if max_char(from substring) <= max_char(to) */
1014 if (from_kind > to_kind
1015 /* latin1 => ascii */
Victor Stinnerb9275c12011-10-05 14:01:42 +02001016 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001017 {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001018 /* slow path to check for character overflow */
1019 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1020 Py_UCS4 ch, maxchar;
1021 Py_ssize_t i;
1022
1023 maxchar = 0;
1024 invalid_kinds = 0;
1025 for (i=0; i < how_many; i++) {
1026 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1027 if (ch > maxchar) {
1028 maxchar = ch;
1029 if (maxchar > to_maxchar) {
1030 invalid_kinds = 1;
1031 break;
1032 }
1033 }
1034 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1035 }
1036 }
1037 else
1038 invalid_kinds = 1;
1039 if (invalid_kinds) {
Victor Stinner01698042011-10-04 00:04:26 +02001040 PyErr_Format(PyExc_SystemError,
Victor Stinnerf42dc442011-10-02 23:33:16 +02001041 "Cannot copy %s characters "
1042 "into a string of %s characters",
1043 unicode_kind_name(from),
1044 unicode_kind_name(to));
Victor Stinnera0702ab2011-09-29 14:14:38 +02001045 return -1;
1046 }
1047 }
1048 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001049}
1050
Victor Stinner17222162011-09-28 22:15:37 +02001051/* Find the maximum code point and count the number of surrogate pairs so a
1052 correct string length can be computed before converting a string to UCS4.
1053 This function counts single surrogates as a character and not as a pair.
1054
1055 Return 0 on success, or -1 on error. */
1056static int
1057find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1058 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001059{
1060 const wchar_t *iter;
1061
Victor Stinnerc53be962011-10-02 21:33:54 +02001062 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001063 if (num_surrogates == NULL || maxchar == NULL) {
1064 PyErr_SetString(PyExc_SystemError,
1065 "unexpected NULL arguments to "
1066 "PyUnicode_FindMaxCharAndNumSurrogatePairs");
1067 return -1;
1068 }
1069
1070 *num_surrogates = 0;
1071 *maxchar = 0;
1072
1073 for (iter = begin; iter < end; ) {
1074 if (*iter > *maxchar)
1075 *maxchar = *iter;
1076#if SIZEOF_WCHAR_T == 2
1077 if (*iter >= 0xD800 && *iter <= 0xDBFF
1078 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
1079 {
1080 Py_UCS4 surrogate_val;
1081 surrogate_val = (((iter[0] & 0x3FF)<<10)
1082 | (iter[1] & 0x3FF)) + 0x10000;
1083 ++(*num_surrogates);
1084 if (surrogate_val > *maxchar)
1085 *maxchar = surrogate_val;
1086 iter += 2;
1087 }
1088 else
1089 iter++;
1090#else
1091 iter++;
1092#endif
1093 }
1094 return 0;
1095}
1096
1097#ifdef Py_DEBUG
1098int unicode_ready_calls = 0;
1099#endif
1100
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001101static int
1102unicode_ready(PyObject **p_obj, int replace)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001103{
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001104 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001105 wchar_t *end;
1106 Py_UCS4 maxchar = 0;
1107 Py_ssize_t num_surrogates;
1108#if SIZEOF_WCHAR_T == 2
1109 Py_ssize_t length_wo_surrogates;
1110#endif
1111
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001112 assert(p_obj != NULL);
1113 unicode = (PyUnicodeObject *)*p_obj;
1114
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001115 /* _PyUnicode_Ready() is only intented for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001116 strings were created using _PyObject_New() and where no canonical
1117 representation (the str field) has been set yet aka strings
1118 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001119 assert(_PyUnicode_CHECK(unicode));
1120 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001121 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001122 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001123 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001124 /* Actually, it should neither be interned nor be anything else: */
1125 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001126
1127#ifdef Py_DEBUG
1128 ++unicode_ready_calls;
1129#endif
1130
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001131#ifdef Py_DEBUG
1132 assert(!replace || Py_REFCNT(unicode) == 1);
1133#else
1134 if (replace && Py_REFCNT(unicode) != 1)
1135 replace = 0;
1136#endif
1137 if (replace) {
1138 Py_ssize_t len = _PyUnicode_WSTR_LENGTH(unicode);
1139 wchar_t *wstr = _PyUnicode_WSTR(unicode);
1140 /* Optimization for empty strings */
1141 if (len == 0) {
1142 Py_INCREF(unicode_empty);
1143 Py_DECREF(*p_obj);
1144 *p_obj = unicode_empty;
1145 return 0;
1146 }
1147 if (len == 1 && wstr[0] < 256) {
1148 PyObject *latin1_char = get_latin1_char((unsigned char)wstr[0]);
1149 if (latin1_char == NULL)
1150 return -1;
1151 Py_DECREF(*p_obj);
1152 *p_obj = latin1_char;
1153 return 0;
1154 }
1155 }
1156
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001157 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001158 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001159 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001160 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001161
1162 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001163 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1164 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001165 PyErr_NoMemory();
1166 return -1;
1167 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001168 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001169 _PyUnicode_WSTR(unicode), end,
1170 PyUnicode_1BYTE_DATA(unicode));
1171 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1172 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1173 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1174 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001175 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001176 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001177 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001178 }
1179 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001180 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001181 _PyUnicode_UTF8(unicode) = NULL;
1182 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001183 }
1184 PyObject_FREE(_PyUnicode_WSTR(unicode));
1185 _PyUnicode_WSTR(unicode) = NULL;
1186 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1187 }
1188 /* In this case we might have to convert down from 4-byte native
1189 wchar_t to 2-byte unicode. */
1190 else if (maxchar < 65536) {
1191 assert(num_surrogates == 0 &&
1192 "FindMaxCharAndNumSurrogatePairs() messed up");
1193
Victor Stinner506f5922011-09-28 22:34:18 +02001194#if SIZEOF_WCHAR_T == 2
1195 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001196 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001197 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1198 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1199 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001200 _PyUnicode_UTF8(unicode) = NULL;
1201 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001202#else
1203 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001204 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001205 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001206 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001207 PyErr_NoMemory();
1208 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001209 }
Victor Stinner506f5922011-09-28 22:34:18 +02001210 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1211 _PyUnicode_WSTR(unicode), end,
1212 PyUnicode_2BYTE_DATA(unicode));
1213 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1214 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1215 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001216 _PyUnicode_UTF8(unicode) = NULL;
1217 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001218 PyObject_FREE(_PyUnicode_WSTR(unicode));
1219 _PyUnicode_WSTR(unicode) = NULL;
1220 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1221#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001222 }
1223 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1224 else {
1225#if SIZEOF_WCHAR_T == 2
1226 /* in case the native representation is 2-bytes, we need to allocate a
1227 new normalized 4-byte version. */
1228 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001229 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1230 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001231 PyErr_NoMemory();
1232 return -1;
1233 }
1234 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1235 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001236 _PyUnicode_UTF8(unicode) = NULL;
1237 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001238 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1239 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001240 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001241 PyObject_FREE(_PyUnicode_WSTR(unicode));
1242 _PyUnicode_WSTR(unicode) = NULL;
1243 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1244#else
1245 assert(num_surrogates == 0);
1246
Victor Stinnerc3c74152011-10-02 20:39:55 +02001247 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001248 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001249 _PyUnicode_UTF8(unicode) = NULL;
1250 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001251 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1252#endif
1253 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1254 }
1255 _PyUnicode_STATE(unicode).ready = 1;
1256 return 0;
1257}
1258
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001259int
1260_PyUnicode_ReadyReplace(PyObject **op)
1261{
1262 return unicode_ready(op, 1);
1263}
1264
1265int
1266_PyUnicode_Ready(PyObject *op)
1267{
1268 return unicode_ready(&op, 0);
1269}
1270
Alexander Belopolsky40018472011-02-26 01:02:56 +00001271static void
1272unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001273{
Walter Dörwald16807132007-05-25 13:52:07 +00001274 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001275 case SSTATE_NOT_INTERNED:
1276 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001277
Benjamin Peterson29060642009-01-31 22:14:21 +00001278 case SSTATE_INTERNED_MORTAL:
1279 /* revive dead object temporarily for DelItem */
1280 Py_REFCNT(unicode) = 3;
1281 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
1282 Py_FatalError(
1283 "deletion of interned string failed");
1284 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001285
Benjamin Peterson29060642009-01-31 22:14:21 +00001286 case SSTATE_INTERNED_IMMORTAL:
1287 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001288
Benjamin Peterson29060642009-01-31 22:14:21 +00001289 default:
1290 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001291 }
1292
Victor Stinner03490912011-10-03 23:45:12 +02001293 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001294 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001295 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001296 PyObject_DEL(_PyUnicode_UTF8(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001297
1298 if (PyUnicode_IS_COMPACT(unicode)) {
1299 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001300 }
1301 else {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001302 if (_PyUnicode_DATA_ANY(unicode))
1303 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Benjamin Peterson29060642009-01-31 22:14:21 +00001304 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001305 }
1306}
1307
Alexander Belopolsky40018472011-02-26 01:02:56 +00001308static int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001309unicode_resizable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001310{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001311 if (Py_REFCNT(unicode) != 1)
1312 return 0;
1313 if (PyUnicode_CHECK_INTERNED(unicode))
1314 return 0;
Benjamin Peterson7f3140e2011-10-03 19:37:29 -04001315 assert(unicode != unicode_empty);
Victor Stinner77bb47b2011-10-03 20:06:05 +02001316#ifdef Py_DEBUG
1317 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND
1318 && PyUnicode_GET_LENGTH(unicode) == 1)
1319 {
1320 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001321 if (ch < 256 && unicode_latin1[ch] == unicode)
1322 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001323 }
Victor Stinner77bb47b2011-10-03 20:06:05 +02001324#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001325 return 1;
1326}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001327
Victor Stinnerfe226c02011-10-03 03:52:20 +02001328static int
1329unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1330{
1331 PyObject *unicode;
1332 Py_ssize_t old_length;
1333
1334 assert(p_unicode != NULL);
1335 unicode = *p_unicode;
1336
1337 assert(unicode != NULL);
1338 assert(PyUnicode_Check(unicode));
1339 assert(0 <= length);
1340
Victor Stinner910337b2011-10-03 03:20:16 +02001341 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001342 old_length = PyUnicode_WSTR_LENGTH(unicode);
1343 else
1344 old_length = PyUnicode_GET_LENGTH(unicode);
1345 if (old_length == length)
1346 return 0;
1347
Victor Stinnerfe226c02011-10-03 03:52:20 +02001348 if (!unicode_resizable(unicode)) {
1349 PyObject *copy = resize_copy(unicode, length);
1350 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001351 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001352 Py_DECREF(*p_unicode);
1353 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001354 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001355 }
1356
Victor Stinnerfe226c02011-10-03 03:52:20 +02001357 if (PyUnicode_IS_COMPACT(unicode)) {
1358 *p_unicode = resize_compact(unicode, length);
1359 if (*p_unicode == NULL)
1360 return -1;
Benjamin Petersonccc51c12011-10-03 19:34:12 -04001361 _PyUnicode_CheckConsistency(*p_unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001362 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001363 }
1364 return resize_inplace((PyUnicodeObject*)unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001365}
1366
Alexander Belopolsky40018472011-02-26 01:02:56 +00001367int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001368PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001369{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001370 PyObject *unicode;
1371 if (p_unicode == NULL) {
1372 PyErr_BadInternalCall();
1373 return -1;
1374 }
1375 unicode = *p_unicode;
1376 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0
1377 || _PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND)
1378 {
1379 PyErr_BadInternalCall();
1380 return -1;
1381 }
1382 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001383}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001384
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001385static PyObject*
1386get_latin1_char(unsigned char ch)
1387{
Victor Stinnera464fc12011-10-02 20:39:30 +02001388 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001389 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001390 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001391 if (!unicode)
1392 return NULL;
1393 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
1394 unicode_latin1[ch] = unicode;
1395 }
1396 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001397 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001398}
1399
Alexander Belopolsky40018472011-02-26 01:02:56 +00001400PyObject *
1401PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001402{
1403 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001404 Py_UCS4 maxchar = 0;
1405 Py_ssize_t num_surrogates;
1406
1407 if (u == NULL)
1408 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001409
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001410 /* If the Unicode data is known at construction time, we can apply
1411 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001412
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001413 /* Optimization for empty strings */
1414 if (size == 0 && unicode_empty != NULL) {
1415 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001416 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001417 }
Tim Petersced69f82003-09-16 20:30:58 +00001418
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001419 /* Single character Unicode objects in the Latin-1 range are
1420 shared when using this constructor */
1421 if (size == 1 && *u < 256)
1422 return get_latin1_char((unsigned char)*u);
1423
1424 /* If not empty and not single character, copy the Unicode data
1425 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001426 if (find_maxchar_surrogates(u, u + size,
1427 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001428 return NULL;
1429
1430 unicode = (PyUnicodeObject *) PyUnicode_New(size - num_surrogates,
1431 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001432 if (!unicode)
1433 return NULL;
1434
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001435 switch (PyUnicode_KIND(unicode)) {
1436 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001437 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001438 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1439 break;
1440 case PyUnicode_2BYTE_KIND:
1441#if Py_UNICODE_SIZE == 2
1442 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1443#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001444 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001445 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1446#endif
1447 break;
1448 case PyUnicode_4BYTE_KIND:
1449#if SIZEOF_WCHAR_T == 2
1450 /* This is the only case which has to process surrogates, thus
1451 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001452 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001453#else
1454 assert(num_surrogates == 0);
1455 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1456#endif
1457 break;
1458 default:
1459 assert(0 && "Impossible state");
1460 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001461
1462 return (PyObject *)unicode;
1463}
1464
Alexander Belopolsky40018472011-02-26 01:02:56 +00001465PyObject *
1466PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001467{
1468 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +00001469
Benjamin Peterson14339b62009-01-31 16:36:08 +00001470 if (size < 0) {
1471 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001472 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001473 return NULL;
1474 }
Christian Heimes33fe8092008-04-13 13:53:33 +00001475
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001476 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +00001477 some optimizations which share commonly used objects.
1478 Also, this means the input must be UTF-8, so fall back to the
1479 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001480 if (u != NULL) {
1481
Benjamin Peterson29060642009-01-31 22:14:21 +00001482 /* Optimization for empty strings */
1483 if (size == 0 && unicode_empty != NULL) {
1484 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001485 return unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001486 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001487
1488 /* Single characters are shared when using this constructor.
1489 Restrict to ASCII, since the input must be UTF-8. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001490 if (size == 1 && Py_CHARMASK(*u) < 128)
1491 return get_latin1_char(Py_CHARMASK(*u));
Martin v. Löwis9c121062007-08-05 20:26:11 +00001492
1493 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001494 }
1495
Walter Dörwald55507312007-05-18 13:12:10 +00001496 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001497 if (!unicode)
1498 return NULL;
1499
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001500 return (PyObject *)unicode;
1501}
1502
Alexander Belopolsky40018472011-02-26 01:02:56 +00001503PyObject *
1504PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001505{
1506 size_t size = strlen(u);
1507 if (size > PY_SSIZE_T_MAX) {
1508 PyErr_SetString(PyExc_OverflowError, "input too long");
1509 return NULL;
1510 }
1511
1512 return PyUnicode_FromStringAndSize(u, size);
1513}
1514
Victor Stinnere57b1c02011-09-28 22:20:48 +02001515static PyObject*
Victor Stinner702c7342011-10-05 13:50:52 +02001516unicode_fromascii(const unsigned char* u, Py_ssize_t size)
1517{
1518 PyObject *res = PyUnicode_New(size, 127);
1519 if (!res)
1520 return NULL;
1521 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
1522 return res;
1523}
1524
1525static PyObject*
Victor Stinnere57b1c02011-09-28 22:20:48 +02001526_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001527{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001528 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001529 unsigned char max_char = 127;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001530 Py_ssize_t i;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001531
1532 assert(size >= 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001533 for (i = 0; i < size; i++) {
1534 if (u[i] & 0x80) {
Victor Stinnerb9275c12011-10-05 14:01:42 +02001535 max_char = 255;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001536 break;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001537 }
1538 }
Victor Stinnerb9275c12011-10-05 14:01:42 +02001539 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001540 if (!res)
1541 return NULL;
1542 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
1543 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001544}
1545
Victor Stinnere57b1c02011-09-28 22:20:48 +02001546static PyObject*
1547_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001548{
1549 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001550 Py_UCS2 max_char = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001551 Py_ssize_t i;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001552
1553 assert(size >= 0);
1554 for (i = 0; i < size; i++) {
1555 if (u[i] > max_char) {
1556 max_char = u[i];
1557 if (max_char >= 256)
1558 break;
1559 }
1560 }
1561 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001562 if (!res)
1563 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001564 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001565 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
1566 else
1567 for (i = 0; i < size; i++)
1568 PyUnicode_1BYTE_DATA(res)[i] = (Py_UCS1)u[i];
1569 return res;
1570}
1571
Victor Stinnere57b1c02011-09-28 22:20:48 +02001572static PyObject*
1573_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001574{
1575 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001576 Py_UCS4 max_char = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001577 Py_ssize_t i;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001578
1579 assert(size >= 0);
1580 for (i = 0; i < size; i++) {
1581 if (u[i] > max_char) {
1582 max_char = u[i];
1583 if (max_char >= 0x10000)
1584 break;
1585 }
1586 }
1587 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001588 if (!res)
1589 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001590 if (max_char >= 0x10000)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001591 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
1592 else {
1593 int kind = PyUnicode_KIND(res);
1594 void *data = PyUnicode_DATA(res);
1595 for (i = 0; i < size; i++)
1596 PyUnicode_WRITE(kind, data, i, u[i]);
1597 }
1598 return res;
1599}
1600
1601PyObject*
1602PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1603{
1604 switch(kind) {
1605 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001606 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001607 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001608 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001609 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001610 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001611 default:
1612 assert(0 && "invalid kind");
1613 PyErr_SetString(PyExc_SystemError, "invalid kind");
1614 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001615 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001616}
1617
Victor Stinner034f6cf2011-09-30 02:26:44 +02001618PyObject*
1619PyUnicode_Copy(PyObject *unicode)
1620{
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001621 Py_ssize_t size;
1622 PyObject *copy;
1623 void *data;
1624
Victor Stinner034f6cf2011-09-30 02:26:44 +02001625 if (!PyUnicode_Check(unicode)) {
1626 PyErr_BadInternalCall();
1627 return NULL;
1628 }
1629 if (PyUnicode_READY(unicode))
1630 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001631
1632 size = PyUnicode_GET_LENGTH(unicode);
1633 copy = PyUnicode_New(size, PyUnicode_MAX_CHAR_VALUE(unicode));
1634 if (!copy)
1635 return NULL;
1636 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1637
1638 data = PyUnicode_DATA(unicode);
1639 switch (PyUnicode_KIND(unicode))
1640 {
1641 case PyUnicode_1BYTE_KIND:
1642 memcpy(PyUnicode_1BYTE_DATA(copy), data, size);
1643 break;
1644 case PyUnicode_2BYTE_KIND:
1645 memcpy(PyUnicode_2BYTE_DATA(copy), data, sizeof(Py_UCS2) * size);
1646 break;
1647 case PyUnicode_4BYTE_KIND:
1648 memcpy(PyUnicode_4BYTE_DATA(copy), data, sizeof(Py_UCS4) * size);
1649 break;
1650 default:
1651 assert(0);
1652 break;
1653 }
1654 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02001655}
1656
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001657
Victor Stinnerbc603d12011-10-02 01:00:40 +02001658/* Widen Unicode objects to larger buffers. Don't write terminating null
1659 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001660
1661void*
1662_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1663{
Victor Stinnerbc603d12011-10-02 01:00:40 +02001664 Py_ssize_t len;
1665 void *result;
1666 unsigned int skind;
1667
1668 if (PyUnicode_READY(s))
1669 return NULL;
1670
1671 len = PyUnicode_GET_LENGTH(s);
1672 skind = PyUnicode_KIND(s);
1673 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02001674 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001675 return NULL;
1676 }
1677 switch(kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02001678 case PyUnicode_2BYTE_KIND:
1679 result = PyMem_Malloc(len * sizeof(Py_UCS2));
1680 if (!result)
1681 return PyErr_NoMemory();
1682 assert(skind == PyUnicode_1BYTE_KIND);
1683 _PyUnicode_CONVERT_BYTES(
1684 Py_UCS1, Py_UCS2,
1685 PyUnicode_1BYTE_DATA(s),
1686 PyUnicode_1BYTE_DATA(s) + len,
1687 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001688 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001689 case PyUnicode_4BYTE_KIND:
1690 result = PyMem_Malloc(len * sizeof(Py_UCS4));
1691 if (!result)
1692 return PyErr_NoMemory();
1693 if (skind == PyUnicode_2BYTE_KIND) {
1694 _PyUnicode_CONVERT_BYTES(
1695 Py_UCS2, Py_UCS4,
1696 PyUnicode_2BYTE_DATA(s),
1697 PyUnicode_2BYTE_DATA(s) + len,
1698 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001699 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02001700 else {
1701 assert(skind == PyUnicode_1BYTE_KIND);
1702 _PyUnicode_CONVERT_BYTES(
1703 Py_UCS1, Py_UCS4,
1704 PyUnicode_1BYTE_DATA(s),
1705 PyUnicode_1BYTE_DATA(s) + len,
1706 result);
1707 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001708 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001709 default:
1710 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001711 }
Victor Stinner01698042011-10-04 00:04:26 +02001712 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001713 return NULL;
1714}
1715
1716static Py_UCS4*
1717as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1718 int copy_null)
1719{
1720 int kind;
1721 void *data;
1722 Py_ssize_t len, targetlen;
1723 if (PyUnicode_READY(string) == -1)
1724 return NULL;
1725 kind = PyUnicode_KIND(string);
1726 data = PyUnicode_DATA(string);
1727 len = PyUnicode_GET_LENGTH(string);
1728 targetlen = len;
1729 if (copy_null)
1730 targetlen++;
1731 if (!target) {
1732 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
1733 PyErr_NoMemory();
1734 return NULL;
1735 }
1736 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
1737 if (!target) {
1738 PyErr_NoMemory();
1739 return NULL;
1740 }
1741 }
1742 else {
1743 if (targetsize < targetlen) {
1744 PyErr_Format(PyExc_SystemError,
1745 "string is longer than the buffer");
1746 if (copy_null && 0 < targetsize)
1747 target[0] = 0;
1748 return NULL;
1749 }
1750 }
1751 if (kind != PyUnicode_4BYTE_KIND) {
1752 Py_ssize_t i;
1753 for (i = 0; i < len; i++)
1754 target[i] = PyUnicode_READ(kind, data, i);
1755 }
1756 else
1757 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
1758 if (copy_null)
1759 target[len] = 0;
1760 return target;
1761}
1762
1763Py_UCS4*
1764PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1765 int copy_null)
1766{
1767 if (target == NULL || targetsize < 1) {
1768 PyErr_BadInternalCall();
1769 return NULL;
1770 }
1771 return as_ucs4(string, target, targetsize, copy_null);
1772}
1773
1774Py_UCS4*
1775PyUnicode_AsUCS4Copy(PyObject *string)
1776{
1777 return as_ucs4(string, NULL, 0, 1);
1778}
1779
#ifdef HAVE_WCHAR_H

/* Create a Unicode object from the wide-character buffer `w`.
   A size of -1 means "use wcslen(w)".  A NULL buffer is accepted only
   together with size 0, in which case PyUnicode_New(0, 0) is returned;
   any other NULL call is a bad internal call. */
PyObject *
PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
{
    if (w != NULL) {
        const Py_ssize_t length =
            (size == -1) ? (Py_ssize_t)wcslen(w) : size;
        return PyUnicode_FromUnicode(w, length);
    }

    if (size == 0)
        return PyUnicode_New(0, 0);

    PyErr_BadInternalCall();
    return NULL;
}

#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00001800
Walter Dörwald346737f2007-05-31 10:44:43 +00001801static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001802makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
1803 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00001804{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001805 *fmt++ = '%';
1806 if (width) {
1807 if (zeropad)
1808 *fmt++ = '0';
1809 fmt += sprintf(fmt, "%d", width);
1810 }
1811 if (precision)
1812 fmt += sprintf(fmt, ".%d", precision);
1813 if (longflag)
1814 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001815 else if (longlongflag) {
1816 /* longlongflag should only ever be nonzero on machines with
1817 HAVE_LONG_LONG defined */
1818#ifdef HAVE_LONG_LONG
1819 char *f = PY_FORMAT_LONG_LONG;
1820 while (*f)
1821 *fmt++ = *f++;
1822#else
1823 /* we shouldn't ever get here */
1824 assert(0);
1825 *fmt++ = 'l';
1826#endif
1827 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001828 else if (size_tflag) {
1829 char *f = PY_FORMAT_SIZE_T;
1830 while (*f)
1831 *fmt++ = *f++;
1832 }
1833 *fmt++ = c;
1834 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00001835}
1836
/* helper for parse_format_flags(): true for the integer conversion
   characters that may follow an 'l', 'll' or 'z' length modifier */
static int
format_is_int_conversion(char ch)
{
    return ch == 'd' || ch == 'u' || ch == 'i';
}

/* helper for PyUnicode_FromFormatV()

   `f` points at the '%' that starts a format specification.  The
   optional width, ".precision" and length modifier ("l", "ll" or "z",
   recognised only in front of d/u/i) are parsed and reported through the
   output pointers, each of which may be NULL.  A missing width or
   precision is reported as 0.

   Returns a pointer to the character following the parsed part, normally
   the conversion character.  For the malformed inputs handled below the
   pointer is moved back by one character instead. */

static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int width = 0;
    int precision = 0;
    int longflag = 0;
    int longlongflag = 0;
    int size_tflag = 0;

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    f++;                                    /* step over the '%' */
    for (; Py_ISDIGIT((unsigned)*f); f++)
        width = (width*10) + *f - '0';
    if (*f == '.') {
        f++;
        for (; Py_ISDIGIT((unsigned)*f); f++)
            precision = (precision*10) + *f - '0';
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        f--;
    }
    if (p_width != NULL)
        *p_width = width;
    if (p_precision != NULL)
        *p_precision = precision;

    /* Handle %ld, %lu, %lld and %llu (and the 'i' variants), then the
       size_t flag %zd, %zu, %zi. */
    switch (*f) {
    case 'l':
        if (format_is_int_conversion(f[1])) {
            longflag = 1;
            ++f;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' && format_is_int_conversion(f[2])) {
            longlongflag = 1;
            f += 2;
        }
#endif
        break;
    case 'z':
        if (format_is_int_conversion(f[1])) {
            size_tflag = 1;
            ++f;
        }
        break;
    default:
        break;
    }

    if (p_longflag != NULL)
        *p_longflag = longflag;
    if (p_longlongflag != NULL)
        *p_longlongflag = longlongflag;
    if (p_size_tflag != NULL)
        *p_size_tflag = size_tflag;
    return f;
}
1901
/* Maximum number of characters required for the output of %ld.  21
   characters allow for a 64-bit integer in decimal (up to 20 digits) plus
   an optional sign. */
#define MAX_LONG_CHARS 21
/* Maximum number of characters required for the output of %lld.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits, plus 1 for
   the sign.  53/22 is an upper bound for log10(256).  With integer
   division, (SIZEOF_LONG_LONG*53-1)/22 + 1 is SIZEOF_LONG_LONG*53/22
   rounded up; the remaining 1 of the leading 2 is the sign. */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
1909
Walter Dörwaldd2034312007-05-18 16:29:38 +00001910PyObject *
1911PyUnicode_FromFormatV(const char *format, va_list vargs)
1912{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001913 va_list count;
1914 Py_ssize_t callcount = 0;
1915 PyObject **callresults = NULL;
1916 PyObject **callresult = NULL;
1917 Py_ssize_t n = 0;
1918 int width = 0;
1919 int precision = 0;
1920 int zeropad;
1921 const char* f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001922 PyUnicodeObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001923 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001924 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001925 Py_UCS4 maxchar = 127; /* result is ASCII by default */
1926 Py_UCS4 argmaxchar;
1927 Py_ssize_t numbersize = 0;
1928 char *numberresults = NULL;
1929 char *numberresult = NULL;
1930 Py_ssize_t i;
1931 int kind;
1932 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001933
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001934 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001935 /* step 1: count the number of %S/%R/%A/%s format specifications
1936 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
1937 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001938 * result in an array)
1939 * also esimate a upper bound for all the number formats in the string,
1940 * numbers will be formated in step 3 and be keept in a '\0'-separated
1941 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00001942 for (f = format; *f; f++) {
1943 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00001944 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001945 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
1946 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
1947 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
1948 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001949
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001950 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001951#ifdef HAVE_LONG_LONG
1952 if (longlongflag) {
1953 if (width < MAX_LONG_LONG_CHARS)
1954 width = MAX_LONG_LONG_CHARS;
1955 }
1956 else
1957#endif
1958 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
1959 including sign. Decimal takes the most space. This
1960 isn't enough for octal. If a width is specified we
1961 need more (which we allocate later). */
1962 if (width < MAX_LONG_CHARS)
1963 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001964
1965 /* account for the size + '\0' to separate numbers
1966 inside of the numberresults buffer */
1967 numbersize += (width + 1);
1968 }
1969 }
1970 else if ((unsigned char)*f > 127) {
1971 PyErr_Format(PyExc_ValueError,
1972 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
1973 "string, got a non-ASCII byte: 0x%02x",
1974 (unsigned char)*f);
1975 return NULL;
1976 }
1977 }
1978 /* step 2: allocate memory for the results of
1979 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
1980 if (callcount) {
1981 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
1982 if (!callresults) {
1983 PyErr_NoMemory();
1984 return NULL;
1985 }
1986 callresult = callresults;
1987 }
1988 /* step 2.5: allocate memory for the results of formating numbers */
1989 if (numbersize) {
1990 numberresults = PyObject_Malloc(numbersize);
1991 if (!numberresults) {
1992 PyErr_NoMemory();
1993 goto fail;
1994 }
1995 numberresult = numberresults;
1996 }
1997
1998 /* step 3: format numbers and figure out how large a buffer we need */
1999 for (f = format; *f; f++) {
2000 if (*f == '%') {
2001 const char* p;
2002 int longflag;
2003 int longlongflag;
2004 int size_tflag;
2005 int numprinted;
2006
2007 p = f;
2008 zeropad = (f[1] == '0');
2009 f = parse_format_flags(f, &width, &precision,
2010 &longflag, &longlongflag, &size_tflag);
2011 switch (*f) {
2012 case 'c':
2013 {
2014 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002015 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002016 n++;
2017 break;
2018 }
2019 case '%':
2020 n++;
2021 break;
2022 case 'i':
2023 case 'd':
2024 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2025 width, precision, *f);
2026 if (longflag)
2027 numprinted = sprintf(numberresult, fmt,
2028 va_arg(count, long));
2029#ifdef HAVE_LONG_LONG
2030 else if (longlongflag)
2031 numprinted = sprintf(numberresult, fmt,
2032 va_arg(count, PY_LONG_LONG));
2033#endif
2034 else if (size_tflag)
2035 numprinted = sprintf(numberresult, fmt,
2036 va_arg(count, Py_ssize_t));
2037 else
2038 numprinted = sprintf(numberresult, fmt,
2039 va_arg(count, int));
2040 n += numprinted;
2041 /* advance by +1 to skip over the '\0' */
2042 numberresult += (numprinted + 1);
2043 assert(*(numberresult - 1) == '\0');
2044 assert(*(numberresult - 2) != '\0');
2045 assert(numprinted >= 0);
2046 assert(numberresult <= numberresults + numbersize);
2047 break;
2048 case 'u':
2049 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2050 width, precision, 'u');
2051 if (longflag)
2052 numprinted = sprintf(numberresult, fmt,
2053 va_arg(count, unsigned long));
2054#ifdef HAVE_LONG_LONG
2055 else if (longlongflag)
2056 numprinted = sprintf(numberresult, fmt,
2057 va_arg(count, unsigned PY_LONG_LONG));
2058#endif
2059 else if (size_tflag)
2060 numprinted = sprintf(numberresult, fmt,
2061 va_arg(count, size_t));
2062 else
2063 numprinted = sprintf(numberresult, fmt,
2064 va_arg(count, unsigned int));
2065 n += numprinted;
2066 numberresult += (numprinted + 1);
2067 assert(*(numberresult - 1) == '\0');
2068 assert(*(numberresult - 2) != '\0');
2069 assert(numprinted >= 0);
2070 assert(numberresult <= numberresults + numbersize);
2071 break;
2072 case 'x':
2073 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2074 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2075 n += numprinted;
2076 numberresult += (numprinted + 1);
2077 assert(*(numberresult - 1) == '\0');
2078 assert(*(numberresult - 2) != '\0');
2079 assert(numprinted >= 0);
2080 assert(numberresult <= numberresults + numbersize);
2081 break;
2082 case 'p':
2083 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2084 /* %p is ill-defined: ensure leading 0x. */
2085 if (numberresult[1] == 'X')
2086 numberresult[1] = 'x';
2087 else if (numberresult[1] != 'x') {
2088 memmove(numberresult + 2, numberresult,
2089 strlen(numberresult) + 1);
2090 numberresult[0] = '0';
2091 numberresult[1] = 'x';
2092 numprinted += 2;
2093 }
2094 n += numprinted;
2095 numberresult += (numprinted + 1);
2096 assert(*(numberresult - 1) == '\0');
2097 assert(*(numberresult - 2) != '\0');
2098 assert(numprinted >= 0);
2099 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002100 break;
2101 case 's':
2102 {
2103 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002104 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002105 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
2106 if (!str)
2107 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002108 /* since PyUnicode_DecodeUTF8 returns already flexible
2109 unicode objects, there is no need to call ready on them */
2110 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002111 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002112 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002113 /* Remember the str and switch to the next slot */
2114 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002115 break;
2116 }
2117 case 'U':
2118 {
2119 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002120 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002121 if (PyUnicode_READY(obj) == -1)
2122 goto fail;
2123 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002124 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002125 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002126 break;
2127 }
2128 case 'V':
2129 {
2130 PyObject *obj = va_arg(count, PyObject *);
2131 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002132 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002133 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002134 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002135 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002136 if (PyUnicode_READY(obj) == -1)
2137 goto fail;
2138 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002139 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002140 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002141 *callresult++ = NULL;
2142 }
2143 else {
2144 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
2145 if (!str_obj)
2146 goto fail;
Victor Stinnere1335c72011-10-04 20:53:03 +02002147 if (PyUnicode_READY(str_obj)) {
2148 Py_DECREF(str_obj);
2149 goto fail;
2150 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002151 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002152 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002153 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002154 *callresult++ = str_obj;
2155 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002156 break;
2157 }
2158 case 'S':
2159 {
2160 PyObject *obj = va_arg(count, PyObject *);
2161 PyObject *str;
2162 assert(obj);
2163 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002164 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002165 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002166 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002167 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002168 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002169 /* Remember the str and switch to the next slot */
2170 *callresult++ = str;
2171 break;
2172 }
2173 case 'R':
2174 {
2175 PyObject *obj = va_arg(count, PyObject *);
2176 PyObject *repr;
2177 assert(obj);
2178 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002179 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002180 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002181 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002182 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002183 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002184 /* Remember the repr and switch to the next slot */
2185 *callresult++ = repr;
2186 break;
2187 }
2188 case 'A':
2189 {
2190 PyObject *obj = va_arg(count, PyObject *);
2191 PyObject *ascii;
2192 assert(obj);
2193 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002194 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002195 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002196 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002197 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002198 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002199 /* Remember the repr and switch to the next slot */
2200 *callresult++ = ascii;
2201 break;
2202 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002203 default:
2204 /* if we stumble upon an unknown
2205 formatting code, copy the rest of
2206 the format string to the output
2207 string. (we cannot just skip the
2208 code, since there's no way to know
2209 what's in the argument list) */
2210 n += strlen(p);
2211 goto expand;
2212 }
2213 } else
2214 n++;
2215 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002216 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002217 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002218 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002219 we don't have to resize the string.
2220 There can be no errors beyond this point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002221 string = (PyUnicodeObject *)PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002222 if (!string)
2223 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002224 kind = PyUnicode_KIND(string);
2225 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002226 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002227 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002228
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002229 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002230 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002231 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002232
2233 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002234 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2235 /* checking for == because the last argument could be a empty
2236 string, which causes i to point to end, the assert at the end of
2237 the loop */
2238 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002239
Benjamin Peterson14339b62009-01-31 16:36:08 +00002240 switch (*f) {
2241 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002242 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002243 const int ordinal = va_arg(vargs, int);
2244 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002245 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002246 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002247 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002248 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002249 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002250 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002251 case 'p':
2252 /* unused, since we already have the result */
2253 if (*f == 'p')
2254 (void) va_arg(vargs, void *);
2255 else
2256 (void) va_arg(vargs, int);
2257 /* extract the result from numberresults and append. */
2258 for (; *numberresult; ++i, ++numberresult)
2259 PyUnicode_WRITE(kind, data, i, *numberresult);
2260 /* skip over the separating '\0' */
2261 assert(*numberresult == '\0');
2262 numberresult++;
2263 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002264 break;
2265 case 's':
2266 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002267 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002268 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002269 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002270 size = PyUnicode_GET_LENGTH(*callresult);
2271 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002272 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2273 *callresult, 0,
2274 size) < 0)
2275 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002276 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002277 /* We're done with the unicode()/repr() => forget it */
2278 Py_DECREF(*callresult);
2279 /* switch to next unicode()/repr() result */
2280 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002281 break;
2282 }
2283 case 'U':
2284 {
2285 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002286 Py_ssize_t size;
2287 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2288 size = PyUnicode_GET_LENGTH(obj);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002289 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2290 obj, 0,
2291 size) < 0)
2292 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002293 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002294 break;
2295 }
2296 case 'V':
2297 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002298 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002299 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002300 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002301 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002302 size = PyUnicode_GET_LENGTH(obj);
2303 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002304 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2305 obj, 0,
2306 size) < 0)
2307 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002308 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002309 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002310 size = PyUnicode_GET_LENGTH(*callresult);
2311 assert(PyUnicode_KIND(*callresult) <=
2312 PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002313 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2314 *callresult,
2315 0, size) < 0)
2316 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002317 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002318 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002319 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002320 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002321 break;
2322 }
2323 case 'S':
2324 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002325 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002326 {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002327 /* unused, since we already have the result */
2328 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002329 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002330 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2331 *callresult, 0,
2332 PyUnicode_GET_LENGTH(*callresult)) < 0)
2333 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002334 i += PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002335 /* We're done with the unicode()/repr() => forget it */
2336 Py_DECREF(*callresult);
2337 /* switch to next unicode()/repr() result */
2338 ++callresult;
2339 break;
2340 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002341 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002342 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002343 break;
2344 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002345 for (; *p; ++p, ++i)
2346 PyUnicode_WRITE(kind, data, i, *p);
2347 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002348 goto end;
2349 }
Victor Stinner1205f272010-09-11 00:54:47 +00002350 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002351 else {
2352 assert(i < PyUnicode_GET_LENGTH(string));
2353 PyUnicode_WRITE(kind, data, i++, *f);
2354 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002355 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002356 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002357
Benjamin Peterson29060642009-01-31 22:14:21 +00002358 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002359 if (callresults)
2360 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002361 if (numberresults)
2362 PyObject_Free(numberresults);
2363 return (PyObject *)string;
Benjamin Peterson29060642009-01-31 22:14:21 +00002364 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002365 if (callresults) {
2366 PyObject **callresult2 = callresults;
2367 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002368 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002369 ++callresult2;
2370 }
2371 PyObject_Free(callresults);
2372 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002373 if (numberresults)
2374 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002375 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002376}
2377
Walter Dörwaldd2034312007-05-18 16:29:38 +00002378PyObject *
2379PyUnicode_FromFormat(const char *format, ...)
2380{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002381 PyObject* ret;
2382 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002383
2384#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002385 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002386#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002387 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002388#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002389 ret = PyUnicode_FromFormatV(format, vargs);
2390 va_end(vargs);
2391 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002392}
2393
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002394#ifdef HAVE_WCHAR_H
2395
Victor Stinner5593d8a2010-10-02 11:11:27 +00002396/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2397 convert a Unicode object to a wide character string.
2398
Victor Stinnerd88d9832011-09-06 02:00:05 +02002399 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002400 character) required to convert the unicode object. Ignore size argument.
2401
Victor Stinnerd88d9832011-09-06 02:00:05 +02002402 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002403 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002404 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002405static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00002406unicode_aswidechar(PyUnicodeObject *unicode,
2407 wchar_t *w,
2408 Py_ssize_t size)
2409{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002410 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002411 const wchar_t *wstr;
2412
2413 wstr = PyUnicode_AsUnicodeAndSize((PyObject *)unicode, &res);
2414 if (wstr == NULL)
2415 return -1;
2416
Victor Stinner5593d8a2010-10-02 11:11:27 +00002417 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002418 if (size > res)
2419 size = res + 1;
2420 else
2421 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002422 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002423 return res;
2424 }
2425 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002426 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002427}
2428
2429Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002430PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002431 wchar_t *w,
2432 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002433{
2434 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002435 PyErr_BadInternalCall();
2436 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002437 }
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002438 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002439}
2440
Victor Stinner137c34c2010-09-29 10:25:54 +00002441wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002442PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002443 Py_ssize_t *size)
2444{
2445 wchar_t* buffer;
2446 Py_ssize_t buflen;
2447
2448 if (unicode == NULL) {
2449 PyErr_BadInternalCall();
2450 return NULL;
2451 }
2452
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002453 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002454 if (buflen == -1)
2455 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002456 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002457 PyErr_NoMemory();
2458 return NULL;
2459 }
2460
Victor Stinner137c34c2010-09-29 10:25:54 +00002461 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2462 if (buffer == NULL) {
2463 PyErr_NoMemory();
2464 return NULL;
2465 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002466 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002467 if (buflen == -1)
2468 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002469 if (size != NULL)
2470 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002471 return buffer;
2472}
2473
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002474#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002475
Alexander Belopolsky40018472011-02-26 01:02:56 +00002476PyObject *
2477PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002478{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002479 PyObject *v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002480 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002481 PyErr_SetString(PyExc_ValueError,
2482 "chr() arg not in range(0x110000)");
2483 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002484 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002485
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002486 if (ordinal < 256)
2487 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002488
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002489 v = PyUnicode_New(1, ordinal);
2490 if (v == NULL)
2491 return NULL;
2492 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
2493 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002494}
2495
Alexander Belopolsky40018472011-02-26 01:02:56 +00002496PyObject *
2497PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002498{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002499 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002500 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002501 if (PyUnicode_CheckExact(obj)) {
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002502 if (PyUnicode_READY(obj))
2503 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002504 Py_INCREF(obj);
2505 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002506 }
2507 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002508 /* For a Unicode subtype that's not a Unicode object,
2509 return a true Unicode object with the same data. */
Victor Stinner2219e0a2011-10-01 01:16:59 +02002510 return PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002511 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002512 PyErr_Format(PyExc_TypeError,
2513 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002514 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002515 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002516}
2517
Alexander Belopolsky40018472011-02-26 01:02:56 +00002518PyObject *
2519PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002520 const char *encoding,
2521 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002522{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002523 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002524 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002525
Guido van Rossumd57fd912000-03-10 22:53:23 +00002526 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002527 PyErr_BadInternalCall();
2528 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002529 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002530
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002531 /* Decoding bytes objects is the most common case and should be fast */
2532 if (PyBytes_Check(obj)) {
2533 if (PyBytes_GET_SIZE(obj) == 0) {
2534 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002535 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002536 }
2537 else {
2538 v = PyUnicode_Decode(
2539 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2540 encoding, errors);
2541 }
2542 return v;
2543 }
2544
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002545 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002546 PyErr_SetString(PyExc_TypeError,
2547 "decoding str is not supported");
2548 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002549 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002550
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002551 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2552 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2553 PyErr_Format(PyExc_TypeError,
2554 "coercing to str: need bytes, bytearray "
2555 "or buffer-like object, %.80s found",
2556 Py_TYPE(obj)->tp_name);
2557 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002558 }
Tim Petersced69f82003-09-16 20:30:58 +00002559
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002560 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002561 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002562 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002563 }
Tim Petersced69f82003-09-16 20:30:58 +00002564 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002565 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002566
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002567 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002568 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002569}
2570
/* Write a normalized copy of `encoding` into `lower`: upper case letters
   become lower case and '_' becomes '-', so that e.g. UTF_8 is recognized.

   lower_len is the capacity of `lower`, terminating null included.
   Returns 1 on success (`lower` is null-terminated), or 0 when `encoding`
   does not fit in lower_len-1 characters; in that case `lower` holds a
   partial copy without a terminator. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src;
    char *dst = lower;
    /* Last slot of the output; it is reserved for the terminator. */
    char *const limit = lower + (lower_len - 1);

    for (src = encoding; *src != '\0'; src++, dst++) {
        char c = *src;

        if (dst == limit)
            return 0;

        if (Py_ISUPPER(c))
            c = Py_TOLOWER(c);
        else if (c == '_')
            c = '-';
        *dst = c;
    }
    *dst = '\0';
    return 1;
}
2603
Alexander Belopolsky40018472011-02-26 01:02:56 +00002604PyObject *
2605PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002606 Py_ssize_t size,
2607 const char *encoding,
2608 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002609{
2610 PyObject *buffer = NULL, *unicode;
2611 Py_buffer info;
2612 char lower[11]; /* Enough for any encoding shortcut */
2613
2614 if (encoding == NULL)
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002615 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00002616
2617 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002618 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002619 if ((strcmp(lower, "utf-8") == 0) ||
2620 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002621 return PyUnicode_DecodeUTF8(s, size, errors);
2622 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002623 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002624 (strcmp(lower, "iso-8859-1") == 0))
2625 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002626#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002627 else if (strcmp(lower, "mbcs") == 0)
2628 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002629#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002630 else if (strcmp(lower, "ascii") == 0)
2631 return PyUnicode_DecodeASCII(s, size, errors);
2632 else if (strcmp(lower, "utf-16") == 0)
2633 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2634 else if (strcmp(lower, "utf-32") == 0)
2635 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2636 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002637
2638 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002639 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002640 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002641 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002642 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002643 if (buffer == NULL)
2644 goto onError;
2645 unicode = PyCodec_Decode(buffer, encoding, errors);
2646 if (unicode == NULL)
2647 goto onError;
2648 if (!PyUnicode_Check(unicode)) {
2649 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002650 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002651 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002652 Py_DECREF(unicode);
2653 goto onError;
2654 }
2655 Py_DECREF(buffer);
Victor Stinner17efeed2011-10-04 20:05:46 +02002656#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02002657 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002658 Py_DECREF(unicode);
2659 return NULL;
2660 }
Victor Stinner17efeed2011-10-04 20:05:46 +02002661#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002662 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002663
Benjamin Peterson29060642009-01-31 22:14:21 +00002664 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002665 Py_XDECREF(buffer);
2666 return NULL;
2667}
2668
Alexander Belopolsky40018472011-02-26 01:02:56 +00002669PyObject *
2670PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002671 const char *encoding,
2672 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002673{
2674 PyObject *v;
2675
2676 if (!PyUnicode_Check(unicode)) {
2677 PyErr_BadArgument();
2678 goto onError;
2679 }
2680
2681 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002682 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002683
2684 /* Decode via the codec registry */
2685 v = PyCodec_Decode(unicode, encoding, errors);
2686 if (v == NULL)
2687 goto onError;
2688 return v;
2689
Benjamin Peterson29060642009-01-31 22:14:21 +00002690 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002691 return NULL;
2692}
2693
Alexander Belopolsky40018472011-02-26 01:02:56 +00002694PyObject *
2695PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002696 const char *encoding,
2697 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002698{
2699 PyObject *v;
2700
2701 if (!PyUnicode_Check(unicode)) {
2702 PyErr_BadArgument();
2703 goto onError;
2704 }
2705
2706 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002707 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002708
2709 /* Decode via the codec registry */
2710 v = PyCodec_Decode(unicode, encoding, errors);
2711 if (v == NULL)
2712 goto onError;
2713 if (!PyUnicode_Check(v)) {
2714 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002715 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002716 Py_TYPE(v)->tp_name);
2717 Py_DECREF(v);
2718 goto onError;
2719 }
2720 return v;
2721
Benjamin Peterson29060642009-01-31 22:14:21 +00002722 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002723 return NULL;
2724}
2725
Alexander Belopolsky40018472011-02-26 01:02:56 +00002726PyObject *
2727PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002728 Py_ssize_t size,
2729 const char *encoding,
2730 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002731{
2732 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002733
Guido van Rossumd57fd912000-03-10 22:53:23 +00002734 unicode = PyUnicode_FromUnicode(s, size);
2735 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002736 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002737 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
2738 Py_DECREF(unicode);
2739 return v;
2740}
2741
Alexander Belopolsky40018472011-02-26 01:02:56 +00002742PyObject *
2743PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002744 const char *encoding,
2745 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002746{
2747 PyObject *v;
2748
2749 if (!PyUnicode_Check(unicode)) {
2750 PyErr_BadArgument();
2751 goto onError;
2752 }
2753
2754 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002755 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002756
2757 /* Encode via the codec registry */
2758 v = PyCodec_Encode(unicode, encoding, errors);
2759 if (v == NULL)
2760 goto onError;
2761 return v;
2762
Benjamin Peterson29060642009-01-31 22:14:21 +00002763 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002764 return NULL;
2765}
2766
Victor Stinnerad158722010-10-27 00:25:46 +00002767PyObject *
2768PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00002769{
Victor Stinner99b95382011-07-04 14:23:54 +02002770#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002771 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2772 PyUnicode_GET_SIZE(unicode),
2773 NULL);
2774#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002775 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00002776#else
Victor Stinner793b5312011-04-27 00:24:21 +02002777 PyInterpreterState *interp = PyThreadState_GET()->interp;
2778 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2779 cannot use it to encode and decode filenames before it is loaded. Load
2780 the Python codec requires to encode at least its own filename. Use the C
2781 version of the locale codec until the codec registry is initialized and
2782 the Python codec is loaded.
2783
2784 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2785 cannot only rely on it: check also interp->fscodec_initialized for
2786 subinterpreters. */
2787 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00002788 return PyUnicode_AsEncodedString(unicode,
2789 Py_FileSystemDefaultEncoding,
2790 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00002791 }
2792 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002793 /* locale encoding with surrogateescape */
2794 wchar_t *wchar;
2795 char *bytes;
2796 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00002797 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002798
2799 wchar = PyUnicode_AsWideCharString(unicode, NULL);
2800 if (wchar == NULL)
2801 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002802 bytes = _Py_wchar2char(wchar, &error_pos);
2803 if (bytes == NULL) {
2804 if (error_pos != (size_t)-1) {
2805 char *errmsg = strerror(errno);
2806 PyObject *exc = NULL;
2807 if (errmsg == NULL)
2808 errmsg = "Py_wchar2char() failed";
2809 raise_encode_exception(&exc,
2810 "filesystemencoding",
2811 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
2812 error_pos, error_pos+1,
2813 errmsg);
2814 Py_XDECREF(exc);
2815 }
2816 else
2817 PyErr_NoMemory();
2818 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002819 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002820 }
2821 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002822
2823 bytes_obj = PyBytes_FromString(bytes);
2824 PyMem_Free(bytes);
2825 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00002826 }
Victor Stinnerad158722010-10-27 00:25:46 +00002827#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00002828}
2829
Alexander Belopolsky40018472011-02-26 01:02:56 +00002830PyObject *
2831PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002832 const char *encoding,
2833 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002834{
2835 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00002836 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00002837
Guido van Rossumd57fd912000-03-10 22:53:23 +00002838 if (!PyUnicode_Check(unicode)) {
2839 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002840 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002841 }
Fred Drakee4315f52000-05-09 19:53:39 +00002842
Victor Stinner2f283c22011-03-02 01:21:46 +00002843 if (encoding == NULL) {
2844 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002845 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002846 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002847 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner2f283c22011-03-02 01:21:46 +00002848 }
Fred Drakee4315f52000-05-09 19:53:39 +00002849
2850 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002851 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002852 if ((strcmp(lower, "utf-8") == 0) ||
2853 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00002854 {
Victor Stinner2f283c22011-03-02 01:21:46 +00002855 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002856 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002857 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002858 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00002859 }
Victor Stinner37296e82010-06-10 13:36:23 +00002860 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002861 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002862 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002863 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002864#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002865 else if (strcmp(lower, "mbcs") == 0)
2866 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2867 PyUnicode_GET_SIZE(unicode),
2868 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002869#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002870 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002871 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00002872 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002873
2874 /* Encode via the codec registry */
2875 v = PyCodec_Encode(unicode, encoding, errors);
2876 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002877 return NULL;
2878
2879 /* The normal path */
2880 if (PyBytes_Check(v))
2881 return v;
2882
2883 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002884 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002885 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002886 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002887
2888 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
2889 "encoder %s returned bytearray instead of bytes",
2890 encoding);
2891 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002892 Py_DECREF(v);
2893 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002894 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002895
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002896 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
2897 Py_DECREF(v);
2898 return b;
2899 }
2900
2901 PyErr_Format(PyExc_TypeError,
2902 "encoder did not return a bytes object (type=%.400s)",
2903 Py_TYPE(v)->tp_name);
2904 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002905 return NULL;
2906}
2907
Alexander Belopolsky40018472011-02-26 01:02:56 +00002908PyObject *
2909PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002910 const char *encoding,
2911 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002912{
2913 PyObject *v;
2914
2915 if (!PyUnicode_Check(unicode)) {
2916 PyErr_BadArgument();
2917 goto onError;
2918 }
2919
2920 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002921 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002922
2923 /* Encode via the codec registry */
2924 v = PyCodec_Encode(unicode, encoding, errors);
2925 if (v == NULL)
2926 goto onError;
2927 if (!PyUnicode_Check(v)) {
2928 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002929 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002930 Py_TYPE(v)->tp_name);
2931 Py_DECREF(v);
2932 goto onError;
2933 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002934 return v;
Tim Petersced69f82003-09-16 20:30:58 +00002935
Benjamin Peterson29060642009-01-31 22:14:21 +00002936 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002937 return NULL;
2938}
2939
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002940PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00002941PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002942 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00002943 return PyUnicode_DecodeFSDefaultAndSize(s, size);
2944}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002945
Christian Heimes5894ba72007-11-04 11:43:14 +00002946PyObject*
2947PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
2948{
Victor Stinner99b95382011-07-04 14:23:54 +02002949#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002950 return PyUnicode_DecodeMBCS(s, size, NULL);
2951#elif defined(__APPLE__)
2952 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
2953#else
Victor Stinner793b5312011-04-27 00:24:21 +02002954 PyInterpreterState *interp = PyThreadState_GET()->interp;
2955 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2956 cannot use it to encode and decode filenames before it is loaded. Load
2957 the Python codec requires to encode at least its own filename. Use the C
2958 version of the locale codec until the codec registry is initialized and
2959 the Python codec is loaded.
2960
2961 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2962 cannot only rely on it: check also interp->fscodec_initialized for
2963 subinterpreters. */
2964 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002965 return PyUnicode_Decode(s, size,
2966 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00002967 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002968 }
2969 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002970 /* locale encoding with surrogateescape */
2971 wchar_t *wchar;
2972 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00002973 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002974
2975 if (s[size] != '\0' || size != strlen(s)) {
2976 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
2977 return NULL;
2978 }
2979
Victor Stinner168e1172010-10-16 23:16:16 +00002980 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002981 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00002982 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002983
Victor Stinner168e1172010-10-16 23:16:16 +00002984 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002985 PyMem_Free(wchar);
2986 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002987 }
Victor Stinnerad158722010-10-27 00:25:46 +00002988#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002989}
2990
Martin v. Löwis011e8422009-05-05 04:43:17 +00002991
2992int
2993PyUnicode_FSConverter(PyObject* arg, void* addr)
2994{
2995 PyObject *output = NULL;
2996 Py_ssize_t size;
2997 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00002998 if (arg == NULL) {
2999 Py_DECREF(*(PyObject**)addr);
3000 return 1;
3001 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003002 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003003 output = arg;
3004 Py_INCREF(output);
3005 }
3006 else {
3007 arg = PyUnicode_FromObject(arg);
3008 if (!arg)
3009 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003010 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003011 Py_DECREF(arg);
3012 if (!output)
3013 return 0;
3014 if (!PyBytes_Check(output)) {
3015 Py_DECREF(output);
3016 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3017 return 0;
3018 }
3019 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003020 size = PyBytes_GET_SIZE(output);
3021 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003022 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003023 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003024 Py_DECREF(output);
3025 return 0;
3026 }
3027 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003028 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003029}
3030
3031
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003032int
3033PyUnicode_FSDecoder(PyObject* arg, void* addr)
3034{
3035 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003036 if (arg == NULL) {
3037 Py_DECREF(*(PyObject**)addr);
3038 return 1;
3039 }
3040 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003041 if (PyUnicode_READY(arg))
3042 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003043 output = arg;
3044 Py_INCREF(output);
3045 }
3046 else {
3047 arg = PyBytes_FromObject(arg);
3048 if (!arg)
3049 return 0;
3050 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3051 PyBytes_GET_SIZE(arg));
3052 Py_DECREF(arg);
3053 if (!output)
3054 return 0;
3055 if (!PyUnicode_Check(output)) {
3056 Py_DECREF(output);
3057 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3058 return 0;
3059 }
3060 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003061 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
3062 PyUnicode_GET_LENGTH(output), 0, 1)) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003063 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3064 Py_DECREF(output);
3065 return 0;
3066 }
3067 *(PyObject**)addr = output;
3068 return Py_CLEANUP_SUPPORTED;
3069}
3070
3071
Martin v. Löwis5b222132007-06-10 09:51:05 +00003072char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003073PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003074{
Christian Heimesf3863112007-11-22 07:46:41 +00003075 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003076 PyUnicodeObject *u = (PyUnicodeObject *)unicode;
3077
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003078 if (!PyUnicode_Check(unicode)) {
3079 PyErr_BadArgument();
3080 return NULL;
3081 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003082 if (PyUnicode_READY(u) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003083 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003084
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003085 if (PyUnicode_UTF8(unicode) == NULL) {
3086 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003087 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3088 if (bytes == NULL)
3089 return NULL;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003090 _PyUnicode_UTF8(u) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3091 if (_PyUnicode_UTF8(u) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003092 Py_DECREF(bytes);
3093 return NULL;
3094 }
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003095 _PyUnicode_UTF8_LENGTH(u) = PyBytes_GET_SIZE(bytes);
3096 Py_MEMCPY(_PyUnicode_UTF8(u), PyBytes_AS_STRING(bytes), _PyUnicode_UTF8_LENGTH(u) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003097 Py_DECREF(bytes);
3098 }
3099
3100 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003101 *psize = PyUnicode_UTF8_LENGTH(unicode);
3102 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003103}
3104
3105char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003106PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003107{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003108 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3109}
3110
3111#ifdef Py_DEBUG
3112int unicode_as_unicode_calls = 0;
3113#endif
3114
3115
3116Py_UNICODE *
3117PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3118{
3119 PyUnicodeObject *u;
3120 const unsigned char *one_byte;
3121#if SIZEOF_WCHAR_T == 4
3122 const Py_UCS2 *two_bytes;
3123#else
3124 const Py_UCS4 *four_bytes;
3125 const Py_UCS4 *ucs4_end;
3126 Py_ssize_t num_surrogates;
3127#endif
3128 wchar_t *w;
3129 wchar_t *wchar_end;
3130
3131 if (!PyUnicode_Check(unicode)) {
3132 PyErr_BadArgument();
3133 return NULL;
3134 }
3135 u = (PyUnicodeObject*)unicode;
3136 if (_PyUnicode_WSTR(u) == NULL) {
3137 /* Non-ASCII compact unicode object */
3138 assert(_PyUnicode_KIND(u) != 0);
3139 assert(PyUnicode_IS_READY(u));
3140
3141#ifdef Py_DEBUG
3142 ++unicode_as_unicode_calls;
3143#endif
3144
3145 if (PyUnicode_KIND(u) == PyUnicode_4BYTE_KIND) {
3146#if SIZEOF_WCHAR_T == 2
3147 four_bytes = PyUnicode_4BYTE_DATA(u);
3148 ucs4_end = four_bytes + _PyUnicode_LENGTH(u);
3149 num_surrogates = 0;
3150
3151 for (; four_bytes < ucs4_end; ++four_bytes) {
3152 if (*four_bytes > 0xFFFF)
3153 ++num_surrogates;
3154 }
3155
3156 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(
3157 sizeof(wchar_t) * (_PyUnicode_LENGTH(u) + 1 + num_surrogates));
3158 if (!_PyUnicode_WSTR(u)) {
3159 PyErr_NoMemory();
3160 return NULL;
3161 }
3162 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u) + num_surrogates;
3163
3164 w = _PyUnicode_WSTR(u);
3165 wchar_end = w + _PyUnicode_WSTR_LENGTH(u);
3166 four_bytes = PyUnicode_4BYTE_DATA(u);
3167 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3168 if (*four_bytes > 0xFFFF) {
3169 /* encode surrogate pair in this case */
3170 *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10);
3171 *w = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF);
3172 }
3173 else
3174 *w = *four_bytes;
3175
3176 if (w > wchar_end) {
3177 assert(0 && "Miscalculated string end");
3178 }
3179 }
3180 *w = 0;
3181#else
3182 /* sizeof(wchar_t) == 4 */
3183 Py_FatalError("Impossible unicode object state, wstr and str "
3184 "should share memory already.");
3185 return NULL;
3186#endif
3187 }
3188 else {
3189 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3190 (_PyUnicode_LENGTH(u) + 1));
3191 if (!_PyUnicode_WSTR(u)) {
3192 PyErr_NoMemory();
3193 return NULL;
3194 }
3195 if (!PyUnicode_IS_COMPACT_ASCII(u))
3196 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u);
3197 w = _PyUnicode_WSTR(u);
3198 wchar_end = w + _PyUnicode_LENGTH(u);
3199
3200 if (PyUnicode_KIND(u) == PyUnicode_1BYTE_KIND) {
3201 one_byte = PyUnicode_1BYTE_DATA(u);
3202 for (; w < wchar_end; ++one_byte, ++w)
3203 *w = *one_byte;
3204 /* null-terminate the wstr */
3205 *w = 0;
3206 }
3207 else if (PyUnicode_KIND(u) == PyUnicode_2BYTE_KIND) {
3208#if SIZEOF_WCHAR_T == 4
3209 two_bytes = PyUnicode_2BYTE_DATA(u);
3210 for (; w < wchar_end; ++two_bytes, ++w)
3211 *w = *two_bytes;
3212 /* null-terminate the wstr */
3213 *w = 0;
3214#else
3215 /* sizeof(wchar_t) == 2 */
3216 PyObject_FREE(_PyUnicode_WSTR(u));
3217 _PyUnicode_WSTR(u) = NULL;
3218 Py_FatalError("Impossible unicode object state, wstr "
3219 "and str should share memory already.");
3220 return NULL;
3221#endif
3222 }
3223 else {
3224 assert(0 && "This should never happen.");
3225 }
3226 }
3227 }
3228 if (size != NULL)
3229 *size = PyUnicode_WSTR_LENGTH(u);
3230 return _PyUnicode_WSTR(u);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003231}
3232
Alexander Belopolsky40018472011-02-26 01:02:56 +00003233Py_UNICODE *
3234PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003235{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003236 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003237}
3238
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003239
Alexander Belopolsky40018472011-02-26 01:02:56 +00003240Py_ssize_t
3241PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003242{
3243 if (!PyUnicode_Check(unicode)) {
3244 PyErr_BadArgument();
3245 goto onError;
3246 }
3247 return PyUnicode_GET_SIZE(unicode);
3248
Benjamin Peterson29060642009-01-31 22:14:21 +00003249 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003250 return -1;
3251}
3252
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003253Py_ssize_t
3254PyUnicode_GetLength(PyObject *unicode)
3255{
Victor Stinner5a706cf2011-10-02 00:36:53 +02003256 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003257 PyErr_BadArgument();
3258 return -1;
3259 }
3260
3261 return PyUnicode_GET_LENGTH(unicode);
3262}
3263
3264Py_UCS4
3265PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3266{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003267 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3268 PyErr_BadArgument();
3269 return (Py_UCS4)-1;
3270 }
3271 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3272 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003273 return (Py_UCS4)-1;
3274 }
3275 return PyUnicode_READ_CHAR(unicode, index);
3276}
3277
3278int
3279PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3280{
3281 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003282 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003283 return -1;
3284 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02003285 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3286 PyErr_SetString(PyExc_IndexError, "string index out of range");
3287 return -1;
3288 }
3289 if (_PyUnicode_Dirty(unicode))
3290 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003291 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3292 index, ch);
3293 return 0;
3294}
3295
/* Return the name of the default encoding; it is the constant "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3301
Victor Stinner554f3f02010-06-16 23:33:54 +00003302/* create or adjust a UnicodeDecodeError */
3303static void
3304make_decode_exception(PyObject **exceptionObject,
3305 const char *encoding,
3306 const char *input, Py_ssize_t length,
3307 Py_ssize_t startpos, Py_ssize_t endpos,
3308 const char *reason)
3309{
3310 if (*exceptionObject == NULL) {
3311 *exceptionObject = PyUnicodeDecodeError_Create(
3312 encoding, input, length, startpos, endpos, reason);
3313 }
3314 else {
3315 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3316 goto onError;
3317 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3318 goto onError;
3319 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3320 goto onError;
3321 }
3322 return;
3323
3324onError:
3325 Py_DECREF(*exceptionObject);
3326 *exceptionObject = NULL;
3327}
3328
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003329/* error handling callback helper:
3330 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00003331 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003332 and adjust various state variables.
3333 return 0 on success, -1 on error
3334*/
3335
Alexander Belopolsky40018472011-02-26 01:02:56 +00003336static int
3337unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003338 const char *encoding, const char *reason,
3339 const char **input, const char **inend, Py_ssize_t *startinpos,
3340 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
3341 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003342{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003343 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003344
3345 PyObject *restuple = NULL;
3346 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003347 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003348 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003349 Py_ssize_t requiredsize;
3350 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003351 const Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003352 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003353 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003354 int res = -1;
3355
3356 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003357 *errorHandler = PyCodec_LookupError(errors);
3358 if (*errorHandler == NULL)
3359 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003360 }
3361
Victor Stinner554f3f02010-06-16 23:33:54 +00003362 make_decode_exception(exceptionObject,
3363 encoding,
3364 *input, *inend - *input,
3365 *startinpos, *endinpos,
3366 reason);
3367 if (*exceptionObject == NULL)
3368 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003369
3370 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3371 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003372 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003373 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00003374 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00003375 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003376 }
3377 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003378 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003379
3380 /* Copy back the bytes variables, which might have been modified by the
3381 callback */
3382 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3383 if (!inputobj)
3384 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00003385 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003386 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00003387 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003388 *input = PyBytes_AS_STRING(inputobj);
3389 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003390 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00003391 /* we can DECREF safely, as the exception has another reference,
3392 so the object won't go away. */
3393 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003394
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003395 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003396 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003397 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003398 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3399 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003400 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003401
3402 /* need more space? (at least enough for what we
3403 have+the replacement+the rest of the string (starting
3404 at the new input position), so we won't have to check space
3405 when there are no errors in the rest of the string) */
3406 repptr = PyUnicode_AS_UNICODE(repunicode);
3407 repsize = PyUnicode_GET_SIZE(repunicode);
3408 requiredsize = *outpos + repsize + insize-newpos;
3409 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003410 if (requiredsize<2*outsize)
3411 requiredsize = 2*outsize;
Victor Stinnerfe226c02011-10-03 03:52:20 +02003412 if (PyUnicode_Resize((PyObject**)output, requiredsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003413 goto onError;
3414 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003415 }
3416 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003417 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003418 Py_UNICODE_COPY(*outptr, repptr, repsize);
3419 *outptr += repsize;
3420 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003421
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003422 /* we made it! */
3423 res = 0;
3424
Benjamin Peterson29060642009-01-31 22:14:21 +00003425 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003426 Py_XDECREF(restuple);
3427 return res;
3428}
3429
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three simple macros defining base-64. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')

/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, hence const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * The flag arguments are parenthesized so that an expression such as
 * `a || b` passed as a flag is evaluated as a unit before being
 * combined with the category test. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003510
Alexander Belopolsky40018472011-02-26 01:02:56 +00003511PyObject *
3512PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003513 Py_ssize_t size,
3514 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003515{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003516 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3517}
3518
Antoine Pitrou244651a2009-05-04 18:56:13 +00003519/* The decoder. The only state we preserve is our read position,
3520 * i.e. how many characters we have consumed. So if we end in the
3521 * middle of a shift sequence we have to back off the read position
3522 * and the output to the beginning of the sequence, otherwise we lose
3523 * all the shift state (seen bits, number of bits seen, high
3524 * surrogate). */
3525
Alexander Belopolsky40018472011-02-26 01:02:56 +00003526PyObject *
3527PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003528 Py_ssize_t size,
3529 const char *errors,
3530 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003531{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003532 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003533 Py_ssize_t startinpos;
3534 Py_ssize_t endinpos;
3535 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003536 const char *e;
3537 PyUnicodeObject *unicode;
3538 Py_UNICODE *p;
3539 const char *errmsg = "";
3540 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003541 Py_UNICODE *shiftOutStart;
3542 unsigned int base64bits = 0;
3543 unsigned long base64buffer = 0;
3544 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003545 PyObject *errorHandler = NULL;
3546 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003547
3548 unicode = _PyUnicode_New(size);
3549 if (!unicode)
3550 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003551 if (size == 0) {
3552 if (consumed)
3553 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003554 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003555 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003556
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003557 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitrou244651a2009-05-04 18:56:13 +00003558 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003559 e = s + size;
3560
3561 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003562 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00003563 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00003564 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003565
Antoine Pitrou244651a2009-05-04 18:56:13 +00003566 if (inShift) { /* in a base-64 section */
3567 if (IS_BASE64(ch)) { /* consume a base-64 character */
3568 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3569 base64bits += 6;
3570 s++;
3571 if (base64bits >= 16) {
3572 /* we have enough bits for a UTF-16 value */
3573 Py_UNICODE outCh = (Py_UNICODE)
3574 (base64buffer >> (base64bits-16));
3575 base64bits -= 16;
3576 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3577 if (surrogate) {
3578 /* expecting a second surrogate */
3579 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3580#ifdef Py_UNICODE_WIDE
3581 *p++ = (((surrogate & 0x3FF)<<10)
3582 | (outCh & 0x3FF)) + 0x10000;
3583#else
3584 *p++ = surrogate;
3585 *p++ = outCh;
3586#endif
3587 surrogate = 0;
3588 }
3589 else {
3590 surrogate = 0;
3591 errmsg = "second surrogate missing";
3592 goto utf7Error;
3593 }
3594 }
3595 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
3596 /* first surrogate */
3597 surrogate = outCh;
3598 }
3599 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3600 errmsg = "unexpected second surrogate";
3601 goto utf7Error;
3602 }
3603 else {
3604 *p++ = outCh;
3605 }
3606 }
3607 }
3608 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003609 inShift = 0;
3610 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003611 if (surrogate) {
3612 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00003613 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003614 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003615 if (base64bits > 0) { /* left-over bits */
3616 if (base64bits >= 6) {
3617 /* We've seen at least one base-64 character */
3618 errmsg = "partial character in shift sequence";
3619 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003620 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003621 else {
3622 /* Some bits remain; they should be zero */
3623 if (base64buffer != 0) {
3624 errmsg = "non-zero padding bits in shift sequence";
3625 goto utf7Error;
3626 }
3627 }
3628 }
3629 if (ch != '-') {
3630 /* '-' is absorbed; other terminating
3631 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003632 *p++ = ch;
3633 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003634 }
3635 }
3636 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003637 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003638 s++; /* consume '+' */
3639 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003640 s++;
3641 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00003642 }
3643 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003644 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003645 shiftOutStart = p;
3646 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003647 }
3648 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003649 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003650 *p++ = ch;
3651 s++;
3652 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003653 else {
3654 startinpos = s-starts;
3655 s++;
3656 errmsg = "unexpected special character";
3657 goto utf7Error;
3658 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003659 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003660utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003661 outpos = p-PyUnicode_AS_UNICODE(unicode);
3662 endinpos = s-starts;
3663 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003664 errors, &errorHandler,
3665 "utf7", errmsg,
3666 &starts, &e, &startinpos, &endinpos, &exc, &s,
3667 &unicode, &outpos, &p))
3668 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003669 }
3670
Antoine Pitrou244651a2009-05-04 18:56:13 +00003671 /* end of string */
3672
3673 if (inShift && !consumed) { /* in shift sequence, no more to follow */
3674 /* if we're in an inconsistent state, that's an error */
3675 if (surrogate ||
3676 (base64bits >= 6) ||
3677 (base64bits > 0 && base64buffer != 0)) {
3678 outpos = p-PyUnicode_AS_UNICODE(unicode);
3679 endinpos = size;
3680 if (unicode_decode_call_errorhandler(
3681 errors, &errorHandler,
3682 "utf7", "unterminated shift sequence",
3683 &starts, &e, &startinpos, &endinpos, &exc, &s,
3684 &unicode, &outpos, &p))
3685 goto onError;
3686 if (s < e)
3687 goto restart;
3688 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003689 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003690
3691 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003692 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003693 if (inShift) {
3694 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003695 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003696 }
3697 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003698 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003699 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003700 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003701
Victor Stinnerfe226c02011-10-03 03:52:20 +02003702 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003703 goto onError;
3704
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003705 Py_XDECREF(errorHandler);
3706 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02003707#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02003708 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003709 Py_DECREF(unicode);
3710 return NULL;
3711 }
Victor Stinner17efeed2011-10-04 20:05:46 +02003712#endif
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003713 return (PyObject *)unicode;
3714
Benjamin Peterson29060642009-01-31 22:14:21 +00003715 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003716 Py_XDECREF(errorHandler);
3717 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003718 Py_DECREF(unicode);
3719 return NULL;
3720}
3721
3722
Alexander Belopolsky40018472011-02-26 01:02:56 +00003723PyObject *
3724PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003725 Py_ssize_t size,
3726 int base64SetO,
3727 int base64WhiteSpace,
3728 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003729{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003730 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003731 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003732 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003733 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003734 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003735 unsigned int base64bits = 0;
3736 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003737 char * out;
3738 char * start;
3739
3740 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003741 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003742
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003743 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003744 return PyErr_NoMemory();
3745
Antoine Pitrou244651a2009-05-04 18:56:13 +00003746 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003747 if (v == NULL)
3748 return NULL;
3749
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003750 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003751 for (;i < size; ++i) {
3752 Py_UNICODE ch = s[i];
3753
Antoine Pitrou244651a2009-05-04 18:56:13 +00003754 if (inShift) {
3755 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3756 /* shifting out */
3757 if (base64bits) { /* output remaining bits */
3758 *out++ = TO_BASE64(base64buffer << (6-base64bits));
3759 base64buffer = 0;
3760 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003761 }
3762 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003763 /* Characters not in the BASE64 set implicitly unshift the sequence
3764 so no '-' is required, except if the character is itself a '-' */
3765 if (IS_BASE64(ch) || ch == '-') {
3766 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003767 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003768 *out++ = (char) ch;
3769 }
3770 else {
3771 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00003772 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003773 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003774 else { /* not in a shift sequence */
3775 if (ch == '+') {
3776 *out++ = '+';
3777 *out++ = '-';
3778 }
3779 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3780 *out++ = (char) ch;
3781 }
3782 else {
3783 *out++ = '+';
3784 inShift = 1;
3785 goto encode_char;
3786 }
3787 }
3788 continue;
3789encode_char:
3790#ifdef Py_UNICODE_WIDE
3791 if (ch >= 0x10000) {
3792 /* code first surrogate */
3793 base64bits += 16;
3794 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
3795 while (base64bits >= 6) {
3796 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3797 base64bits -= 6;
3798 }
3799 /* prepare second surrogate */
3800 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
3801 }
3802#endif
3803 base64bits += 16;
3804 base64buffer = (base64buffer << 16) | ch;
3805 while (base64bits >= 6) {
3806 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3807 base64bits -= 6;
3808 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00003809 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003810 if (base64bits)
3811 *out++= TO_BASE64(base64buffer << (6-base64bits) );
3812 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003813 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003814 if (_PyBytes_Resize(&v, out - start) < 0)
3815 return NULL;
3816 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003817}
3818
Antoine Pitrou244651a2009-05-04 18:56:13 +00003819#undef IS_BASE64
3820#undef FROM_BASE64
3821#undef TO_BASE64
3822#undef DECODE_DIRECT
3823#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003824
Guido van Rossumd57fd912000-03-10 22:53:23 +00003825/* --- UTF-8 Codec -------------------------------------------------------- */
3826
/* Map a UTF-8 lead (prefix) byte to the total length in bytes of the
   sequence it introduces.  Zero means the byte is not a legal prefix:
   continuation bytes (80-BF), the overlong prefixes C0-C1 and the
   out-of-range prefixes F5-FF.  See RFC 3629 for details.
   The table is only ever read, hence const. */
static const
char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10-1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 20-2F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 30-3F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 40-4F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 50-5F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 60-6F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 90-9F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* A0-AF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
3848
Alexander Belopolsky40018472011-02-26 01:02:56 +00003849PyObject *
3850PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003851 Py_ssize_t size,
3852 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003853{
Walter Dörwald69652032004-09-07 20:24:22 +00003854 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3855}
3856
Antoine Pitrouab868312009-01-10 15:40:25 +00003857/* Mask to check or force alignment of a pointer to C 'long' boundaries */
3858#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
3859
3860/* Mask to quickly check whether a C 'long' contains a
3861 non-ASCII, UTF8-encoded char. */
3862#if (SIZEOF_LONG == 8)
3863# define ASCII_CHAR_MASK 0x8080808080808080L
3864#elif (SIZEOF_LONG == 4)
3865# define ASCII_CHAR_MASK 0x80808080L
3866#else
3867# error C 'long' size should be either 4 or 8!
3868#endif
3869
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003870/* Scans a UTF-8 string and returns the maximum character to be expected,
3871 the size of the decoded unicode string and if any major errors were
3872 encountered.
3873
3874 This function does check basic UTF-8 sanity, it does however NOT CHECK
3875 if the string contains surrogates, and if all continuation bytes are
3876 within the correct ranges, these checks are performed in
3877 PyUnicode_DecodeUTF8Stateful.
3878
3879 If it sets has_errors to 1, it means the value of unicode_size and max_char
3880 will be bogus and you should not rely on useful information in them.
3881 */
3882static Py_UCS4
3883utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
3884 Py_ssize_t *unicode_size, Py_ssize_t* consumed,
3885 int *has_errors)
3886{
3887 Py_ssize_t n;
3888 Py_ssize_t char_count = 0;
3889 Py_UCS4 max_char = 127, new_max;
3890 Py_UCS4 upper_bound;
3891 const unsigned char *p = (const unsigned char *)s;
3892 const unsigned char *end = p + string_size;
3893 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
3894 int err = 0;
3895
3896 for (; p < end && !err; ++p, ++char_count) {
3897 /* Only check value if it's not a ASCII char... */
3898 if (*p < 0x80) {
3899 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
3900 an explanation. */
3901 if (!((size_t) p & LONG_PTR_MASK)) {
3902 /* Help register allocation */
3903 register const unsigned char *_p = p;
3904 while (_p < aligned_end) {
3905 unsigned long value = *(unsigned long *) _p;
3906 if (value & ASCII_CHAR_MASK)
3907 break;
3908 _p += SIZEOF_LONG;
3909 char_count += SIZEOF_LONG;
3910 }
3911 p = _p;
3912 if (p == end)
3913 break;
3914 }
3915 }
3916 if (*p >= 0x80) {
3917 n = utf8_code_length[*p];
3918 new_max = max_char;
3919 switch (n) {
3920 /* invalid start byte */
3921 case 0:
3922 err = 1;
3923 break;
3924 case 2:
3925 /* Code points between 0x00FF and 0x07FF inclusive.
3926 Approximate the upper bound of the code point,
3927 if this flips over 255 we can be sure it will be more
3928 than 255 and the string will need 2 bytes per code coint,
3929 if it stays under or equal to 255, we can be sure 1 byte
3930 is enough.
3931 ((*p & 0b00011111) << 6) | 0b00111111 */
3932 upper_bound = ((*p & 0x1F) << 6) | 0x3F;
3933 if (max_char < upper_bound)
3934 new_max = upper_bound;
3935 /* Ensure we track at least that we left ASCII space. */
3936 if (new_max < 128)
3937 new_max = 128;
3938 break;
3939 case 3:
3940 /* Between 0x0FFF and 0xFFFF inclusive, so values are
3941 always > 255 and <= 65535 and will always need 2 bytes. */
3942 if (max_char < 65535)
3943 new_max = 65535;
3944 break;
3945 case 4:
3946 /* Code point will be above 0xFFFF for sure in this case. */
3947 new_max = 65537;
3948 break;
3949 /* Internal error, this should be caught by the first if */
3950 case 1:
3951 default:
3952 assert(0 && "Impossible case in utf8_max_char_and_size");
3953 err = 1;
3954 }
3955 /* Instead of number of overall bytes for this code point,
3956 n containts the number of following bytes: */
3957 --n;
3958 /* Check if the follow up chars are all valid continuation bytes */
3959 if (n >= 1) {
3960 const unsigned char *cont;
3961 if ((p + n) >= end) {
3962 if (consumed == 0)
3963 /* incomplete data, non-incremental decoding */
3964 err = 1;
3965 break;
3966 }
3967 for (cont = p + 1; cont < (p + n); ++cont) {
3968 if ((*cont & 0xc0) != 0x80) {
3969 err = 1;
3970 break;
3971 }
3972 }
3973 p += n;
3974 }
3975 else
3976 err = 1;
3977 max_char = new_max;
3978 }
3979 }
3980
3981 if (unicode_size)
3982 *unicode_size = char_count;
3983 if (has_errors)
3984 *has_errors = err;
3985 return max_char;
3986}
3987
/* Store value at position index of buf, using the element type that
   corresponds to kind.  Like PyUnicode_WRITE, but additionally understands
   PyUnicode_WCHAR_KIND, i.e. it can also write into the wstr field of the
   legacy unicode representation.  Any kind other than WCHAR, 1BYTE or
   2BYTE is written as Py_UCS4.  Each argument is evaluated exactly once. */
#define WRITE_FLEXIBLE_OR_WSTR(kind, buf, index, value) \
    do { \
        switch ((kind)) { \
        case PyUnicode_WCHAR_KIND: \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \
            break; \
        case PyUnicode_1BYTE_KIND: \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
            break; \
        case PyUnicode_2BYTE_KIND: \
            ((Py_UCS2 *)(buf))[(index)] = (Py_UCS2)(value); \
            break; \
        default: \
            ((Py_UCS4 *)(buf))[(index)] = (Py_UCS4)(value); \
            break; \
        } \
    } while (0)
4002
Alexander Belopolsky40018472011-02-26 01:02:56 +00004003PyObject *
4004PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004005 Py_ssize_t size,
4006 const char *errors,
4007 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00004008{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004009 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004010 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00004011 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004012 Py_ssize_t startinpos;
4013 Py_ssize_t endinpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00004014 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004015 PyUnicodeObject *unicode;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004016 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004017 PyObject *errorHandler = NULL;
4018 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004019 Py_UCS4 maxchar = 0;
4020 Py_ssize_t unicode_size;
4021 Py_ssize_t i;
4022 int kind;
4023 void *data;
4024 int has_errors;
4025 Py_UNICODE *error_outptr;
4026#if SIZEOF_WCHAR_T == 2
4027 Py_ssize_t wchar_offset = 0;
4028#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00004029
Walter Dörwald69652032004-09-07 20:24:22 +00004030 if (size == 0) {
4031 if (consumed)
4032 *consumed = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004033 return (PyObject *)PyUnicode_New(0, 0);
Walter Dörwald69652032004-09-07 20:24:22 +00004034 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004035 maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size,
4036 consumed, &has_errors);
4037 if (has_errors) {
4038 unicode = _PyUnicode_New(size);
4039 if (!unicode)
4040 return NULL;
4041 kind = PyUnicode_WCHAR_KIND;
4042 data = PyUnicode_AS_UNICODE(unicode);
4043 assert(data != NULL);
4044 }
4045 else {
4046 unicode = (PyUnicodeObject *)PyUnicode_New(unicode_size, maxchar);
4047 if (!unicode)
4048 return NULL;
4049 /* When the string is ASCII only, just use memcpy and return.
4050 unicode_size may be != size if there is an incomplete UTF-8
4051 sequence at the end of the ASCII block. */
4052 if (maxchar < 128 && size == unicode_size) {
4053 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
4054 return (PyObject *)unicode;
4055 }
4056 kind = PyUnicode_KIND(unicode);
4057 data = PyUnicode_DATA(unicode);
4058 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004059 /* Unpack UTF-8 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004060 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004061 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00004062 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004063
4064 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004065 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004066
4067 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00004068 /* Fast path for runs of ASCII characters. Given that common UTF-8
4069 input will consist of an overwhelming majority of ASCII
4070 characters, we try to optimize for this case by checking
4071 as many characters as a C 'long' can contain.
4072 First, check if we can do an aligned read, as most CPUs have
4073 a penalty for unaligned reads.
4074 */
4075 if (!((size_t) s & LONG_PTR_MASK)) {
4076 /* Help register allocation */
4077 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004078 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004079 while (_s < aligned_end) {
4080 /* Read a whole long at a time (either 4 or 8 bytes),
4081 and do a fast unrolled copy if it only contains ASCII
4082 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004083 unsigned long value = *(unsigned long *) _s;
4084 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00004085 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004086 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+0, _s[0]);
4087 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+1, _s[1]);
4088 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+2, _s[2]);
4089 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004090#if (SIZEOF_LONG == 8)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004091 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+4, _s[4]);
4092 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+5, _s[5]);
4093 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+6, _s[6]);
4094 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004095#endif
4096 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004097 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00004098 }
4099 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004100 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004101 if (s == e)
4102 break;
4103 ch = (unsigned char)*s;
4104 }
4105 }
4106
4107 if (ch < 0x80) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004108 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004109 s++;
4110 continue;
4111 }
4112
4113 n = utf8_code_length[ch];
4114
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004115 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004116 if (consumed)
4117 break;
4118 else {
4119 errmsg = "unexpected end of data";
4120 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004121 endinpos = startinpos+1;
4122 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
4123 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004124 goto utf8Error;
4125 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004126 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004127
4128 switch (n) {
4129
4130 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00004131 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004132 startinpos = s-starts;
4133 endinpos = startinpos+1;
4134 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004135
4136 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004137 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00004138 startinpos = s-starts;
4139 endinpos = startinpos+1;
4140 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004141
4142 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004143 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00004144 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004145 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004146 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00004147 goto utf8Error;
4148 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004149 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004150 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004151 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004152 break;
4153
4154 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00004155 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4156 will result in surrogates in range d800-dfff. Surrogates are
4157 not valid UTF-8 so they are rejected.
4158 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4159 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00004160 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004161 (s[2] & 0xc0) != 0x80 ||
4162 ((unsigned char)s[0] == 0xE0 &&
4163 (unsigned char)s[1] < 0xA0) ||
4164 ((unsigned char)s[0] == 0xED &&
4165 (unsigned char)s[1] > 0x9F)) {
4166 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004167 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004168 endinpos = startinpos + 1;
4169
4170 /* if s[1] first two bits are 1 and 0, then the invalid
4171 continuation byte is s[2], so increment endinpos by 1,
4172 if not, s[1] is invalid and endinpos doesn't need to
4173 be incremented. */
4174 if ((s[1] & 0xC0) == 0x80)
4175 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004176 goto utf8Error;
4177 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004178 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004179 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004180 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004181 break;
4182
4183 case 4:
4184 if ((s[1] & 0xc0) != 0x80 ||
4185 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004186 (s[3] & 0xc0) != 0x80 ||
4187 ((unsigned char)s[0] == 0xF0 &&
4188 (unsigned char)s[1] < 0x90) ||
4189 ((unsigned char)s[0] == 0xF4 &&
4190 (unsigned char)s[1] > 0x8F)) {
4191 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004192 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004193 endinpos = startinpos + 1;
4194 if ((s[1] & 0xC0) == 0x80) {
4195 endinpos++;
4196 if ((s[2] & 0xC0) == 0x80)
4197 endinpos++;
4198 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004199 goto utf8Error;
4200 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004201 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00004202 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4203 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4204
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004205 /* If the string is flexible or we have native UCS-4, write
4206 directly.. */
4207 if (sizeof(Py_UNICODE) > 2 || kind != PyUnicode_WCHAR_KIND)
4208 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Tim Petersced69f82003-09-16 20:30:58 +00004209
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004210 else {
4211 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00004212
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004213 /* translate from 10000..10FFFF to 0..FFFF */
4214 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00004215
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004216 /* high surrogate = top 10 bits added to D800 */
4217 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4218 (Py_UNICODE)(0xD800 + (ch >> 10)));
4219
4220 /* low surrogate = bottom 10 bits added to DC00 */
4221 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4222 (Py_UNICODE)(0xDC00 + (ch & 0x03FF)));
4223 }
4224#if SIZEOF_WCHAR_T == 2
4225 wchar_offset++;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004226#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00004227 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004228 }
4229 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00004230 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004231
Benjamin Peterson29060642009-01-31 22:14:21 +00004232 utf8Error:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004233 /* If this is not yet a resizable string, make it one.. */
4234 if (kind != PyUnicode_WCHAR_KIND) {
4235 const Py_UNICODE *u;
4236 PyUnicodeObject *new_unicode = _PyUnicode_New(size);
4237 if (!new_unicode)
4238 goto onError;
4239 u = PyUnicode_AsUnicode((PyObject *)unicode);
4240 if (!u)
4241 goto onError;
4242#if SIZEOF_WCHAR_T == 2
4243 i += wchar_offset;
4244#endif
4245 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(new_unicode), u, i);
4246 Py_DECREF(unicode);
4247 unicode = new_unicode;
4248 kind = 0;
4249 data = PyUnicode_AS_UNICODE(new_unicode);
4250 assert(data != NULL);
4251 }
4252 error_outptr = PyUnicode_AS_UNICODE(unicode) + i;
Benjamin Peterson29060642009-01-31 22:14:21 +00004253 if (unicode_decode_call_errorhandler(
4254 errors, &errorHandler,
4255 "utf8", errmsg,
4256 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004257 &unicode, &i, &error_outptr))
Benjamin Peterson29060642009-01-31 22:14:21 +00004258 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004259 /* Update data because unicode_decode_call_errorhandler might have
4260 re-created or resized the unicode object. */
4261 data = PyUnicode_AS_UNICODE(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00004262 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004263 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004264 /* Ensure the unicode_size calculation above was correct: */
4265 assert(kind == PyUnicode_WCHAR_KIND || i == unicode_size);
4266
Walter Dörwald69652032004-09-07 20:24:22 +00004267 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004268 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004269
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004270 /* Adjust length and ready string when it contained errors and
4271 is of the old resizable kind. */
4272 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004273 if (PyUnicode_Resize((PyObject**)&unicode, i) < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004274 goto onError;
4275 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004276
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004277 Py_XDECREF(errorHandler);
4278 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02004279#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004280 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004281 Py_DECREF(unicode);
4282 return NULL;
4283 }
Victor Stinner17efeed2011-10-04 20:05:46 +02004284#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00004285 return (PyObject *)unicode;
4286
Benjamin Peterson29060642009-01-31 22:14:21 +00004287 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004288 Py_XDECREF(errorHandler);
4289 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004290 Py_DECREF(unicode);
4291 return NULL;
4292}
4293
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004294#undef WRITE_FLEXIBLE_OR_WSTR
Antoine Pitrouab868312009-01-10 15:40:25 +00004295
#ifdef __APPLE__

/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   Decodes size bytes starting at s into a freshly PyMem_Malloc()ed,
   NUL-terminated wchar_t buffer.  A byte that does not start a valid
   sequence is emitted as the single code unit 0xDC00 + byte and decoding
   resumes at the following byte.  Returns NULL if the buffer size would
   overflow (with MemoryError set) or if the allocation fails. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    const char *end;
    wchar_t *result, *out;
    int seqlen;

    /* Note: size will always be longer than the resulting Unicode
       character count */
    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
        PyErr_NoMemory();
        return NULL;
    }
    result = PyMem_Malloc((size + 1) * sizeof(wchar_t));
    if (!result)
        return NULL;

    /* Unpack UTF-8 encoded data */
    out = result;
    for (end = s + size; s < end; ) {
        /* unsigned view of the bytes of the current sequence */
        const unsigned char *b = (const unsigned char *)s;
        Py_UCS4 ch = b[0];

        if (ch < 0x80) {
            /* plain ASCII */
            *out++ = (wchar_t)ch;
            s++;
            continue;
        }

        seqlen = utf8_code_length[ch];
        /* truncated sequence */
        if (s + seqlen > end)
            goto escape_byte;
        /* not a legal lead byte */
        if (seqlen == 0 || seqlen == 1)
            goto escape_byte;

        if (seqlen == 2) {
            if ((b[1] & 0xc0) != 0x80)
                goto escape_byte;
            ch = ((b[0] & 0x1f) << 6) + (b[1] & 0x3f);
            assert ((ch > 0x007F) && (ch <= 0x07FF));
            *out++ = (wchar_t)ch;
        }
        else if (seqlen == 3) {
            /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
               will result in surrogates in range d800-dfff. Surrogates are
               not valid UTF-8 so they are rejected.
               See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
            if ((b[1] & 0xc0) != 0x80 || (b[2] & 0xc0) != 0x80)
                goto escape_byte;
            if (b[0] == 0xE0 && b[1] < 0xA0)
                goto escape_byte;
            if (b[0] == 0xED && b[1] > 0x9F)
                goto escape_byte;
            ch = ((b[0] & 0x0f) << 12) + ((b[1] & 0x3f) << 6) + (b[2] & 0x3f);
            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
            *out++ = (wchar_t)ch;
        }
        else if (seqlen == 4) {
            if ((b[1] & 0xc0) != 0x80 ||
                (b[2] & 0xc0) != 0x80 ||
                (b[3] & 0xc0) != 0x80)
                goto escape_byte;
            if (b[0] == 0xF0 && b[1] < 0x90)
                goto escape_byte;
            if (b[0] == 0xF4 && b[1] > 0x8F)
                goto escape_byte;
            ch = ((b[0] & 0x7) << 18) + ((b[1] & 0x3f) << 12) +
                 ((b[2] & 0x3f) << 6) + (b[3] & 0x3f);
            assert ((ch > 0xFFFF) && (ch <= 0x10ffff));

#if SIZEOF_WCHAR_T == 4
            *out++ = (wchar_t)ch;
#else
            /* compute and append the two surrogates: */

            /* translate from 10000..10FFFF to 0..FFFF */
            ch -= 0x10000;

            /* high surrogate = top 10 bits added to D800 */
            *out++ = (wchar_t)(0xD800 + (ch >> 10));

            /* low surrogate = bottom 10 bits added to DC00 */
            *out++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
#endif
        }
        s += seqlen;
        continue;

      escape_byte:
        /* ch still holds the undecodable byte at this point */
        *out++ = 0xDC00 + ch;
        s++;
    }
    *out = L'\0';
    return result;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004410
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004411/* Primary internal function which creates utf8 encoded bytes objects.
4412
4413 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004414 and allocate exactly as much space needed at the end. Else allocate the
4415 maximum possible needed (4 result bytes per Unicode character), and return
4416 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004417*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004418PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004419_PyUnicode_AsUTF8String(PyObject *obj, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004420{
Tim Peters602f7402002-04-27 18:03:26 +00004421#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00004422
Guido van Rossum98297ee2007-11-06 21:34:58 +00004423 Py_ssize_t i; /* index into s of next input byte */
4424 PyObject *result; /* result string object */
4425 char *p; /* next free byte in output buffer */
4426 Py_ssize_t nallocated; /* number of result bytes allocated */
4427 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00004428 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004429 PyObject *errorHandler = NULL;
4430 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004431 int kind;
4432 void *data;
4433 Py_ssize_t size;
4434 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
4435#if SIZEOF_WCHAR_T == 2
4436 Py_ssize_t wchar_offset = 0;
4437#endif
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004438
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004439 if (!PyUnicode_Check(unicode)) {
4440 PyErr_BadArgument();
4441 return NULL;
4442 }
4443
4444 if (PyUnicode_READY(unicode) == -1)
4445 return NULL;
4446
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004447 if (PyUnicode_UTF8(unicode))
4448 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4449 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004450
4451 kind = PyUnicode_KIND(unicode);
4452 data = PyUnicode_DATA(unicode);
4453 size = PyUnicode_GET_LENGTH(unicode);
4454
Tim Peters602f7402002-04-27 18:03:26 +00004455 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004456
Tim Peters602f7402002-04-27 18:03:26 +00004457 if (size <= MAX_SHORT_UNICHARS) {
4458 /* Write into the stack buffer; nallocated can't overflow.
4459 * At the end, we'll allocate exactly as much heap space as it
4460 * turns out we need.
4461 */
4462 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004463 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00004464 p = stackbuf;
4465 }
4466 else {
4467 /* Overallocate on the heap, and give the excess back at the end. */
4468 nallocated = size * 4;
4469 if (nallocated / 4 != size) /* overflow! */
4470 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00004471 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004472 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00004473 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00004474 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004475 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004476
Tim Peters602f7402002-04-27 18:03:26 +00004477 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004478 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004479
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004480 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00004481 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004482 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004483
Guido van Rossumd57fd912000-03-10 22:53:23 +00004484 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00004485 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00004486 *p++ = (char)(0xc0 | (ch >> 6));
4487 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00004488 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004489 Py_ssize_t newpos;
4490 PyObject *rep;
4491 Py_ssize_t repsize, k, startpos;
4492 startpos = i-1;
4493#if SIZEOF_WCHAR_T == 2
4494 startpos += wchar_offset;
Victor Stinner445a6232010-04-22 20:01:57 +00004495#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004496 rep = unicode_encode_call_errorhandler(
4497 errors, &errorHandler, "utf-8", "surrogates not allowed",
4498 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
4499 &exc, startpos, startpos+1, &newpos);
4500 if (!rep)
4501 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004502
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004503 if (PyBytes_Check(rep))
4504 repsize = PyBytes_GET_SIZE(rep);
4505 else
4506 repsize = PyUnicode_GET_SIZE(rep);
4507
4508 if (repsize > 4) {
4509 Py_ssize_t offset;
4510
4511 if (result == NULL)
4512 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004513 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004514 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004515
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004516 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4517 /* integer overflow */
4518 PyErr_NoMemory();
4519 goto error;
4520 }
4521 nallocated += repsize - 4;
4522 if (result != NULL) {
4523 if (_PyBytes_Resize(&result, nallocated) < 0)
4524 goto error;
4525 } else {
4526 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004527 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004528 goto error;
4529 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4530 }
4531 p = PyBytes_AS_STRING(result) + offset;
4532 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004533
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004534 if (PyBytes_Check(rep)) {
4535 char *prep = PyBytes_AS_STRING(rep);
4536 for(k = repsize; k > 0; k--)
4537 *p++ = *prep++;
4538 } else /* rep is unicode */ {
4539 const Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
4540 Py_UNICODE c;
4541
4542 for(k=0; k<repsize; k++) {
4543 c = prep[k];
4544 if (0x80 <= c) {
4545 raise_encode_exception(&exc, "utf-8",
4546 PyUnicode_AS_UNICODE(unicode),
4547 size, i-1, i,
4548 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004549 goto error;
4550 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004551 *p++ = (char)prep[k];
Victor Stinner31be90b2010-04-22 19:38:16 +00004552 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004553 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004554 Py_DECREF(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004555 } else if (ch < 0x10000) {
4556 *p++ = (char)(0xe0 | (ch >> 12));
4557 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4558 *p++ = (char)(0x80 | (ch & 0x3f));
4559 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00004560 /* Encode UCS4 Unicode ordinals */
4561 *p++ = (char)(0xf0 | (ch >> 18));
4562 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4563 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4564 *p++ = (char)(0x80 | (ch & 0x3f));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004565#if SIZEOF_WCHAR_T == 2
4566 wchar_offset++;
4567#endif
Tim Peters602f7402002-04-27 18:03:26 +00004568 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004569 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004570
Guido van Rossum98297ee2007-11-06 21:34:58 +00004571 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004572 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004573 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004574 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004575 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004576 }
4577 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004578 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004579 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004580 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004581 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004582 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004583
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004584 Py_XDECREF(errorHandler);
4585 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004586 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004587 error:
4588 Py_XDECREF(errorHandler);
4589 Py_XDECREF(exc);
4590 Py_XDECREF(result);
4591 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004592
Tim Peters602f7402002-04-27 18:03:26 +00004593#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004594}
4595
Alexander Belopolsky40018472011-02-26 01:02:56 +00004596PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004597PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4598 Py_ssize_t size,
4599 const char *errors)
4600{
4601 PyObject *v, *unicode;
4602
4603 unicode = PyUnicode_FromUnicode(s, size);
4604 if (unicode == NULL)
4605 return NULL;
4606 v = _PyUnicode_AsUTF8String(unicode, errors);
4607 Py_DECREF(unicode);
4608 return v;
4609}
4610
4611PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004612PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004613{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004614 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004615}
4616
Walter Dörwald41980ca2007-08-16 21:55:45 +00004617/* --- UTF-32 Codec ------------------------------------------------------- */
4618
4619PyObject *
4620PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004621 Py_ssize_t size,
4622 const char *errors,
4623 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004624{
4625 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4626}
4627
4628PyObject *
4629PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004630 Py_ssize_t size,
4631 const char *errors,
4632 int *byteorder,
4633 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004634{
4635 const char *starts = s;
4636 Py_ssize_t startinpos;
4637 Py_ssize_t endinpos;
4638 Py_ssize_t outpos;
4639 PyUnicodeObject *unicode;
4640 Py_UNICODE *p;
4641#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004642 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004643 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004644#else
4645 const int pairs = 0;
4646#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00004647 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004648 int bo = 0; /* assume native ordering by default */
4649 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004650 /* Offsets from q for retrieving bytes in the right order. */
4651#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4652 int iorder[] = {0, 1, 2, 3};
4653#else
4654 int iorder[] = {3, 2, 1, 0};
4655#endif
4656 PyObject *errorHandler = NULL;
4657 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004658
Walter Dörwald41980ca2007-08-16 21:55:45 +00004659 q = (unsigned char *)s;
4660 e = q + size;
4661
4662 if (byteorder)
4663 bo = *byteorder;
4664
4665 /* Check for BOM marks (U+FEFF) in the input and adjust current
4666 byte order setting accordingly. In native mode, the leading BOM
4667 mark is skipped, in all other modes, it is copied to the output
4668 stream as-is (giving a ZWNBSP character). */
4669 if (bo == 0) {
4670 if (size >= 4) {
4671 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004672 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004673#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004674 if (bom == 0x0000FEFF) {
4675 q += 4;
4676 bo = -1;
4677 }
4678 else if (bom == 0xFFFE0000) {
4679 q += 4;
4680 bo = 1;
4681 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004682#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004683 if (bom == 0x0000FEFF) {
4684 q += 4;
4685 bo = 1;
4686 }
4687 else if (bom == 0xFFFE0000) {
4688 q += 4;
4689 bo = -1;
4690 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004691#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004692 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004693 }
4694
4695 if (bo == -1) {
4696 /* force LE */
4697 iorder[0] = 0;
4698 iorder[1] = 1;
4699 iorder[2] = 2;
4700 iorder[3] = 3;
4701 }
4702 else if (bo == 1) {
4703 /* force BE */
4704 iorder[0] = 3;
4705 iorder[1] = 2;
4706 iorder[2] = 1;
4707 iorder[3] = 0;
4708 }
4709
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004710 /* On narrow builds we split characters outside the BMP into two
4711 codepoints => count how much extra space we need. */
4712#ifndef Py_UNICODE_WIDE
4713 for (qq = q; qq < e; qq += 4)
4714 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
4715 pairs++;
4716#endif
4717
4718 /* This might be one to much, because of a BOM */
4719 unicode = _PyUnicode_New((size+3)/4+pairs);
4720 if (!unicode)
4721 return NULL;
4722 if (size == 0)
4723 return (PyObject *)unicode;
4724
4725 /* Unpack UTF-32 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004726 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004727
Walter Dörwald41980ca2007-08-16 21:55:45 +00004728 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004729 Py_UCS4 ch;
4730 /* remaining bytes at the end? (size should be divisible by 4) */
4731 if (e-q<4) {
4732 if (consumed)
4733 break;
4734 errmsg = "truncated data";
4735 startinpos = ((const char *)q)-starts;
4736 endinpos = ((const char *)e)-starts;
4737 goto utf32Error;
4738 /* The remaining input chars are ignored if the callback
4739 chooses to skip the input */
4740 }
4741 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4742 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004743
Benjamin Peterson29060642009-01-31 22:14:21 +00004744 if (ch >= 0x110000)
4745 {
4746 errmsg = "codepoint not in range(0x110000)";
4747 startinpos = ((const char *)q)-starts;
4748 endinpos = startinpos+4;
4749 goto utf32Error;
4750 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004751#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004752 if (ch >= 0x10000)
4753 {
4754 *p++ = 0xD800 | ((ch-0x10000) >> 10);
4755 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
4756 }
4757 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00004758#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004759 *p++ = ch;
4760 q += 4;
4761 continue;
4762 utf32Error:
4763 outpos = p-PyUnicode_AS_UNICODE(unicode);
4764 if (unicode_decode_call_errorhandler(
4765 errors, &errorHandler,
4766 "utf32", errmsg,
4767 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
4768 &unicode, &outpos, &p))
4769 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004770 }
4771
4772 if (byteorder)
4773 *byteorder = bo;
4774
4775 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004776 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004777
4778 /* Adjust length */
Victor Stinnerfe226c02011-10-03 03:52:20 +02004779 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004780 goto onError;
4781
4782 Py_XDECREF(errorHandler);
4783 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02004784#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004785 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004786 Py_DECREF(unicode);
4787 return NULL;
4788 }
Victor Stinner17efeed2011-10-04 20:05:46 +02004789#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00004790 return (PyObject *)unicode;
4791
Benjamin Peterson29060642009-01-31 22:14:21 +00004792 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00004793 Py_DECREF(unicode);
4794 Py_XDECREF(errorHandler);
4795 Py_XDECREF(exc);
4796 return NULL;
4797}
4798
4799PyObject *
4800PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004801 Py_ssize_t size,
4802 const char *errors,
4803 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004804{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004805 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004806 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004807 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004808#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004809 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004810#else
4811 const int pairs = 0;
4812#endif
4813 /* Offsets from p for storing byte pairs in the right order. */
4814#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4815 int iorder[] = {0, 1, 2, 3};
4816#else
4817 int iorder[] = {3, 2, 1, 0};
4818#endif
4819
Benjamin Peterson29060642009-01-31 22:14:21 +00004820#define STORECHAR(CH) \
4821 do { \
4822 p[iorder[3]] = ((CH) >> 24) & 0xff; \
4823 p[iorder[2]] = ((CH) >> 16) & 0xff; \
4824 p[iorder[1]] = ((CH) >> 8) & 0xff; \
4825 p[iorder[0]] = (CH) & 0xff; \
4826 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00004827 } while(0)
4828
4829 /* In narrow builds we can output surrogate pairs as one codepoint,
4830 so we need less space. */
4831#ifndef Py_UNICODE_WIDE
4832 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00004833 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
4834 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
4835 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004836#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004837 nsize = (size - pairs + (byteorder == 0));
4838 bytesize = nsize * 4;
4839 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004840 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004841 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004842 if (v == NULL)
4843 return NULL;
4844
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004845 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004846 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004847 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004848 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004849 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004850
4851 if (byteorder == -1) {
4852 /* force LE */
4853 iorder[0] = 0;
4854 iorder[1] = 1;
4855 iorder[2] = 2;
4856 iorder[3] = 3;
4857 }
4858 else if (byteorder == 1) {
4859 /* force BE */
4860 iorder[0] = 3;
4861 iorder[1] = 2;
4862 iorder[2] = 1;
4863 iorder[3] = 0;
4864 }
4865
4866 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004867 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004868#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004869 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
4870 Py_UCS4 ch2 = *s;
4871 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
4872 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
4873 s++;
4874 size--;
4875 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004876 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004877#endif
4878 STORECHAR(ch);
4879 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004880
4881 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004882 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004883#undef STORECHAR
4884}
4885
Alexander Belopolsky40018472011-02-26 01:02:56 +00004886PyObject *
4887PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004888{
4889 if (!PyUnicode_Check(unicode)) {
4890 PyErr_BadArgument();
4891 return NULL;
4892 }
4893 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004894 PyUnicode_GET_SIZE(unicode),
4895 NULL,
4896 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004897}
4898
Guido van Rossumd57fd912000-03-10 22:53:23 +00004899/* --- UTF-16 Codec ------------------------------------------------------- */
4900
Tim Peters772747b2001-08-09 22:21:55 +00004901PyObject *
4902PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004903 Py_ssize_t size,
4904 const char *errors,
4905 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004906{
Walter Dörwald69652032004-09-07 20:24:22 +00004907 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
4908}
4909
Antoine Pitrouab868312009-01-10 15:40:25 +00004910/* Two masks for fast checking of whether a C 'long' may contain
4911 UTF16-encoded surrogate characters. This is an efficient heuristic,
4912 assuming that non-surrogate characters with a code point >= 0x8000 are
4913 rare in most input.
4914 FAST_CHAR_MASK is used when the input is in native byte ordering,
4915 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00004916*/
Antoine Pitrouab868312009-01-10 15:40:25 +00004917#if (SIZEOF_LONG == 8)
4918# define FAST_CHAR_MASK 0x8000800080008000L
4919# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
4920#elif (SIZEOF_LONG == 4)
4921# define FAST_CHAR_MASK 0x80008000L
4922# define SWAPPED_FAST_CHAR_MASK 0x00800080L
4923#else
4924# error C 'long' size should be either 4 or 8!
4925#endif
4926
Walter Dörwald69652032004-09-07 20:24:22 +00004927PyObject *
4928PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004929 Py_ssize_t size,
4930 const char *errors,
4931 int *byteorder,
4932 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00004933{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004934 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004935 Py_ssize_t startinpos;
4936 Py_ssize_t endinpos;
4937 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004938 PyUnicodeObject *unicode;
4939 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004940 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00004941 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00004942 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004943 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00004944 /* Offsets from q for retrieving byte pairs in the right order. */
4945#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4946 int ihi = 1, ilo = 0;
4947#else
4948 int ihi = 0, ilo = 1;
4949#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004950 PyObject *errorHandler = NULL;
4951 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004952
4953 /* Note: size will always be longer than the resulting Unicode
4954 character count */
4955 unicode = _PyUnicode_New(size);
4956 if (!unicode)
4957 return NULL;
4958 if (size == 0)
4959 return (PyObject *)unicode;
4960
4961 /* Unpack UTF-16 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004962 p = PyUnicode_AS_UNICODE(unicode);
Tim Peters772747b2001-08-09 22:21:55 +00004963 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00004964 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004965
4966 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00004967 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004968
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004969 /* Check for BOM marks (U+FEFF) in the input and adjust current
4970 byte order setting accordingly. In native mode, the leading BOM
4971 mark is skipped, in all other modes, it is copied to the output
4972 stream as-is (giving a ZWNBSP character). */
4973 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00004974 if (size >= 2) {
4975 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004976#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004977 if (bom == 0xFEFF) {
4978 q += 2;
4979 bo = -1;
4980 }
4981 else if (bom == 0xFFFE) {
4982 q += 2;
4983 bo = 1;
4984 }
Tim Petersced69f82003-09-16 20:30:58 +00004985#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004986 if (bom == 0xFEFF) {
4987 q += 2;
4988 bo = 1;
4989 }
4990 else if (bom == 0xFFFE) {
4991 q += 2;
4992 bo = -1;
4993 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004994#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004995 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004996 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004997
Tim Peters772747b2001-08-09 22:21:55 +00004998 if (bo == -1) {
4999 /* force LE */
5000 ihi = 1;
5001 ilo = 0;
5002 }
5003 else if (bo == 1) {
5004 /* force BE */
5005 ihi = 0;
5006 ilo = 1;
5007 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005008#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5009 native_ordering = ilo < ihi;
5010#else
5011 native_ordering = ilo > ihi;
5012#endif
Tim Peters772747b2001-08-09 22:21:55 +00005013
Antoine Pitrouab868312009-01-10 15:40:25 +00005014 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00005015 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005016 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00005017 /* First check for possible aligned read of a C 'long'. Unaligned
5018 reads are more expensive, better to defer to another iteration. */
5019 if (!((size_t) q & LONG_PTR_MASK)) {
5020 /* Fast path for runs of non-surrogate chars. */
5021 register const unsigned char *_q = q;
5022 Py_UNICODE *_p = p;
5023 if (native_ordering) {
5024 /* Native ordering is simple: as long as the input cannot
5025 possibly contain a surrogate char, do an unrolled copy
5026 of several 16-bit code points to the target object.
5027 The non-surrogate check is done on several input bytes
5028 at a time (as many as a C 'long' can contain). */
5029 while (_q < aligned_end) {
5030 unsigned long data = * (unsigned long *) _q;
5031 if (data & FAST_CHAR_MASK)
5032 break;
5033 _p[0] = ((unsigned short *) _q)[0];
5034 _p[1] = ((unsigned short *) _q)[1];
5035#if (SIZEOF_LONG == 8)
5036 _p[2] = ((unsigned short *) _q)[2];
5037 _p[3] = ((unsigned short *) _q)[3];
5038#endif
5039 _q += SIZEOF_LONG;
5040 _p += SIZEOF_LONG / 2;
5041 }
5042 }
5043 else {
5044 /* Byteswapped ordering is similar, but we must decompose
5045 the copy bytewise, and take care of zero'ing out the
5046 upper bytes if the target object is in 32-bit units
5047 (that is, in UCS-4 builds). */
5048 while (_q < aligned_end) {
5049 unsigned long data = * (unsigned long *) _q;
5050 if (data & SWAPPED_FAST_CHAR_MASK)
5051 break;
5052 /* Zero upper bytes in UCS-4 builds */
5053#if (Py_UNICODE_SIZE > 2)
5054 _p[0] = 0;
5055 _p[1] = 0;
5056#if (SIZEOF_LONG == 8)
5057 _p[2] = 0;
5058 _p[3] = 0;
5059#endif
5060#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00005061 /* Issue #4916; UCS-4 builds on big endian machines must
5062 fill the two last bytes of each 4-byte unit. */
5063#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
5064# define OFF 2
5065#else
5066# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00005067#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00005068 ((unsigned char *) _p)[OFF + 1] = _q[0];
5069 ((unsigned char *) _p)[OFF + 0] = _q[1];
5070 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
5071 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
5072#if (SIZEOF_LONG == 8)
5073 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
5074 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
5075 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
5076 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
5077#endif
5078#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00005079 _q += SIZEOF_LONG;
5080 _p += SIZEOF_LONG / 2;
5081 }
5082 }
5083 p = _p;
5084 q = _q;
5085 if (q >= e)
5086 break;
5087 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005088 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005089
Benjamin Peterson14339b62009-01-31 16:36:08 +00005090 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00005091
5092 if (ch < 0xD800 || ch > 0xDFFF) {
5093 *p++ = ch;
5094 continue;
5095 }
5096
5097 /* UTF-16 code pair: */
5098 if (q > e) {
5099 errmsg = "unexpected end of data";
5100 startinpos = (((const char *)q) - 2) - starts;
5101 endinpos = ((const char *)e) + 1 - starts;
5102 goto utf16Error;
5103 }
5104 if (0xD800 <= ch && ch <= 0xDBFF) {
5105 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
5106 q += 2;
5107 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00005108#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005109 *p++ = ch;
5110 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005111#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005112 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005113#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005114 continue;
5115 }
5116 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005117 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00005118 startinpos = (((const char *)q)-4)-starts;
5119 endinpos = startinpos+2;
5120 goto utf16Error;
5121 }
5122
Benjamin Peterson14339b62009-01-31 16:36:08 +00005123 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005124 errmsg = "illegal encoding";
5125 startinpos = (((const char *)q)-2)-starts;
5126 endinpos = startinpos+2;
5127 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005128
Benjamin Peterson29060642009-01-31 22:14:21 +00005129 utf16Error:
5130 outpos = p - PyUnicode_AS_UNICODE(unicode);
5131 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005132 errors,
5133 &errorHandler,
5134 "utf16", errmsg,
5135 &starts,
5136 (const char **)&e,
5137 &startinpos,
5138 &endinpos,
5139 &exc,
5140 (const char **)&q,
5141 &unicode,
5142 &outpos,
5143 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00005144 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005145 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005146 /* remaining byte at the end? (size should be even) */
5147 if (e == q) {
5148 if (!consumed) {
5149 errmsg = "truncated data";
5150 startinpos = ((const char *)q) - starts;
5151 endinpos = ((const char *)e) + 1 - starts;
5152 outpos = p - PyUnicode_AS_UNICODE(unicode);
5153 if (unicode_decode_call_errorhandler(
5154 errors,
5155 &errorHandler,
5156 "utf16", errmsg,
5157 &starts,
5158 (const char **)&e,
5159 &startinpos,
5160 &endinpos,
5161 &exc,
5162 (const char **)&q,
5163 &unicode,
5164 &outpos,
5165 &p))
5166 goto onError;
5167 /* The remaining input chars are ignored if the callback
5168 chooses to skip the input */
5169 }
5170 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005171
5172 if (byteorder)
5173 *byteorder = bo;
5174
Walter Dörwald69652032004-09-07 20:24:22 +00005175 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005176 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005177
Guido van Rossumd57fd912000-03-10 22:53:23 +00005178 /* Adjust length */
Victor Stinnerfe226c02011-10-03 03:52:20 +02005179 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005180 goto onError;
5181
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005182 Py_XDECREF(errorHandler);
5183 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02005184#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005185 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005186 Py_DECREF(unicode);
5187 return NULL;
5188 }
Victor Stinner17efeed2011-10-04 20:05:46 +02005189#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00005190 return (PyObject *)unicode;
5191
Benjamin Peterson29060642009-01-31 22:14:21 +00005192 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005193 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005194 Py_XDECREF(errorHandler);
5195 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005196 return NULL;
5197}
5198
Antoine Pitrouab868312009-01-10 15:40:25 +00005199#undef FAST_CHAR_MASK
5200#undef SWAPPED_FAST_CHAR_MASK
5201
Tim Peters772747b2001-08-09 22:21:55 +00005202PyObject *
5203PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005204 Py_ssize_t size,
5205 const char *errors,
5206 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005207{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005208 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00005209 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005210 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005211#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005212 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005213#else
5214 const int pairs = 0;
5215#endif
Tim Peters772747b2001-08-09 22:21:55 +00005216 /* Offsets from p for storing byte pairs in the right order. */
5217#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5218 int ihi = 1, ilo = 0;
5219#else
5220 int ihi = 0, ilo = 1;
5221#endif
5222
Benjamin Peterson29060642009-01-31 22:14:21 +00005223#define STORECHAR(CH) \
5224 do { \
5225 p[ihi] = ((CH) >> 8) & 0xff; \
5226 p[ilo] = (CH) & 0xff; \
5227 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00005228 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005229
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005230#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005231 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00005232 if (s[i] >= 0x10000)
5233 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005234#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005235 /* 2 * (size + pairs + (byteorder == 0)) */
5236 if (size > PY_SSIZE_T_MAX ||
5237 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005238 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005239 nsize = size + pairs + (byteorder == 0);
5240 bytesize = nsize * 2;
5241 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005242 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005243 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005244 if (v == NULL)
5245 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005246
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005247 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005248 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005249 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005250 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005251 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005252
5253 if (byteorder == -1) {
5254 /* force LE */
5255 ihi = 1;
5256 ilo = 0;
5257 }
5258 else if (byteorder == 1) {
5259 /* force BE */
5260 ihi = 0;
5261 ilo = 1;
5262 }
5263
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005264 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005265 Py_UNICODE ch = *s++;
5266 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005267#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005268 if (ch >= 0x10000) {
5269 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
5270 ch = 0xD800 | ((ch-0x10000) >> 10);
5271 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005272#endif
Tim Peters772747b2001-08-09 22:21:55 +00005273 STORECHAR(ch);
5274 if (ch2)
5275 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005276 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005277
5278 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005279 return v;
Tim Peters772747b2001-08-09 22:21:55 +00005280#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005281}
5282
Alexander Belopolsky40018472011-02-26 01:02:56 +00005283PyObject *
5284PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005285{
5286 if (!PyUnicode_Check(unicode)) {
5287 PyErr_BadArgument();
5288 return NULL;
5289 }
5290 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005291 PyUnicode_GET_SIZE(unicode),
5292 NULL,
5293 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005294}
5295
5296/* --- Unicode Escape Codec ----------------------------------------------- */
5297
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005298/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
5299 if all the escapes in the string make it still a valid ASCII string.
5300 Returns -1 if any escapes were found which cause the string to
5301 pop out of ASCII range. Otherwise returns the length of the
5302 required buffer to hold the string.
5303 */
5304Py_ssize_t
5305length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
5306{
5307 const unsigned char *p = (const unsigned char *)s;
5308 const unsigned char *end = p + size;
5309 Py_ssize_t length = 0;
5310
5311 if (size < 0)
5312 return -1;
5313
5314 for (; p < end; ++p) {
5315 if (*p > 127) {
5316 /* Non-ASCII */
5317 return -1;
5318 }
5319 else if (*p != '\\') {
5320 /* Normal character */
5321 ++length;
5322 }
5323 else {
5324 /* Backslash-escape, check next char */
5325 ++p;
5326 /* Escape sequence reaches till end of string or
5327 non-ASCII follow-up. */
5328 if (p >= end || *p > 127)
5329 return -1;
5330 switch (*p) {
5331 case '\n':
5332 /* backslash + \n result in zero characters */
5333 break;
5334 case '\\': case '\'': case '\"':
5335 case 'b': case 'f': case 't':
5336 case 'n': case 'r': case 'v': case 'a':
5337 ++length;
5338 break;
5339 case '0': case '1': case '2': case '3':
5340 case '4': case '5': case '6': case '7':
5341 case 'x': case 'u': case 'U': case 'N':
5342 /* these do not guarantee ASCII characters */
5343 return -1;
5344 default:
5345 /* count the backslash + the other character */
5346 length += 2;
5347 }
5348 }
5349 }
5350 return length;
5351}
5352
/* Store VAL at position POS of BUF.  When KIND is PyUnicode_WCHAR_KIND the
   buffer is treated as an array of Py_UNICODE; for every other kind it is
   treated as an array of single bytes (the ASCII case). */
#define WRITE_ASCII_OR_WSTR(KIND, BUF, POS, VAL)                        \
    do {                                                                \
        if ((KIND) == PyUnicode_WCHAR_KIND)                             \
            ((Py_UNICODE *)(BUF))[(POS)] = (Py_UNICODE)(VAL);           \
        else                                                            \
            ((unsigned char *)(BUF))[(POS)] = (unsigned char)(VAL);     \
    } while (0)

/* Store VAL at position POS of a Py_UNICODE buffer.  Relies on a variable
   named `kind` being in scope at the point of use and asserts that it is
   PyUnicode_WCHAR_KIND. */
#define WRITE_WSTR(BUF, POS, VAL)                                       \
    assert(kind == PyUnicode_WCHAR_KIND),                               \
    ((Py_UNICODE *)(BUF))[(POS)] = (Py_UNICODE)(VAL)
5366
5367
Fredrik Lundh06d12682001-01-24 07:59:11 +00005368static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005369
Alexander Belopolsky40018472011-02-26 01:02:56 +00005370PyObject *
5371PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005372 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005373 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005374{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005375 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005376 Py_ssize_t startinpos;
5377 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005378 int j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005379 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005380 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005381 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005382 char* message;
5383 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005384 PyObject *errorHandler = NULL;
5385 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005386 Py_ssize_t ascii_length;
5387 Py_ssize_t i;
5388 int kind;
5389 void *data;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005390
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005391 ascii_length = length_of_escaped_ascii_string(s, size);
5392
5393 /* After length_of_escaped_ascii_string() there are two alternatives,
5394 either the string is pure ASCII with named escapes like \n, etc.
5395 and we determined it's exact size (common case)
5396 or it contains \x, \u, ... escape sequences. then we create a
5397 legacy wchar string and resize it at the end of this function. */
5398 if (ascii_length >= 0) {
5399 v = (PyUnicodeObject *)PyUnicode_New(ascii_length, 127);
5400 if (!v)
5401 goto onError;
5402 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
5403 kind = PyUnicode_1BYTE_KIND;
5404 data = PyUnicode_DATA(v);
5405 }
5406 else {
5407 /* Escaped strings will always be longer than the resulting
5408 Unicode string, so we start with size here and then reduce the
5409 length after conversion to the true value.
5410 (but if the error callback returns a long replacement string
5411 we'll have to allocate more space) */
5412 v = _PyUnicode_New(size);
5413 if (!v)
5414 goto onError;
5415 kind = PyUnicode_WCHAR_KIND;
5416 data = PyUnicode_AS_UNICODE(v);
5417 }
5418
Guido van Rossumd57fd912000-03-10 22:53:23 +00005419 if (size == 0)
5420 return (PyObject *)v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005421 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005422 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005423
Guido van Rossumd57fd912000-03-10 22:53:23 +00005424 while (s < end) {
5425 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005426 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005427 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005428
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005429 if (kind == PyUnicode_WCHAR_KIND) {
5430 assert(i < _PyUnicode_WSTR_LENGTH(v));
5431 }
5432 else {
5433 /* The only case in which i == ascii_length is a backslash
5434 followed by a newline. */
5435 assert(i <= ascii_length);
5436 }
5437
Guido van Rossumd57fd912000-03-10 22:53:23 +00005438 /* Non-escape characters are interpreted as Unicode ordinals */
5439 if (*s != '\\') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005440 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char) *s++);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005441 continue;
5442 }
5443
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005444 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005445 /* \ - Escapes */
5446 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005447 c = *s++;
5448 if (s > end)
5449 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005450
5451 if (kind == PyUnicode_WCHAR_KIND) {
5452 assert(i < _PyUnicode_WSTR_LENGTH(v));
5453 }
5454 else {
5455 /* The only case in which i == ascii_length is a backslash
5456 followed by a newline. */
5457 assert(i < ascii_length || (i == ascii_length && c == '\n'));
5458 }
5459
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005460 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005461
Benjamin Peterson29060642009-01-31 22:14:21 +00005462 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005463 case '\n': break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005464 case '\\': WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); break;
5465 case '\'': WRITE_ASCII_OR_WSTR(kind, data, i++, '\''); break;
5466 case '\"': WRITE_ASCII_OR_WSTR(kind, data, i++, '\"'); break;
5467 case 'b': WRITE_ASCII_OR_WSTR(kind, data, i++, '\b'); break;
5468 /* FF */
5469 case 'f': WRITE_ASCII_OR_WSTR(kind, data, i++, '\014'); break;
5470 case 't': WRITE_ASCII_OR_WSTR(kind, data, i++, '\t'); break;
5471 case 'n': WRITE_ASCII_OR_WSTR(kind, data, i++, '\n'); break;
5472 case 'r': WRITE_ASCII_OR_WSTR(kind, data, i++, '\r'); break;
5473 /* VT */
5474 case 'v': WRITE_ASCII_OR_WSTR(kind, data, i++, '\013'); break;
5475 /* BEL, not classic C */
5476 case 'a': WRITE_ASCII_OR_WSTR(kind, data, i++, '\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005477
Benjamin Peterson29060642009-01-31 22:14:21 +00005478 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005479 case '0': case '1': case '2': case '3':
5480 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005481 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005482 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005483 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005484 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005485 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005486 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005487 WRITE_WSTR(data, i++, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005488 break;
5489
Benjamin Peterson29060642009-01-31 22:14:21 +00005490 /* hex escapes */
5491 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005492 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005493 digits = 2;
5494 message = "truncated \\xXX escape";
5495 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005496
Benjamin Peterson29060642009-01-31 22:14:21 +00005497 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005498 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005499 digits = 4;
5500 message = "truncated \\uXXXX escape";
5501 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005502
Benjamin Peterson29060642009-01-31 22:14:21 +00005503 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005504 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005505 digits = 8;
5506 message = "truncated \\UXXXXXXXX escape";
5507 hexescape:
5508 chr = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005509 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005510 if (s+digits>end) {
5511 endinpos = size;
5512 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005513 errors, &errorHandler,
5514 "unicodeescape", "end of string in escape sequence",
5515 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005516 &v, &i, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005517 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005518 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005519 goto nextByte;
5520 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005521 for (j = 0; j < digits; ++j) {
5522 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005523 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005524 endinpos = (s+j+1)-starts;
5525 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005526 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005527 errors, &errorHandler,
5528 "unicodeescape", message,
5529 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005530 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005531 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005532 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005533 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005534 }
5535 chr = (chr<<4) & ~0xF;
5536 if (c >= '0' && c <= '9')
5537 chr += c - '0';
5538 else if (c >= 'a' && c <= 'f')
5539 chr += 10 + c - 'a';
5540 else
5541 chr += 10 + c - 'A';
5542 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005543 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005544 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005545 /* _decoding_error will have already written into the
5546 target buffer. */
5547 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005548 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005549 /* when we get here, chr is a 32-bit unicode character */
5550 if (chr <= 0xffff)
5551 /* UCS-2 character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005552 WRITE_WSTR(data, i++, chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005553 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005554 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00005555 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00005556#ifdef Py_UNICODE_WIDE
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005557 WRITE_WSTR(data, i++, chr);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005558#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00005559 chr -= 0x10000L;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005560 WRITE_WSTR(data, i++, 0xD800 + (Py_UNICODE) (chr >> 10));
5561 WRITE_WSTR(data, i++, 0xDC00 + (Py_UNICODE) (chr & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005562#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00005563 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005564 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005565 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005566 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005567 errors, &errorHandler,
5568 "unicodeescape", "illegal Unicode character",
5569 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005570 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005571 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005572 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005573 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005574 break;
5575
Benjamin Peterson29060642009-01-31 22:14:21 +00005576 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005577 case 'N':
5578 message = "malformed \\N character escape";
5579 if (ucnhash_CAPI == NULL) {
5580 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005581 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5582 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005583 if (ucnhash_CAPI == NULL)
5584 goto ucnhashError;
5585 }
5586 if (*s == '{') {
5587 const char *start = s+1;
5588 /* look for the closing brace */
5589 while (*s != '}' && s < end)
5590 s++;
5591 if (s > start && s < end && *s == '}') {
5592 /* found a name. look it up in the unicode database */
5593 message = "unknown Unicode character name";
5594 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005595 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
5596 &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005597 goto store;
5598 }
5599 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005600 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005601 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005602 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005603 errors, &errorHandler,
5604 "unicodeescape", message,
5605 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005606 &v, &i, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005607 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005608 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005609 break;
5610
5611 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005612 if (s > end) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005613 assert(kind == PyUnicode_WCHAR_KIND);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005614 message = "\\ at end of string";
5615 s--;
5616 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005617 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005618 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005619 errors, &errorHandler,
5620 "unicodeescape", message,
5621 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005622 &v, &i, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00005623 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005624 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005625 }
5626 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005627 WRITE_ASCII_OR_WSTR(kind, data, i++, '\\');
5628 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005629 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005630 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005631 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005632 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005633 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005634 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005635 /* Ensure the length prediction worked in case of ASCII strings */
5636 assert(kind == PyUnicode_WCHAR_KIND || i == ascii_length);
5637
Victor Stinnerfe226c02011-10-03 03:52:20 +02005638 if (kind == PyUnicode_WCHAR_KIND)
5639 {
5640 if (PyUnicode_Resize((PyObject**)&v, i) < 0)
5641 goto onError;
Victor Stinnerfe226c02011-10-03 03:52:20 +02005642 }
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005643 Py_XDECREF(errorHandler);
5644 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02005645#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005646 if (_PyUnicode_READY_REPLACE(&v)) {
5647 Py_DECREF(v);
5648 return NULL;
5649 }
Victor Stinner17efeed2011-10-04 20:05:46 +02005650#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00005651 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00005652
Benjamin Peterson29060642009-01-31 22:14:21 +00005653 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005654 PyErr_SetString(
5655 PyExc_UnicodeError,
5656 "\\N escapes not supported (can't load unicodedata module)"
5657 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005658 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005659 Py_XDECREF(errorHandler);
5660 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005661 return NULL;
5662
Benjamin Peterson29060642009-01-31 22:14:21 +00005663 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005664 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005665 Py_XDECREF(errorHandler);
5666 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005667 return NULL;
5668}
5669
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005670#undef WRITE_ASCII_OR_WSTR
5671#undef WRITE_WSTR
5672
Guido van Rossumd57fd912000-03-10 22:53:23 +00005673/* Return a Unicode-Escape string version of the Unicode object.
5674
5675 If quotes is true, the string is enclosed in u"" or u'' quotes as
5676 appropriate.
5677
5678*/
5679
Walter Dörwald79e913e2007-05-12 11:08:06 +00005680static const char *hexdigits = "0123456789abcdef";
5681
Alexander Belopolsky40018472011-02-26 01:02:56 +00005682PyObject *
5683PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005684 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005685{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005686 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005687 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005688
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005689#ifdef Py_UNICODE_WIDE
5690 const Py_ssize_t expandsize = 10;
5691#else
5692 const Py_ssize_t expandsize = 6;
5693#endif
5694
Thomas Wouters89f507f2006-12-13 04:49:30 +00005695 /* XXX(nnorwitz): rather than over-allocating, it would be
5696 better to choose a different scheme. Perhaps scan the
5697 first N-chars of the string and allocate based on that size.
5698 */
5699 /* Initial allocation is based on the longest-possible unichr
5700 escape.
5701
5702 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5703 unichr, so in this case it's the longest unichr escape. In
5704 narrow (UTF-16) builds this is five chars per source unichr
5705 since there are two unichrs in the surrogate pair, so in narrow
5706 (UTF-16) builds it's not the longest unichr escape.
5707
5708 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5709 so in the narrow (UTF-16) build case it's the longest unichr
5710 escape.
5711 */
5712
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005713 if (size == 0)
5714 return PyBytes_FromStringAndSize(NULL, 0);
5715
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005716 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005717 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005718
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005719 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005720 2
5721 + expandsize*size
5722 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005723 if (repr == NULL)
5724 return NULL;
5725
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005726 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005727
Guido van Rossumd57fd912000-03-10 22:53:23 +00005728 while (size-- > 0) {
5729 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005730
Walter Dörwald79e913e2007-05-12 11:08:06 +00005731 /* Escape backslashes */
5732 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005733 *p++ = '\\';
5734 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005735 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005736 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005737
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00005738#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005739 /* Map 21-bit characters to '\U00xxxxxx' */
5740 else if (ch >= 0x10000) {
5741 *p++ = '\\';
5742 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005743 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
5744 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
5745 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
5746 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
5747 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
5748 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
5749 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
5750 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005751 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005752 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005753#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005754 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5755 else if (ch >= 0xD800 && ch < 0xDC00) {
5756 Py_UNICODE ch2;
5757 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00005758
Benjamin Peterson29060642009-01-31 22:14:21 +00005759 ch2 = *s++;
5760 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005761 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005762 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5763 *p++ = '\\';
5764 *p++ = 'U';
5765 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
5766 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
5767 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
5768 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
5769 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
5770 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
5771 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
5772 *p++ = hexdigits[ucs & 0x0000000F];
5773 continue;
5774 }
5775 /* Fall through: isolated surrogates are copied as-is */
5776 s--;
5777 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005778 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005779#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005780
Guido van Rossumd57fd912000-03-10 22:53:23 +00005781 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005782 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005783 *p++ = '\\';
5784 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005785 *p++ = hexdigits[(ch >> 12) & 0x000F];
5786 *p++ = hexdigits[(ch >> 8) & 0x000F];
5787 *p++ = hexdigits[(ch >> 4) & 0x000F];
5788 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005789 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005790
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005791 /* Map special whitespace to '\t', \n', '\r' */
5792 else if (ch == '\t') {
5793 *p++ = '\\';
5794 *p++ = 't';
5795 }
5796 else if (ch == '\n') {
5797 *p++ = '\\';
5798 *p++ = 'n';
5799 }
5800 else if (ch == '\r') {
5801 *p++ = '\\';
5802 *p++ = 'r';
5803 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005804
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005805 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005806 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005807 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005808 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005809 *p++ = hexdigits[(ch >> 4) & 0x000F];
5810 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005811 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005812
Guido van Rossumd57fd912000-03-10 22:53:23 +00005813 /* Copy everything else as-is */
5814 else
5815 *p++ = (char) ch;
5816 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005817
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005818 assert(p - PyBytes_AS_STRING(repr) > 0);
5819 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5820 return NULL;
5821 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005822}
5823
Alexander Belopolsky40018472011-02-26 01:02:56 +00005824PyObject *
5825PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005826{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005827 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005828 if (!PyUnicode_Check(unicode)) {
5829 PyErr_BadArgument();
5830 return NULL;
5831 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00005832 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5833 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005834 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005835}
5836
5837/* --- Raw Unicode Escape Codec ------------------------------------------- */
5838
Alexander Belopolsky40018472011-02-26 01:02:56 +00005839PyObject *
5840PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005841 Py_ssize_t size,
5842 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005843{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005844 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005845 Py_ssize_t startinpos;
5846 Py_ssize_t endinpos;
5847 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005848 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005849 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005850 const char *end;
5851 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005852 PyObject *errorHandler = NULL;
5853 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005854
Guido van Rossumd57fd912000-03-10 22:53:23 +00005855 /* Escaped strings will always be longer than the resulting
5856 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005857 length after conversion to the true value. (But decoding error
5858 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005859 v = _PyUnicode_New(size);
5860 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005861 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005862 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005863 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005864 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005865 end = s + size;
5866 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005867 unsigned char c;
5868 Py_UCS4 x;
5869 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005870 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005871
Benjamin Peterson29060642009-01-31 22:14:21 +00005872 /* Non-escape characters are interpreted as Unicode ordinals */
5873 if (*s != '\\') {
5874 *p++ = (unsigned char)*s++;
5875 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005876 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005877 startinpos = s-starts;
5878
5879 /* \u-escapes are only interpreted iff the number of leading
5880 backslashes if odd */
5881 bs = s;
5882 for (;s < end;) {
5883 if (*s != '\\')
5884 break;
5885 *p++ = (unsigned char)*s++;
5886 }
5887 if (((s - bs) & 1) == 0 ||
5888 s >= end ||
5889 (*s != 'u' && *s != 'U')) {
5890 continue;
5891 }
5892 p--;
5893 count = *s=='u' ? 4 : 8;
5894 s++;
5895
5896 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
5897 outpos = p-PyUnicode_AS_UNICODE(v);
5898 for (x = 0, i = 0; i < count; ++i, ++s) {
5899 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00005900 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005901 endinpos = s-starts;
5902 if (unicode_decode_call_errorhandler(
5903 errors, &errorHandler,
5904 "rawunicodeescape", "truncated \\uXXXX",
5905 &starts, &end, &startinpos, &endinpos, &exc, &s,
5906 &v, &outpos, &p))
5907 goto onError;
5908 goto nextByte;
5909 }
5910 x = (x<<4) & ~0xF;
5911 if (c >= '0' && c <= '9')
5912 x += c - '0';
5913 else if (c >= 'a' && c <= 'f')
5914 x += 10 + c - 'a';
5915 else
5916 x += 10 + c - 'A';
5917 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00005918 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00005919 /* UCS-2 character */
5920 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005921 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005922 /* UCS-4 character. Either store directly, or as
5923 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00005924#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005925 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005926#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005927 x -= 0x10000L;
5928 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
5929 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00005930#endif
5931 } else {
5932 endinpos = s-starts;
5933 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005934 if (unicode_decode_call_errorhandler(
5935 errors, &errorHandler,
5936 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00005937 &starts, &end, &startinpos, &endinpos, &exc, &s,
5938 &v, &outpos, &p))
5939 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005940 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005941 nextByte:
5942 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005943 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02005944 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005945 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005946 Py_XDECREF(errorHandler);
5947 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02005948#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005949 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005950 Py_DECREF(v);
5951 return NULL;
5952 }
Victor Stinner17efeed2011-10-04 20:05:46 +02005953#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00005954 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00005955
Benjamin Peterson29060642009-01-31 22:14:21 +00005956 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005957 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005958 Py_XDECREF(errorHandler);
5959 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005960 return NULL;
5961}
5962
Alexander Belopolsky40018472011-02-26 01:02:56 +00005963PyObject *
5964PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005965 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005966{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005967 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005968 char *p;
5969 char *q;
5970
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005971#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005972 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005973#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005974 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005975#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00005976
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005977 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005978 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00005979
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005980 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005981 if (repr == NULL)
5982 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005983 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005984 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005985
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005986 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005987 while (size-- > 0) {
5988 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005989#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005990 /* Map 32-bit characters to '\Uxxxxxxxx' */
5991 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005992 *p++ = '\\';
5993 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00005994 *p++ = hexdigits[(ch >> 28) & 0xf];
5995 *p++ = hexdigits[(ch >> 24) & 0xf];
5996 *p++ = hexdigits[(ch >> 20) & 0xf];
5997 *p++ = hexdigits[(ch >> 16) & 0xf];
5998 *p++ = hexdigits[(ch >> 12) & 0xf];
5999 *p++ = hexdigits[(ch >> 8) & 0xf];
6000 *p++ = hexdigits[(ch >> 4) & 0xf];
6001 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006002 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006003 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00006004#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006005 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
6006 if (ch >= 0xD800 && ch < 0xDC00) {
6007 Py_UNICODE ch2;
6008 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006009
Benjamin Peterson29060642009-01-31 22:14:21 +00006010 ch2 = *s++;
6011 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00006012 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006013 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
6014 *p++ = '\\';
6015 *p++ = 'U';
6016 *p++ = hexdigits[(ucs >> 28) & 0xf];
6017 *p++ = hexdigits[(ucs >> 24) & 0xf];
6018 *p++ = hexdigits[(ucs >> 20) & 0xf];
6019 *p++ = hexdigits[(ucs >> 16) & 0xf];
6020 *p++ = hexdigits[(ucs >> 12) & 0xf];
6021 *p++ = hexdigits[(ucs >> 8) & 0xf];
6022 *p++ = hexdigits[(ucs >> 4) & 0xf];
6023 *p++ = hexdigits[ucs & 0xf];
6024 continue;
6025 }
6026 /* Fall through: isolated surrogates are copied as-is */
6027 s--;
6028 size++;
6029 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006030#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00006031 /* Map 16-bit characters to '\uxxxx' */
6032 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006033 *p++ = '\\';
6034 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00006035 *p++ = hexdigits[(ch >> 12) & 0xf];
6036 *p++ = hexdigits[(ch >> 8) & 0xf];
6037 *p++ = hexdigits[(ch >> 4) & 0xf];
6038 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006039 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006040 /* Copy everything else as-is */
6041 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006042 *p++ = (char) ch;
6043 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006044 size = p - q;
6045
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006046 assert(size > 0);
6047 if (_PyBytes_Resize(&repr, size) < 0)
6048 return NULL;
6049 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006050}
6051
Alexander Belopolsky40018472011-02-26 01:02:56 +00006052PyObject *
6053PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006054{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006055 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006056 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00006057 PyErr_BadArgument();
6058 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006059 }
Walter Dörwald711005d2007-05-12 12:03:26 +00006060 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
6061 PyUnicode_GET_SIZE(unicode));
6062
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006063 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006064}
6065
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006066/* --- Unicode Internal Codec ------------------------------------------- */
6067
Alexander Belopolsky40018472011-02-26 01:02:56 +00006068PyObject *
6069_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006070 Py_ssize_t size,
6071 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006072{
6073 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006074 Py_ssize_t startinpos;
6075 Py_ssize_t endinpos;
6076 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006077 PyUnicodeObject *v;
6078 Py_UNICODE *p;
6079 const char *end;
6080 const char *reason;
6081 PyObject *errorHandler = NULL;
6082 PyObject *exc = NULL;
6083
Neal Norwitzd43069c2006-01-08 01:12:10 +00006084#ifdef Py_UNICODE_WIDE
6085 Py_UNICODE unimax = PyUnicode_GetMax();
6086#endif
6087
Thomas Wouters89f507f2006-12-13 04:49:30 +00006088 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006089 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
6090 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006091 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006092 /* Intentionally PyUnicode_GET_SIZE instead of PyUnicode_GET_LENGTH
6093 as string was created with the old API. */
6094 if (PyUnicode_GET_SIZE(v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006095 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006096 p = PyUnicode_AS_UNICODE(v);
6097 end = s + size;
6098
6099 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006100 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006101 /* We have to sanity check the raw data, otherwise doom looms for
6102 some malformed UCS-4 data. */
6103 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006104#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006105 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006106#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006107 end-s < Py_UNICODE_SIZE
6108 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006109 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006110 startinpos = s - starts;
6111 if (end-s < Py_UNICODE_SIZE) {
6112 endinpos = end-starts;
6113 reason = "truncated input";
6114 }
6115 else {
6116 endinpos = s - starts + Py_UNICODE_SIZE;
6117 reason = "illegal code point (> 0x10FFFF)";
6118 }
6119 outpos = p - PyUnicode_AS_UNICODE(v);
6120 if (unicode_decode_call_errorhandler(
6121 errors, &errorHandler,
6122 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006123 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00006124 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006125 goto onError;
6126 }
6127 }
6128 else {
6129 p++;
6130 s += Py_UNICODE_SIZE;
6131 }
6132 }
6133
Victor Stinnerfe226c02011-10-03 03:52:20 +02006134 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006135 goto onError;
6136 Py_XDECREF(errorHandler);
6137 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02006138#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006139 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006140 Py_DECREF(v);
6141 return NULL;
6142 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006143#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006144 return (PyObject *)v;
6145
Benjamin Peterson29060642009-01-31 22:14:21 +00006146 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006147 Py_XDECREF(v);
6148 Py_XDECREF(errorHandler);
6149 Py_XDECREF(exc);
6150 return NULL;
6151}
6152
Guido van Rossumd57fd912000-03-10 22:53:23 +00006153/* --- Latin-1 Codec ------------------------------------------------------ */
6154
Alexander Belopolsky40018472011-02-26 01:02:56 +00006155PyObject *
6156PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006157 Py_ssize_t size,
6158 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006159{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006160 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006161 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006162}
6163
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006164/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006165static void
6166make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006167 const char *encoding,
6168 const Py_UNICODE *unicode, Py_ssize_t size,
6169 Py_ssize_t startpos, Py_ssize_t endpos,
6170 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006171{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006172 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006173 *exceptionObject = PyUnicodeEncodeError_Create(
6174 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006175 }
6176 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006177 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6178 goto onError;
6179 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6180 goto onError;
6181 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6182 goto onError;
6183 return;
6184 onError:
6185 Py_DECREF(*exceptionObject);
6186 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006187 }
6188}
6189
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006190/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006191static void
6192raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006193 const char *encoding,
6194 const Py_UNICODE *unicode, Py_ssize_t size,
6195 Py_ssize_t startpos, Py_ssize_t endpos,
6196 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006197{
6198 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006199 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006200 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006201 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006202}
6203
6204/* error handling callback helper:
6205 build arguments, call the callback and check the arguments,
6206 put the result into newpos and return the replacement string, which
6207 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006208static PyObject *
6209unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006210 PyObject **errorHandler,
6211 const char *encoding, const char *reason,
6212 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
6213 Py_ssize_t startpos, Py_ssize_t endpos,
6214 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006215{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006216 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006217
6218 PyObject *restuple;
6219 PyObject *resunicode;
6220
6221 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006222 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006223 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006224 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006225 }
6226
6227 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006228 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006229 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006230 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006231
6232 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006233 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006234 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006235 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006236 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006237 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006238 Py_DECREF(restuple);
6239 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006240 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006241 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006242 &resunicode, newpos)) {
6243 Py_DECREF(restuple);
6244 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006245 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006246 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6247 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6248 Py_DECREF(restuple);
6249 return NULL;
6250 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006251 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006252 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006253 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006254 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6255 Py_DECREF(restuple);
6256 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006257 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006258 Py_INCREF(resunicode);
6259 Py_DECREF(restuple);
6260 return resunicode;
6261}
6262
Alexander Belopolsky40018472011-02-26 01:02:56 +00006263static PyObject *
6264unicode_encode_ucs1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006265 Py_ssize_t size,
6266 const char *errors,
6267 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006268{
6269 /* output object */
6270 PyObject *res;
6271 /* pointers to the beginning and end+1 of input */
6272 const Py_UNICODE *startp = p;
6273 const Py_UNICODE *endp = p + size;
6274 /* pointer to the beginning of the unencodable characters */
6275 /* const Py_UNICODE *badp = NULL; */
6276 /* pointer into the output */
6277 char *str;
6278 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006279 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006280 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6281 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006282 PyObject *errorHandler = NULL;
6283 PyObject *exc = NULL;
6284 /* the following variable is used for caching string comparisons
6285 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6286 int known_errorHandler = -1;
6287
6288 /* allocate enough for a simple encoding without
6289 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006290 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006291 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006292 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006293 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006294 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006295 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006296 ressize = size;
6297
6298 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006299 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006300
Benjamin Peterson29060642009-01-31 22:14:21 +00006301 /* can we encode this? */
6302 if (c<limit) {
6303 /* no overflow check, because we know that the space is enough */
6304 *str++ = (char)c;
6305 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006306 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006307 else {
6308 Py_ssize_t unicodepos = p-startp;
6309 Py_ssize_t requiredsize;
6310 PyObject *repunicode;
6311 Py_ssize_t repsize;
6312 Py_ssize_t newpos;
6313 Py_ssize_t respos;
6314 Py_UNICODE *uni2;
6315 /* startpos for collecting unencodable chars */
6316 const Py_UNICODE *collstart = p;
6317 const Py_UNICODE *collend = p;
6318 /* find all unecodable characters */
6319 while ((collend < endp) && ((*collend)>=limit))
6320 ++collend;
6321 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6322 if (known_errorHandler==-1) {
6323 if ((errors==NULL) || (!strcmp(errors, "strict")))
6324 known_errorHandler = 1;
6325 else if (!strcmp(errors, "replace"))
6326 known_errorHandler = 2;
6327 else if (!strcmp(errors, "ignore"))
6328 known_errorHandler = 3;
6329 else if (!strcmp(errors, "xmlcharrefreplace"))
6330 known_errorHandler = 4;
6331 else
6332 known_errorHandler = 0;
6333 }
6334 switch (known_errorHandler) {
6335 case 1: /* strict */
6336 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
6337 goto onError;
6338 case 2: /* replace */
6339 while (collstart++<collend)
6340 *str++ = '?'; /* fall through */
6341 case 3: /* ignore */
6342 p = collend;
6343 break;
6344 case 4: /* xmlcharrefreplace */
6345 respos = str - PyBytes_AS_STRING(res);
6346 /* determine replacement size (temporarily (mis)uses p) */
6347 for (p = collstart, repsize = 0; p < collend; ++p) {
6348 if (*p<10)
6349 repsize += 2+1+1;
6350 else if (*p<100)
6351 repsize += 2+2+1;
6352 else if (*p<1000)
6353 repsize += 2+3+1;
6354 else if (*p<10000)
6355 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006356#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006357 else
6358 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006359#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006360 else if (*p<100000)
6361 repsize += 2+5+1;
6362 else if (*p<1000000)
6363 repsize += 2+6+1;
6364 else
6365 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00006366#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00006367 }
6368 requiredsize = respos+repsize+(endp-collend);
6369 if (requiredsize > ressize) {
6370 if (requiredsize<2*ressize)
6371 requiredsize = 2*ressize;
6372 if (_PyBytes_Resize(&res, requiredsize))
6373 goto onError;
6374 str = PyBytes_AS_STRING(res) + respos;
6375 ressize = requiredsize;
6376 }
6377 /* generate replacement (temporarily (mis)uses p) */
6378 for (p = collstart; p < collend; ++p) {
6379 str += sprintf(str, "&#%d;", (int)*p);
6380 }
6381 p = collend;
6382 break;
6383 default:
6384 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6385 encoding, reason, startp, size, &exc,
6386 collstart-startp, collend-startp, &newpos);
6387 if (repunicode == NULL)
6388 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006389 if (PyBytes_Check(repunicode)) {
6390 /* Directly copy bytes result to output. */
6391 repsize = PyBytes_Size(repunicode);
6392 if (repsize > 1) {
6393 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006394 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006395 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6396 Py_DECREF(repunicode);
6397 goto onError;
6398 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006399 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006400 ressize += repsize-1;
6401 }
6402 memcpy(str, PyBytes_AsString(repunicode), repsize);
6403 str += repsize;
6404 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006405 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006406 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006407 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006408 /* need more space? (at least enough for what we
6409 have+the replacement+the rest of the string, so
6410 we won't have to check space for encodable characters) */
6411 respos = str - PyBytes_AS_STRING(res);
6412 repsize = PyUnicode_GET_SIZE(repunicode);
6413 requiredsize = respos+repsize+(endp-collend);
6414 if (requiredsize > ressize) {
6415 if (requiredsize<2*ressize)
6416 requiredsize = 2*ressize;
6417 if (_PyBytes_Resize(&res, requiredsize)) {
6418 Py_DECREF(repunicode);
6419 goto onError;
6420 }
6421 str = PyBytes_AS_STRING(res) + respos;
6422 ressize = requiredsize;
6423 }
6424 /* check if there is anything unencodable in the replacement
6425 and copy it to the output */
6426 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
6427 c = *uni2;
6428 if (c >= limit) {
6429 raise_encode_exception(&exc, encoding, startp, size,
6430 unicodepos, unicodepos+1, reason);
6431 Py_DECREF(repunicode);
6432 goto onError;
6433 }
6434 *str = (char)c;
6435 }
6436 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006437 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006438 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006439 }
6440 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006441 /* Resize if we allocated to much */
6442 size = str - PyBytes_AS_STRING(res);
6443 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006444 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006445 if (_PyBytes_Resize(&res, size) < 0)
6446 goto onError;
6447 }
6448
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006449 Py_XDECREF(errorHandler);
6450 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006451 return res;
6452
6453 onError:
6454 Py_XDECREF(res);
6455 Py_XDECREF(errorHandler);
6456 Py_XDECREF(exc);
6457 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006458}
6459
Alexander Belopolsky40018472011-02-26 01:02:56 +00006460PyObject *
6461PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006462 Py_ssize_t size,
6463 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006464{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006465 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006466}
6467
Alexander Belopolsky40018472011-02-26 01:02:56 +00006468PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006469_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006470{
6471 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006472 PyErr_BadArgument();
6473 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006474 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006475 if (PyUnicode_READY(unicode) == -1)
6476 return NULL;
6477 /* Fast path: if it is a one-byte string, construct
6478 bytes object directly. */
6479 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6480 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6481 PyUnicode_GET_LENGTH(unicode));
6482 /* Non-Latin-1 characters present. Defer to above function to
6483 raise the exception. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006484 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006485 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006486 errors);
6487}
6488
6489PyObject*
6490PyUnicode_AsLatin1String(PyObject *unicode)
6491{
6492 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006493}
6494
6495/* --- 7-bit ASCII Codec -------------------------------------------------- */
6496
Alexander Belopolsky40018472011-02-26 01:02:56 +00006497PyObject *
6498PyUnicode_DecodeASCII(const char *s,
6499 Py_ssize_t size,
6500 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006501{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006502 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006503 PyUnicodeObject *v;
Victor Stinner702c7342011-10-05 13:50:52 +02006504 Py_UNICODE *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006505 Py_ssize_t startinpos;
6506 Py_ssize_t endinpos;
6507 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006508 const char *e;
Victor Stinner702c7342011-10-05 13:50:52 +02006509 int has_error;
6510 const unsigned char *p = (const unsigned char *)s;
6511 const unsigned char *end = p + size;
6512 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006513 PyObject *errorHandler = NULL;
6514 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006515
Guido van Rossumd57fd912000-03-10 22:53:23 +00006516 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006517 if (size == 1 && (unsigned char)s[0] < 128)
6518 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006519
Victor Stinner702c7342011-10-05 13:50:52 +02006520 has_error = 0;
6521 while (p < end && !has_error) {
6522 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
6523 an explanation. */
6524 if (!((size_t) p & LONG_PTR_MASK)) {
6525 /* Help register allocation */
6526 register const unsigned char *_p = p;
6527 while (_p < aligned_end) {
6528 unsigned long value = *(unsigned long *) _p;
6529 if (value & ASCII_CHAR_MASK) {
6530 has_error = 1;
6531 break;
6532 }
6533 _p += SIZEOF_LONG;
6534 }
6535 if (_p == end)
6536 break;
6537 if (has_error)
6538 break;
6539 p = _p;
6540 }
6541 if (*p & 0x80) {
6542 has_error = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006543 break;
Victor Stinner702c7342011-10-05 13:50:52 +02006544 }
6545 else {
6546 ++p;
6547 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006548 }
Victor Stinner702c7342011-10-05 13:50:52 +02006549 if (!has_error)
6550 return unicode_fromascii((const unsigned char *)s, size);
Tim Petersced69f82003-09-16 20:30:58 +00006551
Guido van Rossumd57fd912000-03-10 22:53:23 +00006552 v = _PyUnicode_New(size);
6553 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006554 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006555 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006556 return (PyObject *)v;
Victor Stinner702c7342011-10-05 13:50:52 +02006557 u = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006558 e = s + size;
6559 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006560 register unsigned char c = (unsigned char)*s;
6561 if (c < 128) {
Victor Stinner702c7342011-10-05 13:50:52 +02006562 *u++ = c;
Benjamin Peterson29060642009-01-31 22:14:21 +00006563 ++s;
6564 }
6565 else {
6566 startinpos = s-starts;
6567 endinpos = startinpos + 1;
Victor Stinner702c7342011-10-05 13:50:52 +02006568 outpos = u - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006569 if (unicode_decode_call_errorhandler(
6570 errors, &errorHandler,
6571 "ascii", "ordinal not in range(128)",
6572 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinner702c7342011-10-05 13:50:52 +02006573 &v, &outpos, &u))
Benjamin Peterson29060642009-01-31 22:14:21 +00006574 goto onError;
6575 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006576 }
Victor Stinner702c7342011-10-05 13:50:52 +02006577 if (u - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
6578 if (PyUnicode_Resize((PyObject**)&v, u - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006579 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006580 Py_XDECREF(errorHandler);
6581 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02006582#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006583 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006584 Py_DECREF(v);
6585 return NULL;
6586 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006587#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00006588 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006589
Benjamin Peterson29060642009-01-31 22:14:21 +00006590 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006591 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006592 Py_XDECREF(errorHandler);
6593 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006594 return NULL;
6595}
6596
Alexander Belopolsky40018472011-02-26 01:02:56 +00006597PyObject *
6598PyUnicode_EncodeASCII(const Py_UNICODE *p,
6599 Py_ssize_t size,
6600 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006601{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006602 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006603}
6604
Alexander Belopolsky40018472011-02-26 01:02:56 +00006605PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006606_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006607{
6608 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006609 PyErr_BadArgument();
6610 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006611 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006612 if (PyUnicode_READY(unicode) == -1)
6613 return NULL;
6614 /* Fast path: if it is an ASCII-only string, construct bytes object
6615 directly. Else defer to above function to raise the exception. */
6616 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6617 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6618 PyUnicode_GET_LENGTH(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006619 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006620 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006621 errors);
6622}
6623
6624PyObject *
6625PyUnicode_AsASCIIString(PyObject *unicode)
6626{
6627 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006628}
6629
Victor Stinner99b95382011-07-04 14:23:54 +02006630#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006631
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006632/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006633
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006634#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006635#define NEED_RETRY
6636#endif
6637
6638/* XXX This code is limited to "true" double-byte encodings, as
6639 a) it assumes an incomplete character consists of a single byte, and
6640 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00006641 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006642
/* Return non-zero if the byte at s[offset] should be treated as a DBCS
   lead byte, 0 otherwise.  Uses IsDBCSLeadByte() on the byte itself and
   CharPrev() to inspect the preceding character. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *here = s + offset;
    const char *before;

    if (!IsDBCSLeadByte(*here))
        return 0;

    before = CharPrev(s, here);
    if (before == here)
        /* CharPrev() did not move */
        return 1;
    if (!IsDBCSLeadByte(*before))
        return 1;
    /* The previous character starts two bytes back */
    return (here - before == 2);
}
6654
6655/*
6656 * Decode MBCS string into unicode object. If 'final' is set, converts
6657 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
6658 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006659static int
6660decode_mbcs(PyUnicodeObject **v,
6661 const char *s, /* MBCS string */
6662 int size, /* sizeof MBCS string */
6663 int final,
6664 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006665{
6666 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00006667 Py_ssize_t n;
6668 DWORD usize;
6669 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006670
6671 assert(size >= 0);
6672
Victor Stinner554f3f02010-06-16 23:33:54 +00006673 /* check and handle 'errors' arg */
6674 if (errors==NULL || strcmp(errors, "strict")==0)
6675 flags = MB_ERR_INVALID_CHARS;
6676 else if (strcmp(errors, "ignore")==0)
6677 flags = 0;
6678 else {
6679 PyErr_Format(PyExc_ValueError,
6680 "mbcs encoding does not support errors='%s'",
6681 errors);
6682 return -1;
6683 }
6684
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006685 /* Skip trailing lead-byte unless 'final' is set */
6686 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006687 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006688
6689 /* First get the size of the result */
6690 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006691 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
6692 if (usize==0)
6693 goto mbcs_decode_error;
6694 } else
6695 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006696
6697 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006698 /* Create unicode object */
6699 *v = _PyUnicode_New(usize);
6700 if (*v == NULL)
6701 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006702 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006703 }
6704 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006705 /* Extend unicode object */
6706 n = PyUnicode_GET_SIZE(*v);
Victor Stinner2fd82272011-10-03 04:06:05 +02006707 if (PyUnicode_Resize((PyObject**)v, n + usize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006708 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006709 }
6710
6711 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00006712 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006713 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006714 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
6715 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00006716 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006717 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006718 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00006719
6720mbcs_decode_error:
6721 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
6722 we raise a UnicodeDecodeError - else it is a 'generic'
6723 windows error
6724 */
6725 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
6726 /* Ideally, we should get reason from FormatMessage - this
6727 is the Windows 2000 English version of the message
6728 */
6729 PyObject *exc = NULL;
6730 const char *reason = "No mapping for the Unicode character exists "
6731 "in the target multi-byte code page.";
6732 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
6733 if (exc != NULL) {
6734 PyCodec_StrictErrors(exc);
6735 Py_DECREF(exc);
6736 }
6737 } else {
6738 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6739 }
6740 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006741}
6742
Alexander Belopolsky40018472011-02-26 01:02:56 +00006743PyObject *
6744PyUnicode_DecodeMBCSStateful(const char *s,
6745 Py_ssize_t size,
6746 const char *errors,
6747 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006748{
6749 PyUnicodeObject *v = NULL;
6750 int done;
6751
6752 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006753 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006754
6755#ifdef NEED_RETRY
6756 retry:
6757 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006758 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006759 else
6760#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006761 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006762
6763 if (done < 0) {
6764 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006765 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006766 }
6767
6768 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006769 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006770
6771#ifdef NEED_RETRY
6772 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006773 s += done;
6774 size -= done;
6775 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006776 }
6777#endif
Victor Stinner17efeed2011-10-04 20:05:46 +02006778#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006779 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006780 Py_DECREF(v);
6781 return NULL;
6782 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006783#endif
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006784 return (PyObject *)v;
6785}
6786
Alexander Belopolsky40018472011-02-26 01:02:56 +00006787PyObject *
6788PyUnicode_DecodeMBCS(const char *s,
6789 Py_ssize_t size,
6790 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006791{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006792 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
6793}
6794
6795/*
6796 * Convert unicode into string object (MBCS).
6797 * Returns 0 if succeed, -1 otherwise.
6798 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006799static int
6800encode_mbcs(PyObject **repr,
6801 const Py_UNICODE *p, /* unicode */
6802 int size, /* size of unicode */
6803 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006804{
Victor Stinner554f3f02010-06-16 23:33:54 +00006805 BOOL usedDefaultChar = FALSE;
6806 BOOL *pusedDefaultChar;
6807 int mbcssize;
6808 Py_ssize_t n;
6809 PyObject *exc = NULL;
6810 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006811
6812 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006813
Victor Stinner554f3f02010-06-16 23:33:54 +00006814 /* check and handle 'errors' arg */
6815 if (errors==NULL || strcmp(errors, "strict")==0) {
6816 flags = WC_NO_BEST_FIT_CHARS;
6817 pusedDefaultChar = &usedDefaultChar;
6818 } else if (strcmp(errors, "replace")==0) {
6819 flags = 0;
6820 pusedDefaultChar = NULL;
6821 } else {
6822 PyErr_Format(PyExc_ValueError,
6823 "mbcs encoding does not support errors='%s'",
6824 errors);
6825 return -1;
6826 }
6827
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006828 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006829 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006830 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
6831 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00006832 if (mbcssize == 0) {
6833 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6834 return -1;
6835 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006836 /* If we used a default char, then we failed! */
6837 if (pusedDefaultChar && *pusedDefaultChar)
6838 goto mbcs_encode_error;
6839 } else {
6840 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006841 }
6842
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006843 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006844 /* Create string object */
6845 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
6846 if (*repr == NULL)
6847 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006848 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006849 }
6850 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006851 /* Extend string object */
6852 n = PyBytes_Size(*repr);
6853 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
6854 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006855 }
6856
6857 /* Do the conversion */
6858 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006859 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006860 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
6861 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006862 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6863 return -1;
6864 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006865 if (pusedDefaultChar && *pusedDefaultChar)
6866 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006867 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006868 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00006869
6870mbcs_encode_error:
6871 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
6872 Py_XDECREF(exc);
6873 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006874}
6875
Alexander Belopolsky40018472011-02-26 01:02:56 +00006876PyObject *
6877PyUnicode_EncodeMBCS(const Py_UNICODE *p,
6878 Py_ssize_t size,
6879 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006880{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006881 PyObject *repr = NULL;
6882 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00006883
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006884#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00006885 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006886 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006887 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006888 else
6889#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006890 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006891
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006892 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006893 Py_XDECREF(repr);
6894 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006895 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006896
6897#ifdef NEED_RETRY
6898 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006899 p += INT_MAX;
6900 size -= INT_MAX;
6901 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006902 }
6903#endif
6904
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006905 return repr;
6906}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006907
Alexander Belopolsky40018472011-02-26 01:02:56 +00006908PyObject *
6909PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006910{
6911 if (!PyUnicode_Check(unicode)) {
6912 PyErr_BadArgument();
6913 return NULL;
6914 }
6915 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006916 PyUnicode_GET_SIZE(unicode),
6917 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006918}
6919
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006920#undef NEED_RETRY
6921
Victor Stinner99b95382011-07-04 14:23:54 +02006922#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006923
Guido van Rossumd57fd912000-03-10 22:53:23 +00006924/* --- Character Mapping Codec -------------------------------------------- */
6925
Alexander Belopolsky40018472011-02-26 01:02:56 +00006926PyObject *
6927PyUnicode_DecodeCharmap(const char *s,
6928 Py_ssize_t size,
6929 PyObject *mapping,
6930 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006931{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006932 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006933 Py_ssize_t startinpos;
6934 Py_ssize_t endinpos;
6935 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006936 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006937 PyUnicodeObject *v;
6938 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006939 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006940 PyObject *errorHandler = NULL;
6941 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006942 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006943 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006944
Guido van Rossumd57fd912000-03-10 22:53:23 +00006945 /* Default to Latin-1 */
6946 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006947 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006948
6949 v = _PyUnicode_New(size);
6950 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006951 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006952 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006953 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006954 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006955 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006956 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006957 mapstring = PyUnicode_AS_UNICODE(mapping);
6958 maplen = PyUnicode_GET_SIZE(mapping);
6959 while (s < e) {
6960 unsigned char ch = *s;
6961 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006962
Benjamin Peterson29060642009-01-31 22:14:21 +00006963 if (ch < maplen)
6964 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006965
Benjamin Peterson29060642009-01-31 22:14:21 +00006966 if (x == 0xfffe) {
6967 /* undefined mapping */
6968 outpos = p-PyUnicode_AS_UNICODE(v);
6969 startinpos = s-starts;
6970 endinpos = startinpos+1;
6971 if (unicode_decode_call_errorhandler(
6972 errors, &errorHandler,
6973 "charmap", "character maps to <undefined>",
6974 &starts, &e, &startinpos, &endinpos, &exc, &s,
6975 &v, &outpos, &p)) {
6976 goto onError;
6977 }
6978 continue;
6979 }
6980 *p++ = x;
6981 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006982 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006983 }
6984 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006985 while (s < e) {
6986 unsigned char ch = *s;
6987 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006988
Benjamin Peterson29060642009-01-31 22:14:21 +00006989 /* Get mapping (char ordinal -> integer, Unicode char or None) */
6990 w = PyLong_FromLong((long)ch);
6991 if (w == NULL)
6992 goto onError;
6993 x = PyObject_GetItem(mapping, w);
6994 Py_DECREF(w);
6995 if (x == NULL) {
6996 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
6997 /* No mapping found means: mapping is undefined. */
6998 PyErr_Clear();
6999 x = Py_None;
7000 Py_INCREF(x);
7001 } else
7002 goto onError;
7003 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007004
Benjamin Peterson29060642009-01-31 22:14:21 +00007005 /* Apply mapping */
7006 if (PyLong_Check(x)) {
7007 long value = PyLong_AS_LONG(x);
7008 if (value < 0 || value > 65535) {
7009 PyErr_SetString(PyExc_TypeError,
7010 "character mapping must be in range(65536)");
7011 Py_DECREF(x);
7012 goto onError;
7013 }
7014 *p++ = (Py_UNICODE)value;
7015 }
7016 else if (x == Py_None) {
7017 /* undefined mapping */
7018 outpos = p-PyUnicode_AS_UNICODE(v);
7019 startinpos = s-starts;
7020 endinpos = startinpos+1;
7021 if (unicode_decode_call_errorhandler(
7022 errors, &errorHandler,
7023 "charmap", "character maps to <undefined>",
7024 &starts, &e, &startinpos, &endinpos, &exc, &s,
7025 &v, &outpos, &p)) {
7026 Py_DECREF(x);
7027 goto onError;
7028 }
7029 Py_DECREF(x);
7030 continue;
7031 }
7032 else if (PyUnicode_Check(x)) {
7033 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007034
Benjamin Peterson29060642009-01-31 22:14:21 +00007035 if (targetsize == 1)
7036 /* 1-1 mapping */
7037 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007038
Benjamin Peterson29060642009-01-31 22:14:21 +00007039 else if (targetsize > 1) {
7040 /* 1-n mapping */
7041 if (targetsize > extrachars) {
7042 /* resize first */
7043 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
7044 Py_ssize_t needed = (targetsize - extrachars) + \
7045 (targetsize << 2);
7046 extrachars += needed;
7047 /* XXX overflow detection missing */
Victor Stinnerfe226c02011-10-03 03:52:20 +02007048 if (PyUnicode_Resize((PyObject**)&v,
Benjamin Peterson29060642009-01-31 22:14:21 +00007049 PyUnicode_GET_SIZE(v) + needed) < 0) {
7050 Py_DECREF(x);
7051 goto onError;
7052 }
7053 p = PyUnicode_AS_UNICODE(v) + oldpos;
7054 }
7055 Py_UNICODE_COPY(p,
7056 PyUnicode_AS_UNICODE(x),
7057 targetsize);
7058 p += targetsize;
7059 extrachars -= targetsize;
7060 }
7061 /* 1-0 mapping: skip the character */
7062 }
7063 else {
7064 /* wrong return value */
7065 PyErr_SetString(PyExc_TypeError,
7066 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007067 Py_DECREF(x);
7068 goto onError;
7069 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007070 Py_DECREF(x);
7071 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007072 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007073 }
7074 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Victor Stinnerfe226c02011-10-03 03:52:20 +02007075 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007076 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007077 Py_XDECREF(errorHandler);
7078 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02007079#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02007080 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007081 Py_DECREF(v);
7082 return NULL;
7083 }
Victor Stinner17efeed2011-10-04 20:05:46 +02007084#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00007085 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00007086
Benjamin Peterson29060642009-01-31 22:14:21 +00007087 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007088 Py_XDECREF(errorHandler);
7089 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007090 Py_XDECREF(v);
7091 return NULL;
7092}
7093
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007094/* Charmap encoding: the lookup table */
7095
Alexander Belopolsky40018472011-02-26 01:02:56 +00007096struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007097 PyObject_HEAD
7098 unsigned char level1[32];
7099 int count2, count3;
7100 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007101};
7102
7103static PyObject*
7104encoding_map_size(PyObject *obj, PyObject* args)
7105{
7106 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007107 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007108 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007109}
7110
7111static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007112 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007113 PyDoc_STR("Return the size (in bytes) of this object") },
7114 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007115};
7116
7117static void
7118encoding_map_dealloc(PyObject* o)
7119{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007120 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007121}
7122
7123static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007124 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007125 "EncodingMap", /*tp_name*/
7126 sizeof(struct encoding_map), /*tp_basicsize*/
7127 0, /*tp_itemsize*/
7128 /* methods */
7129 encoding_map_dealloc, /*tp_dealloc*/
7130 0, /*tp_print*/
7131 0, /*tp_getattr*/
7132 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007133 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007134 0, /*tp_repr*/
7135 0, /*tp_as_number*/
7136 0, /*tp_as_sequence*/
7137 0, /*tp_as_mapping*/
7138 0, /*tp_hash*/
7139 0, /*tp_call*/
7140 0, /*tp_str*/
7141 0, /*tp_getattro*/
7142 0, /*tp_setattro*/
7143 0, /*tp_as_buffer*/
7144 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7145 0, /*tp_doc*/
7146 0, /*tp_traverse*/
7147 0, /*tp_clear*/
7148 0, /*tp_richcompare*/
7149 0, /*tp_weaklistoffset*/
7150 0, /*tp_iter*/
7151 0, /*tp_iternext*/
7152 encoding_map_methods, /*tp_methods*/
7153 0, /*tp_members*/
7154 0, /*tp_getset*/
7155 0, /*tp_base*/
7156 0, /*tp_dict*/
7157 0, /*tp_descr_get*/
7158 0, /*tp_descr_set*/
7159 0, /*tp_dictoffset*/
7160 0, /*tp_init*/
7161 0, /*tp_alloc*/
7162 0, /*tp_new*/
7163 0, /*tp_free*/
7164 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007165};
7166
7167PyObject*
7168PyUnicode_BuildEncodingMap(PyObject* string)
7169{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007170 PyObject *result;
7171 struct encoding_map *mresult;
7172 int i;
7173 int need_dict = 0;
7174 unsigned char level1[32];
7175 unsigned char level2[512];
7176 unsigned char *mlevel1, *mlevel2, *mlevel3;
7177 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007178 int kind;
7179 void *data;
7180 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007181
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007182 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007183 PyErr_BadArgument();
7184 return NULL;
7185 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007186 kind = PyUnicode_KIND(string);
7187 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007188 memset(level1, 0xFF, sizeof level1);
7189 memset(level2, 0xFF, sizeof level2);
7190
7191 /* If there isn't a one-to-one mapping of NULL to \0,
7192 or if there are non-BMP characters, we need to use
7193 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007194 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007195 need_dict = 1;
7196 for (i = 1; i < 256; i++) {
7197 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007198 ch = PyUnicode_READ(kind, data, i);
7199 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007200 need_dict = 1;
7201 break;
7202 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007203 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007204 /* unmapped character */
7205 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007206 l1 = ch >> 11;
7207 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007208 if (level1[l1] == 0xFF)
7209 level1[l1] = count2++;
7210 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007211 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007212 }
7213
7214 if (count2 >= 0xFF || count3 >= 0xFF)
7215 need_dict = 1;
7216
7217 if (need_dict) {
7218 PyObject *result = PyDict_New();
7219 PyObject *key, *value;
7220 if (!result)
7221 return NULL;
7222 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007223 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007224 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007225 if (!key || !value)
7226 goto failed1;
7227 if (PyDict_SetItem(result, key, value) == -1)
7228 goto failed1;
7229 Py_DECREF(key);
7230 Py_DECREF(value);
7231 }
7232 return result;
7233 failed1:
7234 Py_XDECREF(key);
7235 Py_XDECREF(value);
7236 Py_DECREF(result);
7237 return NULL;
7238 }
7239
7240 /* Create a three-level trie */
7241 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7242 16*count2 + 128*count3 - 1);
7243 if (!result)
7244 return PyErr_NoMemory();
7245 PyObject_Init(result, &EncodingMapType);
7246 mresult = (struct encoding_map*)result;
7247 mresult->count2 = count2;
7248 mresult->count3 = count3;
7249 mlevel1 = mresult->level1;
7250 mlevel2 = mresult->level23;
7251 mlevel3 = mresult->level23 + 16*count2;
7252 memcpy(mlevel1, level1, 32);
7253 memset(mlevel2, 0xFF, 16*count2);
7254 memset(mlevel3, 0, 128*count3);
7255 count3 = 0;
7256 for (i = 1; i < 256; i++) {
7257 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007258 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007259 /* unmapped character */
7260 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007261 o1 = PyUnicode_READ(kind, data, i)>>11;
7262 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007263 i2 = 16*mlevel1[o1] + o2;
7264 if (mlevel2[i2] == 0xFF)
7265 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007266 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007267 i3 = 128*mlevel2[i2] + o3;
7268 mlevel3[i3] = i;
7269 }
7270 return result;
7271}
7272
7273static int
7274encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
7275{
7276 struct encoding_map *map = (struct encoding_map*)mapping;
7277 int l1 = c>>11;
7278 int l2 = (c>>7) & 0xF;
7279 int l3 = c & 0x7F;
7280 int i;
7281
7282#ifdef Py_UNICODE_WIDE
7283 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007284 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007285 }
7286#endif
7287 if (c == 0)
7288 return 0;
7289 /* level 1*/
7290 i = map->level1[l1];
7291 if (i == 0xFF) {
7292 return -1;
7293 }
7294 /* level 2*/
7295 i = map->level23[16*i+l2];
7296 if (i == 0xFF) {
7297 return -1;
7298 }
7299 /* level 3 */
7300 i = map->level23[16*map->count2 + 128*i + l3];
7301 if (i == 0) {
7302 return -1;
7303 }
7304 return i;
7305}
7306
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007307/* Lookup the character ch in the mapping. If the character
7308 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007309 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007310static PyObject *
7311charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007312{
Christian Heimes217cfd12007-12-02 14:31:20 +00007313 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007314 PyObject *x;
7315
7316 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007317 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007318 x = PyObject_GetItem(mapping, w);
7319 Py_DECREF(w);
7320 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007321 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7322 /* No mapping found means: mapping is undefined. */
7323 PyErr_Clear();
7324 x = Py_None;
7325 Py_INCREF(x);
7326 return x;
7327 } else
7328 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007329 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007330 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007331 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007332 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007333 long value = PyLong_AS_LONG(x);
7334 if (value < 0 || value > 255) {
7335 PyErr_SetString(PyExc_TypeError,
7336 "character mapping must be in range(256)");
7337 Py_DECREF(x);
7338 return NULL;
7339 }
7340 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007341 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007342 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007343 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007344 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007345 /* wrong return value */
7346 PyErr_Format(PyExc_TypeError,
7347 "character mapping must return integer, bytes or None, not %.400s",
7348 x->ob_type->tp_name);
7349 Py_DECREF(x);
7350 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007351 }
7352}
7353
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007354static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00007355charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007356{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007357 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7358 /* exponentially overallocate to minimize reallocations */
7359 if (requiredsize < 2*outsize)
7360 requiredsize = 2*outsize;
7361 if (_PyBytes_Resize(outobj, requiredsize))
7362 return -1;
7363 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007364}
7365
/* Outcome of trying to encode a single character through a character map. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the replacement was written to the output */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* a lookup or resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007369/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00007370 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007371 space is available. Return a new reference to the object that
7372 was put in the output buffer, or Py_None, if the mapping was undefined
7373 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00007374 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007375static charmapencode_result
7376charmapencode_output(Py_UNICODE c, PyObject *mapping,
7377 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007378{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007379 PyObject *rep;
7380 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00007381 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007382
Christian Heimes90aa7642007-12-19 02:45:37 +00007383 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007384 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007385 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007386 if (res == -1)
7387 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00007388 if (outsize<requiredsize)
7389 if (charmapencode_resize(outobj, outpos, requiredsize))
7390 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00007391 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007392 outstart[(*outpos)++] = (char)res;
7393 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007394 }
7395
7396 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007397 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007398 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007399 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007400 Py_DECREF(rep);
7401 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007402 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007403 if (PyLong_Check(rep)) {
7404 Py_ssize_t requiredsize = *outpos+1;
7405 if (outsize<requiredsize)
7406 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7407 Py_DECREF(rep);
7408 return enc_EXCEPTION;
7409 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007410 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007411 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007412 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007413 else {
7414 const char *repchars = PyBytes_AS_STRING(rep);
7415 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
7416 Py_ssize_t requiredsize = *outpos+repsize;
7417 if (outsize<requiredsize)
7418 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7419 Py_DECREF(rep);
7420 return enc_EXCEPTION;
7421 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007422 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007423 memcpy(outstart + *outpos, repchars, repsize);
7424 *outpos += repsize;
7425 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007426 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007427 Py_DECREF(rep);
7428 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007429}
7430
7431/* handle an error in PyUnicode_EncodeCharmap
7432 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007433static int
7434charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00007435 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007436 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00007437 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00007438 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007439{
7440 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007441 Py_ssize_t repsize;
7442 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007443 Py_UNICODE *uni2;
7444 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007445 Py_ssize_t collstartpos = *inpos;
7446 Py_ssize_t collendpos = *inpos+1;
7447 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007448 char *encoding = "charmap";
7449 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007450 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007451
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007452 /* find all unencodable characters */
7453 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007454 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00007455 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007456 int res = encoding_map_lookup(p[collendpos], mapping);
7457 if (res != -1)
7458 break;
7459 ++collendpos;
7460 continue;
7461 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007462
Benjamin Peterson29060642009-01-31 22:14:21 +00007463 rep = charmapencode_lookup(p[collendpos], mapping);
7464 if (rep==NULL)
7465 return -1;
7466 else if (rep!=Py_None) {
7467 Py_DECREF(rep);
7468 break;
7469 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007470 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00007471 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007472 }
7473 /* cache callback name lookup
7474 * (if not done yet, i.e. it's the first error) */
7475 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007476 if ((errors==NULL) || (!strcmp(errors, "strict")))
7477 *known_errorHandler = 1;
7478 else if (!strcmp(errors, "replace"))
7479 *known_errorHandler = 2;
7480 else if (!strcmp(errors, "ignore"))
7481 *known_errorHandler = 3;
7482 else if (!strcmp(errors, "xmlcharrefreplace"))
7483 *known_errorHandler = 4;
7484 else
7485 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007486 }
7487 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007488 case 1: /* strict */
7489 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7490 return -1;
7491 case 2: /* replace */
7492 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007493 x = charmapencode_output('?', mapping, res, respos);
7494 if (x==enc_EXCEPTION) {
7495 return -1;
7496 }
7497 else if (x==enc_FAILED) {
7498 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7499 return -1;
7500 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007501 }
7502 /* fall through */
7503 case 3: /* ignore */
7504 *inpos = collendpos;
7505 break;
7506 case 4: /* xmlcharrefreplace */
7507 /* generate replacement (temporarily (mis)uses p) */
7508 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007509 char buffer[2+29+1+1];
7510 char *cp;
7511 sprintf(buffer, "&#%d;", (int)p[collpos]);
7512 for (cp = buffer; *cp; ++cp) {
7513 x = charmapencode_output(*cp, mapping, res, respos);
7514 if (x==enc_EXCEPTION)
7515 return -1;
7516 else if (x==enc_FAILED) {
7517 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7518 return -1;
7519 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007520 }
7521 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007522 *inpos = collendpos;
7523 break;
7524 default:
7525 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00007526 encoding, reason, p, size, exceptionObject,
7527 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007528 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007529 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00007530 if (PyBytes_Check(repunicode)) {
7531 /* Directly copy bytes result to output. */
7532 Py_ssize_t outsize = PyBytes_Size(*res);
7533 Py_ssize_t requiredsize;
7534 repsize = PyBytes_Size(repunicode);
7535 requiredsize = *respos + repsize;
7536 if (requiredsize > outsize)
7537 /* Make room for all additional bytes. */
7538 if (charmapencode_resize(res, respos, requiredsize)) {
7539 Py_DECREF(repunicode);
7540 return -1;
7541 }
7542 memcpy(PyBytes_AsString(*res) + *respos,
7543 PyBytes_AsString(repunicode), repsize);
7544 *respos += repsize;
7545 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007546 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00007547 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007548 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007549 /* generate replacement */
7550 repsize = PyUnicode_GET_SIZE(repunicode);
7551 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007552 x = charmapencode_output(*uni2, mapping, res, respos);
7553 if (x==enc_EXCEPTION) {
7554 return -1;
7555 }
7556 else if (x==enc_FAILED) {
7557 Py_DECREF(repunicode);
7558 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7559 return -1;
7560 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007561 }
7562 *inpos = newpos;
7563 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007564 }
7565 return 0;
7566}
7567
Alexander Belopolsky40018472011-02-26 01:02:56 +00007568PyObject *
7569PyUnicode_EncodeCharmap(const Py_UNICODE *p,
7570 Py_ssize_t size,
7571 PyObject *mapping,
7572 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007573{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007574 /* output object */
7575 PyObject *res = NULL;
7576 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007577 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007578 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007579 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007580 PyObject *errorHandler = NULL;
7581 PyObject *exc = NULL;
7582 /* the following variable is used for caching string comparisons
7583 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7584 * 3=ignore, 4=xmlcharrefreplace */
7585 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007586
7587 /* Default to Latin-1 */
7588 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007589 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007590
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007591 /* allocate enough for a simple encoding without
7592 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00007593 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007594 if (res == NULL)
7595 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00007596 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007597 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007598
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007599 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007600 /* try to encode it */
7601 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
7602 if (x==enc_EXCEPTION) /* error */
7603 goto onError;
7604 if (x==enc_FAILED) { /* unencodable character */
7605 if (charmap_encoding_error(p, size, &inpos, mapping,
7606 &exc,
7607 &known_errorHandler, &errorHandler, errors,
7608 &res, &respos)) {
7609 goto onError;
7610 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007611 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007612 else
7613 /* done with this character => adjust input position */
7614 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007615 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007616
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007617 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00007618 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007619 if (_PyBytes_Resize(&res, respos) < 0)
7620 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00007621
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007622 Py_XDECREF(exc);
7623 Py_XDECREF(errorHandler);
7624 return res;
7625
Benjamin Peterson29060642009-01-31 22:14:21 +00007626 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007627 Py_XDECREF(res);
7628 Py_XDECREF(exc);
7629 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007630 return NULL;
7631}
7632
Alexander Belopolsky40018472011-02-26 01:02:56 +00007633PyObject *
7634PyUnicode_AsCharmapString(PyObject *unicode,
7635 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007636{
7637 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007638 PyErr_BadArgument();
7639 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007640 }
7641 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00007642 PyUnicode_GET_SIZE(unicode),
7643 mapping,
7644 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007645}
7646
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007647/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007648static void
7649make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007650 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007651 Py_ssize_t startpos, Py_ssize_t endpos,
7652 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007653{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007654 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007655 *exceptionObject = _PyUnicodeTranslateError_Create(
7656 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007657 }
7658 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007659 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
7660 goto onError;
7661 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
7662 goto onError;
7663 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
7664 goto onError;
7665 return;
7666 onError:
7667 Py_DECREF(*exceptionObject);
7668 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007669 }
7670}
7671
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007672/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007673static void
7674raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007675 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007676 Py_ssize_t startpos, Py_ssize_t endpos,
7677 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007678{
7679 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007680 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007681 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007682 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007683}
7684
7685/* error handling callback helper:
7686 build arguments, call the callback and check the arguments,
7687 put the result into newpos and return the replacement string, which
7688 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007689static PyObject *
7690unicode_translate_call_errorhandler(const char *errors,
7691 PyObject **errorHandler,
7692 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007693 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007694 Py_ssize_t startpos, Py_ssize_t endpos,
7695 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007696{
Benjamin Peterson142957c2008-07-04 19:55:29 +00007697 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007698
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007699 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007700 PyObject *restuple;
7701 PyObject *resunicode;
7702
7703 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007704 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007705 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007706 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007707 }
7708
7709 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007710 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007711 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007712 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007713
7714 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00007715 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007716 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007717 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007718 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00007719 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00007720 Py_DECREF(restuple);
7721 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007722 }
7723 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00007724 &resunicode, &i_newpos)) {
7725 Py_DECREF(restuple);
7726 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007727 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007728 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007729 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007730 else
7731 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007732 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007733 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7734 Py_DECREF(restuple);
7735 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00007736 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007737 Py_INCREF(resunicode);
7738 Py_DECREF(restuple);
7739 return resunicode;
7740}
7741
7742/* Lookup the character ch in the mapping and put the result in result,
7743 which must be decrefed by the caller.
7744 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007745static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007746charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007747{
Christian Heimes217cfd12007-12-02 14:31:20 +00007748 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007749 PyObject *x;
7750
7751 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007752 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007753 x = PyObject_GetItem(mapping, w);
7754 Py_DECREF(w);
7755 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007756 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7757 /* No mapping found means: use 1:1 mapping. */
7758 PyErr_Clear();
7759 *result = NULL;
7760 return 0;
7761 } else
7762 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007763 }
7764 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007765 *result = x;
7766 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007767 }
Christian Heimes217cfd12007-12-02 14:31:20 +00007768 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007769 long value = PyLong_AS_LONG(x);
7770 long max = PyUnicode_GetMax();
7771 if (value < 0 || value > max) {
7772 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00007773 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00007774 Py_DECREF(x);
7775 return -1;
7776 }
7777 *result = x;
7778 return 0;
7779 }
7780 else if (PyUnicode_Check(x)) {
7781 *result = x;
7782 return 0;
7783 }
7784 else {
7785 /* wrong return value */
7786 PyErr_SetString(PyExc_TypeError,
7787 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007788 Py_DECREF(x);
7789 return -1;
7790 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007791}
7792/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00007793 if not reallocate and adjust various state variables.
7794 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007795static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007796charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00007797 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007798{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007799 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00007800 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007801 /* exponentially overallocate to minimize reallocations */
7802 if (requiredsize < 2 * oldsize)
7803 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007804 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
7805 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007806 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007807 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007808 }
7809 return 0;
7810}
7811/* lookup the character, put the result in the output string and adjust
7812 various state variables. Return a new reference to the object that
7813 was put in the output buffer in *result, or Py_None, if the mapping was
7814 undefined (in which case no character was written).
7815 The called must decref result.
7816 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007817static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007818charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
7819 PyObject *mapping, Py_UCS4 **output,
7820 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007821 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007822{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007823 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
7824 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00007825 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007826 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007827 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007828 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007829 }
7830 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007831 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00007832 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007833 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007834 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007835 }
7836 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007837 Py_ssize_t repsize;
7838 if (PyUnicode_READY(*res) == -1)
7839 return -1;
7840 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00007841 if (repsize==1) {
7842 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007843 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00007844 }
7845 else if (repsize!=0) {
7846 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007847 Py_ssize_t requiredsize = *opos +
7848 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00007849 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007850 Py_ssize_t i;
7851 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007852 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007853 for(i = 0; i < repsize; i++)
7854 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00007855 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007856 }
7857 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007858 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007859 return 0;
7860}
7861
Alexander Belopolsky40018472011-02-26 01:02:56 +00007862PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007863_PyUnicode_TranslateCharmap(PyObject *input,
7864 PyObject *mapping,
7865 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007866{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007867 /* input object */
7868 char *idata;
7869 Py_ssize_t size, i;
7870 int kind;
7871 /* output buffer */
7872 Py_UCS4 *output = NULL;
7873 Py_ssize_t osize;
7874 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007875 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007876 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007877 char *reason = "character maps to <undefined>";
7878 PyObject *errorHandler = NULL;
7879 PyObject *exc = NULL;
7880 /* the following variable is used for caching string comparisons
7881 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7882 * 3=ignore, 4=xmlcharrefreplace */
7883 int known_errorHandler = -1;
7884
Guido van Rossumd57fd912000-03-10 22:53:23 +00007885 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007886 PyErr_BadArgument();
7887 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007888 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007889
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007890 if (PyUnicode_READY(input) == -1)
7891 return NULL;
7892 idata = (char*)PyUnicode_DATA(input);
7893 kind = PyUnicode_KIND(input);
7894 size = PyUnicode_GET_LENGTH(input);
7895 i = 0;
7896
7897 if (size == 0) {
7898 Py_INCREF(input);
7899 return input;
7900 }
7901
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007902 /* allocate enough for a simple 1:1 translation without
7903 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007904 osize = size;
7905 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
7906 opos = 0;
7907 if (output == NULL) {
7908 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00007909 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007910 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007911
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007912 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007913 /* try to encode it */
7914 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007915 if (charmaptranslate_output(input, i, mapping,
7916 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007917 Py_XDECREF(x);
7918 goto onError;
7919 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007920 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00007921 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007922 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00007923 else { /* untranslatable character */
7924 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
7925 Py_ssize_t repsize;
7926 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007927 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00007928 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007929 Py_ssize_t collstart = i;
7930 Py_ssize_t collend = i+1;
7931 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007932
Benjamin Peterson29060642009-01-31 22:14:21 +00007933 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007934 while (collend < size) {
7935 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007936 goto onError;
7937 Py_XDECREF(x);
7938 if (x!=Py_None)
7939 break;
7940 ++collend;
7941 }
7942 /* cache callback name lookup
7943 * (if not done yet, i.e. it's the first error) */
7944 if (known_errorHandler==-1) {
7945 if ((errors==NULL) || (!strcmp(errors, "strict")))
7946 known_errorHandler = 1;
7947 else if (!strcmp(errors, "replace"))
7948 known_errorHandler = 2;
7949 else if (!strcmp(errors, "ignore"))
7950 known_errorHandler = 3;
7951 else if (!strcmp(errors, "xmlcharrefreplace"))
7952 known_errorHandler = 4;
7953 else
7954 known_errorHandler = 0;
7955 }
7956 switch (known_errorHandler) {
7957 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007958 raise_translate_exception(&exc, input, collstart,
7959 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007960 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007961 case 2: /* replace */
7962 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007963 for (coll = collstart; coll<collend; coll++)
7964 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00007965 /* fall through */
7966 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007967 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007968 break;
7969 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007970 /* generate replacement (temporarily (mis)uses i) */
7971 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007972 char buffer[2+29+1+1];
7973 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007974 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
7975 if (charmaptranslate_makespace(&output, &osize,
7976 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00007977 goto onError;
7978 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007979 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00007980 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007981 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007982 break;
7983 default:
7984 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007985 reason, input, &exc,
7986 collstart, collend, &newpos);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02007987 if (repunicode == NULL || _PyUnicode_READY_REPLACE(&repunicode))
Benjamin Peterson29060642009-01-31 22:14:21 +00007988 goto onError;
7989 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007990 repsize = PyUnicode_GET_LENGTH(repunicode);
7991 if (charmaptranslate_makespace(&output, &osize,
7992 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007993 Py_DECREF(repunicode);
7994 goto onError;
7995 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007996 for (uni2 = 0; repsize-->0; ++uni2)
7997 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
7998 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00007999 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008000 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008001 }
8002 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008003 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8004 if (!res)
8005 goto onError;
8006 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008007 Py_XDECREF(exc);
8008 Py_XDECREF(errorHandler);
8009 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008010
Benjamin Peterson29060642009-01-31 22:14:21 +00008011 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008012 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008013 Py_XDECREF(exc);
8014 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008015 return NULL;
8016}
8017
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008018/* Deprecated. Use PyUnicode_Translate instead. */
8019PyObject *
8020PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8021 Py_ssize_t size,
8022 PyObject *mapping,
8023 const char *errors)
8024{
8025 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8026 if (!unicode)
8027 return NULL;
8028 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8029}
8030
Alexander Belopolsky40018472011-02-26 01:02:56 +00008031PyObject *
8032PyUnicode_Translate(PyObject *str,
8033 PyObject *mapping,
8034 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008035{
8036 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008037
Guido van Rossumd57fd912000-03-10 22:53:23 +00008038 str = PyUnicode_FromObject(str);
8039 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008040 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008041 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008042 Py_DECREF(str);
8043 return result;
Tim Petersced69f82003-09-16 20:30:58 +00008044
Benjamin Peterson29060642009-01-31 22:14:21 +00008045 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008046 Py_XDECREF(str);
8047 return NULL;
8048}
Tim Petersced69f82003-09-16 20:30:58 +00008049
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008050static Py_UCS4
8051fix_decimal_and_space_to_ascii(PyUnicodeObject *self)
8052{
8053 /* No need to call PyUnicode_READY(self) because this function is only
8054 called as a callback from fixup() which does it already. */
8055 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8056 const int kind = PyUnicode_KIND(self);
8057 void *data = PyUnicode_DATA(self);
8058 Py_UCS4 maxchar = 0, ch, fixed;
8059 Py_ssize_t i;
8060
8061 for (i = 0; i < len; ++i) {
8062 ch = PyUnicode_READ(kind, data, i);
8063 fixed = 0;
8064 if (ch > 127) {
8065 if (Py_UNICODE_ISSPACE(ch))
8066 fixed = ' ';
8067 else {
8068 const int decimal = Py_UNICODE_TODECIMAL(ch);
8069 if (decimal >= 0)
8070 fixed = '0' + decimal;
8071 }
8072 if (fixed != 0) {
8073 if (fixed > maxchar)
8074 maxchar = fixed;
8075 PyUnicode_WRITE(kind, data, i, fixed);
8076 }
8077 else if (ch > maxchar)
8078 maxchar = ch;
8079 }
8080 else if (ch > maxchar)
8081 maxchar = ch;
8082 }
8083
8084 return maxchar;
8085}
8086
8087PyObject *
8088_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8089{
8090 if (!PyUnicode_Check(unicode)) {
8091 PyErr_BadInternalCall();
8092 return NULL;
8093 }
8094 if (PyUnicode_READY(unicode) == -1)
8095 return NULL;
8096 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8097 /* If the string is already ASCII, just return the same string */
8098 Py_INCREF(unicode);
8099 return unicode;
8100 }
8101 return fixup((PyUnicodeObject *)unicode, fix_decimal_and_space_to_ascii);
8102}
8103
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008104PyObject *
8105PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8106 Py_ssize_t length)
8107{
8108 PyObject *result;
8109 Py_UNICODE *p; /* write pointer into result */
8110 Py_ssize_t i;
8111 /* Copy to a new string */
8112 result = (PyObject *)_PyUnicode_New(length);
8113 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
8114 if (result == NULL)
8115 return result;
8116 p = PyUnicode_AS_UNICODE(result);
8117 /* Iterate over code points */
8118 for (i = 0; i < length; i++) {
8119 Py_UNICODE ch =s[i];
8120 if (ch > 127) {
8121 int decimal = Py_UNICODE_TODECIMAL(ch);
8122 if (decimal >= 0)
8123 p[i] = '0' + decimal;
8124 }
8125 }
Victor Stinner17efeed2011-10-04 20:05:46 +02008126#ifndef DONT_MAKE_RESULT_READY
8127 if (_PyUnicode_READY_REPLACE(&result)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008128 Py_DECREF(result);
8129 return NULL;
8130 }
Victor Stinner17efeed2011-10-04 20:05:46 +02008131#endif
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008132 return result;
8133}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008134/* --- Decimal Encoder ---------------------------------------------------- */
8135
Alexander Belopolsky40018472011-02-26 01:02:56 +00008136int
8137PyUnicode_EncodeDecimal(Py_UNICODE *s,
8138 Py_ssize_t length,
8139 char *output,
8140 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008141{
8142 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008143 PyObject *errorHandler = NULL;
8144 PyObject *exc = NULL;
8145 const char *encoding = "decimal";
8146 const char *reason = "invalid decimal Unicode string";
8147 /* the following variable is used for caching string comparisons
8148 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
8149 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008150
8151 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008152 PyErr_BadArgument();
8153 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008154 }
8155
8156 p = s;
8157 end = s + length;
8158 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008159 register Py_UNICODE ch = *p;
8160 int decimal;
8161 PyObject *repunicode;
8162 Py_ssize_t repsize;
8163 Py_ssize_t newpos;
8164 Py_UNICODE *uni2;
8165 Py_UNICODE *collstart;
8166 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00008167
Benjamin Peterson29060642009-01-31 22:14:21 +00008168 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008169 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00008170 ++p;
8171 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008172 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008173 decimal = Py_UNICODE_TODECIMAL(ch);
8174 if (decimal >= 0) {
8175 *output++ = '0' + decimal;
8176 ++p;
8177 continue;
8178 }
8179 if (0 < ch && ch < 256) {
8180 *output++ = (char)ch;
8181 ++p;
8182 continue;
8183 }
8184 /* All other characters are considered unencodable */
8185 collstart = p;
8186 collend = p+1;
8187 while (collend < end) {
8188 if ((0 < *collend && *collend < 256) ||
8189 !Py_UNICODE_ISSPACE(*collend) ||
8190 Py_UNICODE_TODECIMAL(*collend))
8191 break;
8192 }
8193 /* cache callback name lookup
8194 * (if not done yet, i.e. it's the first error) */
8195 if (known_errorHandler==-1) {
8196 if ((errors==NULL) || (!strcmp(errors, "strict")))
8197 known_errorHandler = 1;
8198 else if (!strcmp(errors, "replace"))
8199 known_errorHandler = 2;
8200 else if (!strcmp(errors, "ignore"))
8201 known_errorHandler = 3;
8202 else if (!strcmp(errors, "xmlcharrefreplace"))
8203 known_errorHandler = 4;
8204 else
8205 known_errorHandler = 0;
8206 }
8207 switch (known_errorHandler) {
8208 case 1: /* strict */
8209 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
8210 goto onError;
8211 case 2: /* replace */
8212 for (p = collstart; p < collend; ++p)
8213 *output++ = '?';
8214 /* fall through */
8215 case 3: /* ignore */
8216 p = collend;
8217 break;
8218 case 4: /* xmlcharrefreplace */
8219 /* generate replacement (temporarily (mis)uses p) */
8220 for (p = collstart; p < collend; ++p)
8221 output += sprintf(output, "&#%d;", (int)*p);
8222 p = collend;
8223 break;
8224 default:
8225 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
8226 encoding, reason, s, length, &exc,
8227 collstart-s, collend-s, &newpos);
8228 if (repunicode == NULL)
8229 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008230 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00008231 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008232 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
8233 Py_DECREF(repunicode);
8234 goto onError;
8235 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008236 /* generate replacement */
8237 repsize = PyUnicode_GET_SIZE(repunicode);
8238 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
8239 Py_UNICODE ch = *uni2;
8240 if (Py_UNICODE_ISSPACE(ch))
8241 *output++ = ' ';
8242 else {
8243 decimal = Py_UNICODE_TODECIMAL(ch);
8244 if (decimal >= 0)
8245 *output++ = '0' + decimal;
8246 else if (0 < ch && ch < 256)
8247 *output++ = (char)ch;
8248 else {
8249 Py_DECREF(repunicode);
8250 raise_encode_exception(&exc, encoding,
8251 s, length, collstart-s, collend-s, reason);
8252 goto onError;
8253 }
8254 }
8255 }
8256 p = s + newpos;
8257 Py_DECREF(repunicode);
8258 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00008259 }
8260 /* 0-terminate the output string */
8261 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008262 Py_XDECREF(exc);
8263 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008264 return 0;
8265
Benjamin Peterson29060642009-01-31 22:14:21 +00008266 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008267 Py_XDECREF(exc);
8268 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008269 return -1;
8270}
8271
Guido van Rossumd57fd912000-03-10 22:53:23 +00008272/* --- Helpers ------------------------------------------------------------ */
8273
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008274#include "stringlib/ucs1lib.h"
8275#include "stringlib/fastsearch.h"
8276#include "stringlib/partition.h"
8277#include "stringlib/split.h"
8278#include "stringlib/count.h"
8279#include "stringlib/find.h"
8280#include "stringlib/localeutil.h"
8281#include "stringlib/undef.h"
8282
8283#include "stringlib/ucs2lib.h"
8284#include "stringlib/fastsearch.h"
8285#include "stringlib/partition.h"
8286#include "stringlib/split.h"
8287#include "stringlib/count.h"
8288#include "stringlib/find.h"
8289#include "stringlib/localeutil.h"
8290#include "stringlib/undef.h"
8291
8292#include "stringlib/ucs4lib.h"
8293#include "stringlib/fastsearch.h"
8294#include "stringlib/partition.h"
8295#include "stringlib/split.h"
8296#include "stringlib/count.h"
8297#include "stringlib/find.h"
8298#include "stringlib/localeutil.h"
8299#include "stringlib/undef.h"
8300
8301static Py_ssize_t
8302any_find_slice(Py_ssize_t Py_LOCAL_CALLBACK(ucs1)(const Py_UCS1*, Py_ssize_t,
8303 const Py_UCS1*, Py_ssize_t,
8304 Py_ssize_t, Py_ssize_t),
8305 Py_ssize_t Py_LOCAL_CALLBACK(ucs2)(const Py_UCS2*, Py_ssize_t,
8306 const Py_UCS2*, Py_ssize_t,
8307 Py_ssize_t, Py_ssize_t),
8308 Py_ssize_t Py_LOCAL_CALLBACK(ucs4)(const Py_UCS4*, Py_ssize_t,
8309 const Py_UCS4*, Py_ssize_t,
8310 Py_ssize_t, Py_ssize_t),
8311 PyObject* s1, PyObject* s2,
8312 Py_ssize_t start,
8313 Py_ssize_t end)
8314{
8315 int kind1, kind2, kind;
8316 void *buf1, *buf2;
8317 Py_ssize_t len1, len2, result;
8318
8319 kind1 = PyUnicode_KIND(s1);
8320 kind2 = PyUnicode_KIND(s2);
8321 kind = kind1 > kind2 ? kind1 : kind2;
8322 buf1 = PyUnicode_DATA(s1);
8323 buf2 = PyUnicode_DATA(s2);
8324 if (kind1 != kind)
8325 buf1 = _PyUnicode_AsKind(s1, kind);
8326 if (!buf1)
8327 return -2;
8328 if (kind2 != kind)
8329 buf2 = _PyUnicode_AsKind(s2, kind);
8330 if (!buf2) {
8331 if (kind1 != kind) PyMem_Free(buf1);
8332 return -2;
8333 }
8334 len1 = PyUnicode_GET_LENGTH(s1);
8335 len2 = PyUnicode_GET_LENGTH(s2);
8336
8337 switch(kind) {
8338 case PyUnicode_1BYTE_KIND:
8339 result = ucs1(buf1, len1, buf2, len2, start, end);
8340 break;
8341 case PyUnicode_2BYTE_KIND:
8342 result = ucs2(buf1, len1, buf2, len2, start, end);
8343 break;
8344 case PyUnicode_4BYTE_KIND:
8345 result = ucs4(buf1, len1, buf2, len2, start, end);
8346 break;
8347 default:
8348 assert(0); result = -2;
8349 }
8350
8351 if (kind1 != kind)
8352 PyMem_Free(buf1);
8353 if (kind2 != kind)
8354 PyMem_Free(buf2);
8355
8356 return result;
8357}
8358
8359Py_ssize_t
8360_PyUnicode_InsertThousandsGrouping(int kind, void *data,
8361 Py_ssize_t n_buffer,
8362 void *digits, Py_ssize_t n_digits,
8363 Py_ssize_t min_width,
8364 const char *grouping,
8365 const char *thousands_sep)
8366{
8367 switch(kind) {
8368 case PyUnicode_1BYTE_KIND:
8369 return _PyUnicode_ucs1_InsertThousandsGrouping(
8370 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8371 min_width, grouping, thousands_sep);
8372 case PyUnicode_2BYTE_KIND:
8373 return _PyUnicode_ucs2_InsertThousandsGrouping(
8374 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
8375 min_width, grouping, thousands_sep);
8376 case PyUnicode_4BYTE_KIND:
8377 return _PyUnicode_ucs4_InsertThousandsGrouping(
8378 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
8379 min_width, grouping, thousands_sep);
8380 }
8381 assert(0);
8382 return -1;
8383}
8384
8385
Eric Smith8c663262007-08-25 02:26:07 +00008386#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00008387#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008388
Thomas Wouters477c8d52006-05-27 19:21:47 +00008389#include "stringlib/count.h"
8390#include "stringlib/find.h"
Eric Smith5807c412008-05-11 21:00:57 +00008391
Thomas Wouters477c8d52006-05-27 19:21:47 +00008392/* Helper macro to fix up start/end slice values in place.
   An `end` greater than `len` is set to `len`; a negative `end` or `start`
   has `len` added to it and is then set to 0 if it is still negative.
   `start` is not compared against `len` or `end`.
   `start` and `end` must be modifiable lvalues, and each argument is
   expanded more than once, so pass expressions without side effects.
   NOTE: the expansion is a sequence of bare `if` statements (there is no
   do/while wrapper), so use it only as a complete statement of its own,
   never as the unbraced body of an if/else. */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008393#define ADJUST_INDICES(start, end, len) \
8394 if (end > len) \
8395 end = len; \
8396 else if (end < 0) { \
8397 end += len; \
8398 if (end < 0) \
8399 end = 0; \
8400 } \
8401 if (start < 0) { \
8402 start += len; \
8403 if (start < 0) \
8404 start = 0; \
8405 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00008406
Alexander Belopolsky40018472011-02-26 01:02:56 +00008407Py_ssize_t
8408PyUnicode_Count(PyObject *str,
8409 PyObject *substr,
8410 Py_ssize_t start,
8411 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008412{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008413 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008414 PyUnicodeObject* str_obj;
8415 PyUnicodeObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008416 int kind1, kind2, kind;
8417 void *buf1 = NULL, *buf2 = NULL;
8418 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00008419
Thomas Wouters477c8d52006-05-27 19:21:47 +00008420 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008421 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008422 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008423 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
Victor Stinnere9a29352011-10-01 02:14:59 +02008424 if (!sub_obj || PyUnicode_READY(sub_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008425 Py_DECREF(str_obj);
8426 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008427 }
Tim Petersced69f82003-09-16 20:30:58 +00008428
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008429 kind1 = PyUnicode_KIND(str_obj);
8430 kind2 = PyUnicode_KIND(sub_obj);
8431 kind = kind1 > kind2 ? kind1 : kind2;
8432 buf1 = PyUnicode_DATA(str_obj);
8433 if (kind1 != kind)
8434 buf1 = _PyUnicode_AsKind((PyObject*)str_obj, kind);
8435 if (!buf1)
8436 goto onError;
8437 buf2 = PyUnicode_DATA(sub_obj);
8438 if (kind2 != kind)
8439 buf2 = _PyUnicode_AsKind((PyObject*)sub_obj, kind);
8440 if (!buf2)
8441 goto onError;
8442 len1 = PyUnicode_GET_LENGTH(str_obj);
8443 len2 = PyUnicode_GET_LENGTH(sub_obj);
8444
8445 ADJUST_INDICES(start, end, len1);
8446 switch(kind) {
8447 case PyUnicode_1BYTE_KIND:
8448 result = ucs1lib_count(
8449 ((Py_UCS1*)buf1) + start, end - start,
8450 buf2, len2, PY_SSIZE_T_MAX
8451 );
8452 break;
8453 case PyUnicode_2BYTE_KIND:
8454 result = ucs2lib_count(
8455 ((Py_UCS2*)buf1) + start, end - start,
8456 buf2, len2, PY_SSIZE_T_MAX
8457 );
8458 break;
8459 case PyUnicode_4BYTE_KIND:
8460 result = ucs4lib_count(
8461 ((Py_UCS4*)buf1) + start, end - start,
8462 buf2, len2, PY_SSIZE_T_MAX
8463 );
8464 break;
8465 default:
8466 assert(0); result = 0;
8467 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00008468
8469 Py_DECREF(sub_obj);
8470 Py_DECREF(str_obj);
8471
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008472 if (kind1 != kind)
8473 PyMem_Free(buf1);
8474 if (kind2 != kind)
8475 PyMem_Free(buf2);
8476
Guido van Rossumd57fd912000-03-10 22:53:23 +00008477 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008478 onError:
8479 Py_DECREF(sub_obj);
8480 Py_DECREF(str_obj);
8481 if (kind1 != kind && buf1)
8482 PyMem_Free(buf1);
8483 if (kind2 != kind && buf2)
8484 PyMem_Free(buf2);
8485 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008486}
8487
Alexander Belopolsky40018472011-02-26 01:02:56 +00008488Py_ssize_t
8489PyUnicode_Find(PyObject *str,
8490 PyObject *sub,
8491 Py_ssize_t start,
8492 Py_ssize_t end,
8493 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008494{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008495 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008496
Guido van Rossumd57fd912000-03-10 22:53:23 +00008497 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008498 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008499 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008500 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008501 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008502 Py_DECREF(str);
8503 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008504 }
Tim Petersced69f82003-09-16 20:30:58 +00008505
Thomas Wouters477c8d52006-05-27 19:21:47 +00008506 if (direction > 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008507 result = any_find_slice(
8508 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
8509 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008510 );
8511 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008512 result = any_find_slice(
8513 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
8514 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008515 );
8516
Guido van Rossumd57fd912000-03-10 22:53:23 +00008517 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008518 Py_DECREF(sub);
8519
Guido van Rossumd57fd912000-03-10 22:53:23 +00008520 return result;
8521}
8522
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008523Py_ssize_t
8524PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
8525 Py_ssize_t start, Py_ssize_t end,
8526 int direction)
8527{
8528 char *result;
8529 int kind;
8530 if (PyUnicode_READY(str) == -1)
8531 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02008532 if (start < 0 || end < 0) {
8533 PyErr_SetString(PyExc_IndexError, "string index out of range");
8534 return -2;
8535 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008536 if (end > PyUnicode_GET_LENGTH(str))
8537 end = PyUnicode_GET_LENGTH(str);
8538 kind = PyUnicode_KIND(str);
8539 result = findchar(PyUnicode_1BYTE_DATA(str)
8540 + PyUnicode_KIND_SIZE(kind, start),
8541 kind,
8542 end-start, ch, direction);
8543 if (!result)
8544 return -1;
8545 return (result-(char*)PyUnicode_DATA(str)) >> (kind-1);
8546}
8547
Alexander Belopolsky40018472011-02-26 01:02:56 +00008548static int
8549tailmatch(PyUnicodeObject *self,
8550 PyUnicodeObject *substring,
8551 Py_ssize_t start,
8552 Py_ssize_t end,
8553 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008554{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008555 int kind_self;
8556 int kind_sub;
8557 void *data_self;
8558 void *data_sub;
8559 Py_ssize_t offset;
8560 Py_ssize_t i;
8561 Py_ssize_t end_sub;
8562
8563 if (PyUnicode_READY(self) == -1 ||
8564 PyUnicode_READY(substring) == -1)
8565 return 0;
8566
8567 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008568 return 1;
8569
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008570 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
8571 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008572 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00008573 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008574
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008575 kind_self = PyUnicode_KIND(self);
8576 data_self = PyUnicode_DATA(self);
8577 kind_sub = PyUnicode_KIND(substring);
8578 data_sub = PyUnicode_DATA(substring);
8579 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
8580
8581 if (direction > 0)
8582 offset = end;
8583 else
8584 offset = start;
8585
8586 if (PyUnicode_READ(kind_self, data_self, offset) ==
8587 PyUnicode_READ(kind_sub, data_sub, 0) &&
8588 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
8589 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
8590 /* If both are of the same kind, memcmp is sufficient */
8591 if (kind_self == kind_sub) {
8592 return ! memcmp((char *)data_self +
8593 (offset * PyUnicode_CHARACTER_SIZE(substring)),
8594 data_sub,
8595 PyUnicode_GET_LENGTH(substring) *
8596 PyUnicode_CHARACTER_SIZE(substring));
8597 }
8598 /* otherwise we have to compare each character by first accesing it */
8599 else {
8600 /* We do not need to compare 0 and len(substring)-1 because
8601 the if statement above ensured already that they are equal
8602 when we end up here. */
8603 // TODO: honor direction and do a forward or backwards search
8604 for (i = 1; i < end_sub; ++i) {
8605 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
8606 PyUnicode_READ(kind_sub, data_sub, i))
8607 return 0;
8608 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008609 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008610 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008611 }
8612
8613 return 0;
8614}
8615
Alexander Belopolsky40018472011-02-26 01:02:56 +00008616Py_ssize_t
8617PyUnicode_Tailmatch(PyObject *str,
8618 PyObject *substr,
8619 Py_ssize_t start,
8620 Py_ssize_t end,
8621 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008622{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008623 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008624
Guido van Rossumd57fd912000-03-10 22:53:23 +00008625 str = PyUnicode_FromObject(str);
8626 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008627 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008628 substr = PyUnicode_FromObject(substr);
8629 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008630 Py_DECREF(str);
8631 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008632 }
Tim Petersced69f82003-09-16 20:30:58 +00008633
Guido van Rossumd57fd912000-03-10 22:53:23 +00008634 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00008635 (PyUnicodeObject *)substr,
8636 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008637 Py_DECREF(str);
8638 Py_DECREF(substr);
8639 return result;
8640}
8641
Guido van Rossumd57fd912000-03-10 22:53:23 +00008642/* Apply fixfct filter to the Unicode object self and return a
8643 reference to the modified object */
8644
Alexander Belopolsky40018472011-02-26 01:02:56 +00008645static PyObject *
8646fixup(PyUnicodeObject *self,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008647 Py_UCS4 (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008648{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008649 PyObject *u;
8650 Py_UCS4 maxchar_old, maxchar_new = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008651
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008652 if (PyUnicode_READY(self) == -1)
8653 return NULL;
8654 maxchar_old = PyUnicode_MAX_CHAR_VALUE(self);
8655 u = PyUnicode_New(PyUnicode_GET_LENGTH(self),
8656 maxchar_old);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008657 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008658 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008659
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008660 Py_MEMCPY(PyUnicode_1BYTE_DATA(u), PyUnicode_1BYTE_DATA(self),
8661 PyUnicode_GET_LENGTH(u) * PyUnicode_CHARACTER_SIZE(u));
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008662
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008663 /* fix functions return the new maximum character in a string,
8664 if the kind of the resulting unicode object does not change,
8665 everything is fine. Otherwise we need to change the string kind
8666 and re-run the fix function. */
8667 maxchar_new = fixfct((PyUnicodeObject*)u);
8668 if (maxchar_new == 0)
8669 /* do nothing, keep maxchar_new at 0 which means no changes. */;
8670 else if (maxchar_new <= 127)
8671 maxchar_new = 127;
8672 else if (maxchar_new <= 255)
8673 maxchar_new = 255;
8674 else if (maxchar_new <= 65535)
8675 maxchar_new = 65535;
8676 else
8677 maxchar_new = 1114111; /* 0x10ffff */
8678
8679 if (!maxchar_new && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008680 /* fixfct should return TRUE if it modified the buffer. If
8681 FALSE, return a reference to the original buffer instead
8682 (to save space, not time) */
8683 Py_INCREF(self);
8684 Py_DECREF(u);
8685 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008686 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008687 else if (maxchar_new == maxchar_old) {
8688 return u;
8689 }
8690 else {
8691 /* In case the maximum character changed, we need to
8692 convert the string to the new category. */
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008693 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008694 if (v == NULL) {
8695 Py_DECREF(u);
8696 return NULL;
8697 }
8698 if (maxchar_new > maxchar_old) {
8699 /* If the maxchar increased so that the kind changed, not all
8700 characters are representable anymore and we need to fix the
8701 string again. This only happens in very few cases. */
Victor Stinner157f83f2011-09-28 21:41:31 +02008702 if (PyUnicode_CopyCharacters(v, 0,
8703 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008704 PyUnicode_GET_LENGTH(self)) < 0)
8705 {
8706 Py_DECREF(u);
8707 return NULL;
8708 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008709 maxchar_old = fixfct((PyUnicodeObject*)v);
8710 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
8711 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008712 else {
Victor Stinner157f83f2011-09-28 21:41:31 +02008713 if (PyUnicode_CopyCharacters(v, 0,
8714 u, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008715 PyUnicode_GET_LENGTH(self)) < 0)
8716 {
8717 Py_DECREF(u);
8718 return NULL;
8719 }
8720 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008721
8722 Py_DECREF(u);
8723 return v;
8724 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008725}
8726
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008727static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008728fixupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008729{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008730 /* No need to call PyUnicode_READY(self) because this function is only
8731 called as a callback from fixup() which does it already. */
8732 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8733 const int kind = PyUnicode_KIND(self);
8734 void *data = PyUnicode_DATA(self);
8735 int touched = 0;
8736 Py_UCS4 maxchar = 0;
8737 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008738
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008739 for (i = 0; i < len; ++i) {
8740 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8741 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
8742 if (up != ch) {
8743 if (up > maxchar)
8744 maxchar = up;
8745 PyUnicode_WRITE(kind, data, i, up);
8746 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008747 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008748 else if (ch > maxchar)
8749 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008750 }
8751
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008752 if (touched)
8753 return maxchar;
8754 else
8755 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008756}
8757
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008758static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008759fixlower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008760{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008761 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8762 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8763 const int kind = PyUnicode_KIND(self);
8764 void *data = PyUnicode_DATA(self);
8765 int touched = 0;
8766 Py_UCS4 maxchar = 0;
8767 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008768
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008769 for(i = 0; i < len; ++i) {
8770 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8771 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8772 if (lo != ch) {
8773 if (lo > maxchar)
8774 maxchar = lo;
8775 PyUnicode_WRITE(kind, data, i, lo);
8776 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008777 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008778 else if (ch > maxchar)
8779 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008780 }
8781
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008782 if (touched)
8783 return maxchar;
8784 else
8785 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008786}
8787
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008788static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008789fixswapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008790{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008791 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8792 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8793 const int kind = PyUnicode_KIND(self);
8794 void *data = PyUnicode_DATA(self);
8795 int touched = 0;
8796 Py_UCS4 maxchar = 0;
8797 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008798
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008799 for(i = 0; i < len; ++i) {
8800 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8801 Py_UCS4 nu = 0;
8802
8803 if (Py_UNICODE_ISUPPER(ch))
8804 nu = Py_UNICODE_TOLOWER(ch);
8805 else if (Py_UNICODE_ISLOWER(ch))
8806 nu = Py_UNICODE_TOUPPER(ch);
8807
8808 if (nu != 0) {
8809 if (nu > maxchar)
8810 maxchar = nu;
8811 PyUnicode_WRITE(kind, data, i, nu);
8812 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008813 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008814 else if (ch > maxchar)
8815 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008816 }
8817
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008818 if (touched)
8819 return maxchar;
8820 else
8821 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008822}
8823
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008824static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008825fixcapitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008826{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008827 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8828 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8829 const int kind = PyUnicode_KIND(self);
8830 void *data = PyUnicode_DATA(self);
8831 int touched = 0;
8832 Py_UCS4 maxchar = 0;
8833 Py_ssize_t i = 0;
8834 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00008835
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008836 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008837 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008838
8839 ch = PyUnicode_READ(kind, data, i);
8840 if (!Py_UNICODE_ISUPPER(ch)) {
8841 maxchar = Py_UNICODE_TOUPPER(ch);
8842 PyUnicode_WRITE(kind, data, i, maxchar);
8843 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008844 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008845 ++i;
8846 for(; i < len; ++i) {
8847 ch = PyUnicode_READ(kind, data, i);
8848 if (!Py_UNICODE_ISLOWER(ch)) {
8849 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8850 if (lo > maxchar)
8851 maxchar = lo;
8852 PyUnicode_WRITE(kind, data, i, lo);
8853 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008854 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008855 else if (ch > maxchar)
8856 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008857 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008858
8859 if (touched)
8860 return maxchar;
8861 else
8862 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008863}
8864
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008865static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008866fixtitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008867{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008868 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8869 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8870 const int kind = PyUnicode_KIND(self);
8871 void *data = PyUnicode_DATA(self);
8872 Py_UCS4 maxchar = 0;
8873 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008874 int previous_is_cased;
8875
8876 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008877 if (len == 1) {
8878 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8879 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
8880 if (ti != ch) {
8881 PyUnicode_WRITE(kind, data, i, ti);
8882 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00008883 }
8884 else
8885 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008886 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008887 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008888 for(; i < len; ++i) {
8889 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8890 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00008891
Benjamin Peterson29060642009-01-31 22:14:21 +00008892 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008893 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00008894 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008895 nu = Py_UNICODE_TOTITLE(ch);
8896
8897 if (nu > maxchar)
8898 maxchar = nu;
8899 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00008900
Benjamin Peterson29060642009-01-31 22:14:21 +00008901 if (Py_UNICODE_ISLOWER(ch) ||
8902 Py_UNICODE_ISUPPER(ch) ||
8903 Py_UNICODE_ISTITLE(ch))
8904 previous_is_cased = 1;
8905 else
8906 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008907 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008908 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008909}
8910
Tim Peters8ce9f162004-08-27 01:49:32 +00008911PyObject *
8912PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008913{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008914 PyObject *sep = NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008915 Py_ssize_t seplen = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008916 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00008917 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008918 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
8919 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00008920 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008921 Py_ssize_t sz, i, res_offset;
8922 Py_UCS4 maxchar = 0;
8923 Py_UCS4 item_maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008924
Tim Peters05eba1f2004-08-27 21:32:02 +00008925 fseq = PySequence_Fast(seq, "");
8926 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008927 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00008928 }
8929
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008930 /* NOTE: the following code can't call back into Python code,
8931 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00008932 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008933
Tim Peters05eba1f2004-08-27 21:32:02 +00008934 seqlen = PySequence_Fast_GET_SIZE(fseq);
8935 /* If empty sequence, return u"". */
8936 if (seqlen == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008937 res = PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008938 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00008939 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008940 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00008941 /* If singleton sequence with an exact Unicode, return that. */
8942 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008943 item = items[0];
8944 if (PyUnicode_CheckExact(item)) {
8945 Py_INCREF(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008946 res = item;
Benjamin Peterson29060642009-01-31 22:14:21 +00008947 goto Done;
8948 }
Tim Peters8ce9f162004-08-27 01:49:32 +00008949 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008950 else {
8951 /* Set up sep and seplen */
8952 if (separator == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008953 /* fall back to a blank space separator */
8954 sep = PyUnicode_FromOrdinal(' ');
Victor Stinnere9a29352011-10-01 02:14:59 +02008955 if (!sep)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008956 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00008957 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008958 else {
8959 if (!PyUnicode_Check(separator)) {
8960 PyErr_Format(PyExc_TypeError,
8961 "separator: expected str instance,"
8962 " %.80s found",
8963 Py_TYPE(separator)->tp_name);
8964 goto onError;
8965 }
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02008966 if (PyUnicode_READY(separator))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008967 goto onError;
8968 sep = separator;
8969 seplen = PyUnicode_GET_LENGTH(separator);
8970 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
8971 /* inc refcount to keep this code path symetric with the
8972 above case of a blank separator */
8973 Py_INCREF(sep);
Tim Peters05eba1f2004-08-27 21:32:02 +00008974 }
8975 }
8976
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008977 /* There are at least two things to join, or else we have a subclass
8978 * of str in the sequence.
8979 * Do a pre-pass to figure out the total amount of space we'll
8980 * need (sz), and see whether all argument are strings.
8981 */
8982 sz = 0;
8983 for (i = 0; i < seqlen; i++) {
8984 const Py_ssize_t old_sz = sz;
8985 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00008986 if (!PyUnicode_Check(item)) {
8987 PyErr_Format(PyExc_TypeError,
8988 "sequence item %zd: expected str instance,"
8989 " %.80s found",
8990 i, Py_TYPE(item)->tp_name);
8991 goto onError;
8992 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008993 if (PyUnicode_READY(item) == -1)
8994 goto onError;
8995 sz += PyUnicode_GET_LENGTH(item);
8996 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
8997 if (item_maxchar > maxchar)
8998 maxchar = item_maxchar;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008999 if (i != 0)
9000 sz += seplen;
9001 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9002 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009003 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009004 goto onError;
9005 }
9006 }
Tim Petersced69f82003-09-16 20:30:58 +00009007
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009008 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009009 if (res == NULL)
9010 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009011
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009012 /* Catenate everything. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009013 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinner9ce5a832011-10-03 23:36:02 +02009014 Py_ssize_t itemlen, copied;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009015 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009016 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02009017 if (i && seplen != 0) {
9018 copied = PyUnicode_CopyCharacters(res, res_offset,
9019 sep, 0, seplen);
9020 if (copied < 0)
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009021 goto onError;
Victor Stinner9ce5a832011-10-03 23:36:02 +02009022#ifdef Py_DEBUG
9023 res_offset += copied;
9024#else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009025 res_offset += seplen;
Victor Stinner9ce5a832011-10-03 23:36:02 +02009026#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00009027 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009028 itemlen = PyUnicode_GET_LENGTH(item);
9029 if (itemlen != 0) {
9030 copied = PyUnicode_CopyCharacters(res, res_offset,
9031 item, 0, itemlen);
9032 if (copied < 0)
9033 goto onError;
9034#ifdef Py_DEBUG
9035 res_offset += copied;
9036#else
9037 res_offset += itemlen;
9038#endif
9039 }
Tim Peters05eba1f2004-08-27 21:32:02 +00009040 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009041 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00009042
Benjamin Peterson29060642009-01-31 22:14:21 +00009043 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00009044 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009045 Py_XDECREF(sep);
9046 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009047
Benjamin Peterson29060642009-01-31 22:14:21 +00009048 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009049 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009050 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009051 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009052 return NULL;
9053}
9054
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009055#define FILL(kind, data, value, start, length) \
9056 do { \
9057 Py_ssize_t i_ = 0; \
9058 assert(kind != PyUnicode_WCHAR_KIND); \
9059 switch ((kind)) { \
9060 case PyUnicode_1BYTE_KIND: { \
9061 unsigned char * to_ = (unsigned char *)((data)) + (start); \
9062 memset(to_, (unsigned char)value, length); \
9063 break; \
9064 } \
9065 case PyUnicode_2BYTE_KIND: { \
9066 Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
9067 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
9068 break; \
9069 } \
9070 default: { \
9071 Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
9072 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
9073 break; \
9074 } \
9075 } \
9076 } while (0)
9077
Alexander Belopolsky40018472011-02-26 01:02:56 +00009078static PyUnicodeObject *
9079pad(PyUnicodeObject *self,
9080 Py_ssize_t left,
9081 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009082 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009083{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009084 PyObject *u;
9085 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009086 int kind;
9087 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009088
9089 if (left < 0)
9090 left = 0;
9091 if (right < 0)
9092 right = 0;
9093
Tim Peters7a29bd52001-09-12 03:03:31 +00009094 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009095 Py_INCREF(self);
9096 return self;
9097 }
9098
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009099 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9100 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009101 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9102 return NULL;
9103 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009104 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9105 if (fill > maxchar)
9106 maxchar = fill;
9107 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009108 if (!u)
9109 return NULL;
9110
9111 kind = PyUnicode_KIND(u);
9112 data = PyUnicode_DATA(u);
9113 if (left)
9114 FILL(kind, data, fill, 0, left);
9115 if (right)
9116 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinner157f83f2011-09-28 21:41:31 +02009117 if (PyUnicode_CopyCharacters(u, left,
9118 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009119 _PyUnicode_LENGTH(self)) < 0)
9120 {
9121 Py_DECREF(u);
9122 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009123 }
9124
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009125 return (PyUnicodeObject*)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009126}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009127#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00009128
Alexander Belopolsky40018472011-02-26 01:02:56 +00009129PyObject *
9130PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009131{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009132 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009133
9134 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009135 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009136 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009137
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009138 switch(PyUnicode_KIND(string)) {
9139 case PyUnicode_1BYTE_KIND:
9140 list = ucs1lib_splitlines(
9141 (PyObject*) string, PyUnicode_1BYTE_DATA(string),
9142 PyUnicode_GET_LENGTH(string), keepends);
9143 break;
9144 case PyUnicode_2BYTE_KIND:
9145 list = ucs2lib_splitlines(
9146 (PyObject*) string, PyUnicode_2BYTE_DATA(string),
9147 PyUnicode_GET_LENGTH(string), keepends);
9148 break;
9149 case PyUnicode_4BYTE_KIND:
9150 list = ucs4lib_splitlines(
9151 (PyObject*) string, PyUnicode_4BYTE_DATA(string),
9152 PyUnicode_GET_LENGTH(string), keepends);
9153 break;
9154 default:
9155 assert(0);
9156 list = 0;
9157 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009158 Py_DECREF(string);
9159 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009160}
9161
Alexander Belopolsky40018472011-02-26 01:02:56 +00009162static PyObject *
9163split(PyUnicodeObject *self,
9164 PyUnicodeObject *substring,
9165 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009166{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009167 int kind1, kind2, kind;
9168 void *buf1, *buf2;
9169 Py_ssize_t len1, len2;
9170 PyObject* out;
9171
Guido van Rossumd57fd912000-03-10 22:53:23 +00009172 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009173 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009174
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009175 if (PyUnicode_READY(self) == -1)
9176 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009177
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009178 if (substring == NULL)
9179 switch(PyUnicode_KIND(self)) {
9180 case PyUnicode_1BYTE_KIND:
9181 return ucs1lib_split_whitespace(
9182 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9183 PyUnicode_GET_LENGTH(self), maxcount
9184 );
9185 case PyUnicode_2BYTE_KIND:
9186 return ucs2lib_split_whitespace(
9187 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
9188 PyUnicode_GET_LENGTH(self), maxcount
9189 );
9190 case PyUnicode_4BYTE_KIND:
9191 return ucs4lib_split_whitespace(
9192 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
9193 PyUnicode_GET_LENGTH(self), maxcount
9194 );
9195 default:
9196 assert(0);
9197 return NULL;
9198 }
9199
9200 if (PyUnicode_READY(substring) == -1)
9201 return NULL;
9202
9203 kind1 = PyUnicode_KIND(self);
9204 kind2 = PyUnicode_KIND(substring);
9205 kind = kind1 > kind2 ? kind1 : kind2;
9206 buf1 = PyUnicode_DATA(self);
9207 buf2 = PyUnicode_DATA(substring);
9208 if (kind1 != kind)
9209 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9210 if (!buf1)
9211 return NULL;
9212 if (kind2 != kind)
9213 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9214 if (!buf2) {
9215 if (kind1 != kind) PyMem_Free(buf1);
9216 return NULL;
9217 }
9218 len1 = PyUnicode_GET_LENGTH(self);
9219 len2 = PyUnicode_GET_LENGTH(substring);
9220
9221 switch(kind) {
9222 case PyUnicode_1BYTE_KIND:
9223 out = ucs1lib_split(
9224 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9225 break;
9226 case PyUnicode_2BYTE_KIND:
9227 out = ucs2lib_split(
9228 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9229 break;
9230 case PyUnicode_4BYTE_KIND:
9231 out = ucs4lib_split(
9232 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9233 break;
9234 default:
9235 out = NULL;
9236 }
9237 if (kind1 != kind)
9238 PyMem_Free(buf1);
9239 if (kind2 != kind)
9240 PyMem_Free(buf2);
9241 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009242}
9243
Alexander Belopolsky40018472011-02-26 01:02:56 +00009244static PyObject *
9245rsplit(PyUnicodeObject *self,
9246 PyUnicodeObject *substring,
9247 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009248{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009249 int kind1, kind2, kind;
9250 void *buf1, *buf2;
9251 Py_ssize_t len1, len2;
9252 PyObject* out;
9253
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009254 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009255 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009256
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009257 if (PyUnicode_READY(self) == -1)
9258 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009259
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009260 if (substring == NULL)
9261 switch(PyUnicode_KIND(self)) {
9262 case PyUnicode_1BYTE_KIND:
9263 return ucs1lib_rsplit_whitespace(
9264 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9265 PyUnicode_GET_LENGTH(self), maxcount
9266 );
9267 case PyUnicode_2BYTE_KIND:
9268 return ucs2lib_rsplit_whitespace(
9269 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
9270 PyUnicode_GET_LENGTH(self), maxcount
9271 );
9272 case PyUnicode_4BYTE_KIND:
9273 return ucs4lib_rsplit_whitespace(
9274 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
9275 PyUnicode_GET_LENGTH(self), maxcount
9276 );
9277 default:
9278 assert(0);
9279 return NULL;
9280 }
9281
9282 if (PyUnicode_READY(substring) == -1)
9283 return NULL;
9284
9285 kind1 = PyUnicode_KIND(self);
9286 kind2 = PyUnicode_KIND(substring);
9287 kind = kind1 > kind2 ? kind1 : kind2;
9288 buf1 = PyUnicode_DATA(self);
9289 buf2 = PyUnicode_DATA(substring);
9290 if (kind1 != kind)
9291 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9292 if (!buf1)
9293 return NULL;
9294 if (kind2 != kind)
9295 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9296 if (!buf2) {
9297 if (kind1 != kind) PyMem_Free(buf1);
9298 return NULL;
9299 }
9300 len1 = PyUnicode_GET_LENGTH(self);
9301 len2 = PyUnicode_GET_LENGTH(substring);
9302
9303 switch(kind) {
9304 case PyUnicode_1BYTE_KIND:
9305 out = ucs1lib_rsplit(
9306 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9307 break;
9308 case PyUnicode_2BYTE_KIND:
9309 out = ucs2lib_rsplit(
9310 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9311 break;
9312 case PyUnicode_4BYTE_KIND:
9313 out = ucs4lib_rsplit(
9314 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9315 break;
9316 default:
9317 out = NULL;
9318 }
9319 if (kind1 != kind)
9320 PyMem_Free(buf1);
9321 if (kind2 != kind)
9322 PyMem_Free(buf2);
9323 return out;
9324}
9325
9326static Py_ssize_t
9327anylib_find(int kind, void *buf1, Py_ssize_t len1,
9328 void *buf2, Py_ssize_t len2, Py_ssize_t offset)
9329{
9330 switch(kind) {
9331 case PyUnicode_1BYTE_KIND:
9332 return ucs1lib_find(buf1, len1, buf2, len2, offset);
9333 case PyUnicode_2BYTE_KIND:
9334 return ucs2lib_find(buf1, len1, buf2, len2, offset);
9335 case PyUnicode_4BYTE_KIND:
9336 return ucs4lib_find(buf1, len1, buf2, len2, offset);
9337 }
9338 assert(0);
9339 return -1;
9340}
9341
9342static Py_ssize_t
9343anylib_count(int kind, void* sbuf, Py_ssize_t slen,
9344 void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
9345{
9346 switch(kind) {
9347 case PyUnicode_1BYTE_KIND:
9348 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
9349 case PyUnicode_2BYTE_KIND:
9350 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
9351 case PyUnicode_4BYTE_KIND:
9352 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
9353 }
9354 assert(0);
9355 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009356}
9357
Alexander Belopolsky40018472011-02-26 01:02:56 +00009358static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009359replace(PyObject *self, PyObject *str1,
9360 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009361{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009362 PyObject *u;
9363 char *sbuf = PyUnicode_DATA(self);
9364 char *buf1 = PyUnicode_DATA(str1);
9365 char *buf2 = PyUnicode_DATA(str2);
9366 int srelease = 0, release1 = 0, release2 = 0;
9367 int skind = PyUnicode_KIND(self);
9368 int kind1 = PyUnicode_KIND(str1);
9369 int kind2 = PyUnicode_KIND(str2);
9370 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
9371 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
9372 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009373
9374 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009375 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009376 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009377 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009378
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009379 if (skind < kind1)
9380 /* substring too wide to be present */
9381 goto nothing;
9382
9383 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00009384 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009385 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009386 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009387 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009388 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009389 /* replace characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009390 Py_UCS4 u1, u2, maxchar;
9391 int mayshrink, rkind;
9392 u1 = PyUnicode_READ_CHAR(str1, 0);
9393 if (!findchar(sbuf, PyUnicode_KIND(self),
9394 slen, u1, 1))
Thomas Wouters477c8d52006-05-27 19:21:47 +00009395 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009396 u2 = PyUnicode_READ_CHAR(str2, 0);
9397 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9398 /* Replacing u1 with u2 may cause a maxchar reduction in the
9399 result string. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009400 if (u2 > maxchar) {
9401 maxchar = u2;
9402 mayshrink = 0;
9403 }
Victor Stinnerb9275c12011-10-05 14:01:42 +02009404 else
9405 mayshrink = maxchar > 127;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009406 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009407 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009408 goto error;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009409 if (PyUnicode_CopyCharacters(u, 0,
9410 (PyObject*)self, 0, slen) < 0)
9411 {
9412 Py_DECREF(u);
9413 return NULL;
9414 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009415 rkind = PyUnicode_KIND(u);
9416 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
9417 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009418 if (--maxcount < 0)
9419 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009420 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009421 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009422 if (mayshrink) {
9423 PyObject *tmp = u;
9424 u = PyUnicode_FromKindAndData(rkind, PyUnicode_DATA(tmp),
9425 PyUnicode_GET_LENGTH(tmp));
9426 Py_DECREF(tmp);
9427 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009428 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009429 int rkind = skind;
9430 char *res;
9431 if (kind1 < rkind) {
9432 /* widen substring */
9433 buf1 = _PyUnicode_AsKind(str1, rkind);
9434 if (!buf1) goto error;
9435 release1 = 1;
9436 }
9437 i = anylib_find(rkind, sbuf, slen, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009438 if (i < 0)
9439 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009440 if (rkind > kind2) {
9441 /* widen replacement */
9442 buf2 = _PyUnicode_AsKind(str2, rkind);
9443 if (!buf2) goto error;
9444 release2 = 1;
9445 }
9446 else if (rkind < kind2) {
9447 /* widen self and buf1 */
9448 rkind = kind2;
9449 if (release1) PyMem_Free(buf1);
9450 sbuf = _PyUnicode_AsKind(self, rkind);
9451 if (!sbuf) goto error;
9452 srelease = 1;
9453 buf1 = _PyUnicode_AsKind(str1, rkind);
9454 if (!buf1) goto error;
9455 release1 = 1;
9456 }
9457 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, slen));
9458 if (!res) {
9459 PyErr_NoMemory();
9460 goto error;
9461 }
9462 memcpy(res, sbuf, PyUnicode_KIND_SIZE(rkind, slen));
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009463 /* change everything in-place, starting with this one */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009464 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
9465 buf2,
9466 PyUnicode_KIND_SIZE(rkind, len2));
9467 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009468
9469 while ( --maxcount > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009470 i = anylib_find(rkind, sbuf+PyUnicode_KIND_SIZE(rkind, i),
9471 slen-i,
9472 buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009473 if (i == -1)
9474 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009475 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
9476 buf2,
9477 PyUnicode_KIND_SIZE(rkind, len2));
9478 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009479 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009480
9481 u = PyUnicode_FromKindAndData(rkind, res, slen);
9482 PyMem_Free(res);
9483 if (!u) goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009484 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009485 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009486
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009487 Py_ssize_t n, i, j, ires;
9488 Py_ssize_t product, new_size;
9489 int rkind = skind;
9490 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009491
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009492 if (kind1 < rkind) {
9493 buf1 = _PyUnicode_AsKind(str1, rkind);
9494 if (!buf1) goto error;
9495 release1 = 1;
9496 }
9497 n = anylib_count(rkind, sbuf, slen, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009498 if (n == 0)
9499 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009500 if (kind2 < rkind) {
9501 buf2 = _PyUnicode_AsKind(str2, rkind);
9502 if (!buf2) goto error;
9503 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009504 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009505 else if (kind2 > rkind) {
9506 rkind = kind2;
9507 sbuf = _PyUnicode_AsKind(self, rkind);
9508 if (!sbuf) goto error;
9509 srelease = 1;
9510 if (release1) PyMem_Free(buf1);
9511 buf1 = _PyUnicode_AsKind(str1, rkind);
9512 if (!buf1) goto error;
9513 release1 = 1;
9514 }
9515 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
9516 PyUnicode_GET_LENGTH(str1))); */
9517 product = n * (len2-len1);
9518 if ((product / (len2-len1)) != n) {
9519 PyErr_SetString(PyExc_OverflowError,
9520 "replace string is too long");
9521 goto error;
9522 }
9523 new_size = slen + product;
9524 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
9525 PyErr_SetString(PyExc_OverflowError,
9526 "replace string is too long");
9527 goto error;
9528 }
9529 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, new_size));
9530 if (!res)
9531 goto error;
9532 ires = i = 0;
9533 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009534 while (n-- > 0) {
9535 /* look for next match */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009536 j = anylib_find(rkind,
9537 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9538 slen-i, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009539 if (j == -1)
9540 break;
9541 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009542 /* copy unchanged part [i:j] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009543 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9544 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9545 PyUnicode_KIND_SIZE(rkind, j-i));
9546 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009547 }
9548 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009549 if (len2 > 0) {
9550 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9551 buf2,
9552 PyUnicode_KIND_SIZE(rkind, len2));
9553 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009554 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009555 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009556 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009557 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009558 /* copy tail [i:] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009559 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9560 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9561 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009562 } else {
9563 /* interleave */
9564 while (n > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009565 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9566 buf2,
9567 PyUnicode_KIND_SIZE(rkind, len2));
9568 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009569 if (--n <= 0)
9570 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009571 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9572 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9573 PyUnicode_KIND_SIZE(rkind, 1));
9574 ires++;
9575 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009576 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009577 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9578 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9579 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009580 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009581 u = PyUnicode_FromKindAndData(rkind, res, new_size);
Martin v. Löwis0b1d3482011-10-01 16:35:40 +02009582 PyMem_Free(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009583 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009584 if (srelease)
9585 PyMem_FREE(sbuf);
9586 if (release1)
9587 PyMem_FREE(buf1);
9588 if (release2)
9589 PyMem_FREE(buf2);
9590 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009591
Benjamin Peterson29060642009-01-31 22:14:21 +00009592 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00009593 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009594 if (srelease)
9595 PyMem_FREE(sbuf);
9596 if (release1)
9597 PyMem_FREE(buf1);
9598 if (release2)
9599 PyMem_FREE(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009600 if (PyUnicode_CheckExact(self)) {
9601 Py_INCREF(self);
9602 return (PyObject *) self;
9603 }
Victor Stinner034f6cf2011-09-30 02:26:44 +02009604 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009605 error:
9606 if (srelease && sbuf)
9607 PyMem_FREE(sbuf);
9608 if (release1 && buf1)
9609 PyMem_FREE(buf1);
9610 if (release2 && buf2)
9611 PyMem_FREE(buf2);
9612 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009613}
9614
9615/* --- Unicode Object Methods --------------------------------------------- */
9616
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009617PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009618 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009619\n\
9620Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009621characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009622
9623static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009624unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009625{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009626 return fixup(self, fixtitle);
9627}
9628
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009629PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009630 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009631\n\
9632Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00009633have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009634
9635static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009636unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009637{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009638 return fixup(self, fixcapitalize);
9639}
9640
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled (compiled out by the surrounding #if 0).
   Splits self with split(self, NULL, -1), replaces every list item by its
   fixcapitalize'd version, and joins the list with PyUnicode_Join().
   Returns the joined string, or NULL if splitting or a fixup() call fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result;
    Py_ssize_t pos;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping the new object into the list slot */
    for (pos = 0; pos < PyList_GET_SIZE(words); pos++) {
        PyObject *word = PyList_GET_ITEM(words, pos);

        result = fixup((PyUnicodeObject *)word, fixcapitalize);
        if (result == NULL)
            goto done;
        Py_DECREF(word);
        PyList_SET_ITEM(words, pos, result);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

  done:
    /* result is NULL here when a fixup() call failed */
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
9678
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009679/* Argument converter. Coerces to a single unicode character */
9680
9681static int
9682convert_uc(PyObject *obj, void *addr)
9683{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009684 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009685 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009686
Benjamin Peterson14339b62009-01-31 16:36:08 +00009687 uniobj = PyUnicode_FromObject(obj);
9688 if (uniobj == NULL) {
9689 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009690 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009691 return 0;
9692 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009693 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009694 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009695 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009696 Py_DECREF(uniobj);
9697 return 0;
9698 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009699 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009700 Py_DECREF(uniobj);
9701 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009702}
9703
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009704PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009705 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009706\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009707Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009708done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009709
9710static PyObject *
9711unicode_center(PyUnicodeObject *self, PyObject *args)
9712{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009713 Py_ssize_t marg, left;
9714 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009715 Py_UCS4 fillchar = ' ';
9716
Victor Stinnere9a29352011-10-01 02:14:59 +02009717 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009718 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009719
Victor Stinnere9a29352011-10-01 02:14:59 +02009720 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009721 return NULL;
9722
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009723 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009724 Py_INCREF(self);
9725 return (PyObject*) self;
9726 }
9727
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009728 marg = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009729 left = marg / 2 + (marg & width & 1);
9730
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009731 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009732}
9733
Marc-André Lemburge5034372000-08-08 08:04:29 +00009734#if 0
9735
9736/* This code should go into some future Unicode collation support
9737 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00009738 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00009739
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009740/* speedy UTF-16 code point order comparison */
9741/* gleaned from: */
9742/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
9743
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009744static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009745{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009746 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00009747 0, 0, 0, 0, 0, 0, 0, 0,
9748 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009749 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009750};
9751
Guido van Rossumd57fd912000-03-10 22:53:23 +00009752static int
9753unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9754{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009755 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009756
Guido van Rossumd57fd912000-03-10 22:53:23 +00009757 Py_UNICODE *s1 = str1->str;
9758 Py_UNICODE *s2 = str2->str;
9759
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009760 len1 = str1->_base._base.length;
9761 len2 = str2->_base._base.length;
Tim Petersced69f82003-09-16 20:30:58 +00009762
Guido van Rossumd57fd912000-03-10 22:53:23 +00009763 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00009764 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009765
9766 c1 = *s1++;
9767 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00009768
Benjamin Peterson29060642009-01-31 22:14:21 +00009769 if (c1 > (1<<11) * 26)
9770 c1 += utf16Fixup[c1>>11];
9771 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009772 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009773 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00009774
9775 if (c1 != c2)
9776 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00009777
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009778 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009779 }
9780
9781 return (len1 < len2) ? -1 : (len1 != len2);
9782}
9783
Marc-André Lemburge5034372000-08-08 08:04:29 +00009784#else
9785
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009786/* This function assumes that str1 and str2 are readied by the caller. */
9787
Marc-André Lemburge5034372000-08-08 08:04:29 +00009788static int
9789unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9790{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009791 int kind1, kind2;
9792 void *data1, *data2;
9793 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009794
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009795 kind1 = PyUnicode_KIND(str1);
9796 kind2 = PyUnicode_KIND(str2);
9797 data1 = PyUnicode_DATA(str1);
9798 data2 = PyUnicode_DATA(str2);
9799 len1 = PyUnicode_GET_LENGTH(str1);
9800 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +00009801
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009802 for (i = 0; i < len1 && i < len2; ++i) {
9803 Py_UCS4 c1, c2;
9804 c1 = PyUnicode_READ(kind1, data1, i);
9805 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +00009806
9807 if (c1 != c2)
9808 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009809 }
9810
9811 return (len1 < len2) ? -1 : (len1 != len2);
9812}
9813
9814#endif
9815
Alexander Belopolsky40018472011-02-26 01:02:56 +00009816int
9817PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009818{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009819 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9820 if (PyUnicode_READY(left) == -1 ||
9821 PyUnicode_READY(right) == -1)
9822 return -1;
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009823 return unicode_compare((PyUnicodeObject *)left,
9824 (PyUnicodeObject *)right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009825 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009826 PyErr_Format(PyExc_TypeError,
9827 "Can't compare %.100s and %.100s",
9828 left->ob_type->tp_name,
9829 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009830 return -1;
9831}
9832
Martin v. Löwis5b222132007-06-10 09:51:05 +00009833int
9834PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
9835{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009836 Py_ssize_t i;
9837 int kind;
9838 void *data;
9839 Py_UCS4 chr;
9840
Victor Stinner910337b2011-10-03 03:20:16 +02009841 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009842 if (PyUnicode_READY(uni) == -1)
9843 return -1;
9844 kind = PyUnicode_KIND(uni);
9845 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +00009846 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009847 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
9848 if (chr != str[i])
9849 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00009850 /* This check keeps Python strings that end in '\0' from comparing equal
9851 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009852 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +00009853 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009854 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00009855 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009856 return 0;
9857}
9858
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009859
Benjamin Peterson29060642009-01-31 22:14:21 +00009860#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00009861 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009862
Alexander Belopolsky40018472011-02-26 01:02:56 +00009863PyObject *
9864PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009865{
9866 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009867
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009868 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9869 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009870 if (PyUnicode_READY(left) == -1 ||
9871 PyUnicode_READY(right) == -1)
9872 return NULL;
9873 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
9874 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009875 if (op == Py_EQ) {
9876 Py_INCREF(Py_False);
9877 return Py_False;
9878 }
9879 if (op == Py_NE) {
9880 Py_INCREF(Py_True);
9881 return Py_True;
9882 }
9883 }
9884 if (left == right)
9885 result = 0;
9886 else
9887 result = unicode_compare((PyUnicodeObject *)left,
9888 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009889
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009890 /* Convert the return value to a Boolean */
9891 switch (op) {
9892 case Py_EQ:
9893 v = TEST_COND(result == 0);
9894 break;
9895 case Py_NE:
9896 v = TEST_COND(result != 0);
9897 break;
9898 case Py_LE:
9899 v = TEST_COND(result <= 0);
9900 break;
9901 case Py_GE:
9902 v = TEST_COND(result >= 0);
9903 break;
9904 case Py_LT:
9905 v = TEST_COND(result == -1);
9906 break;
9907 case Py_GT:
9908 v = TEST_COND(result == 1);
9909 break;
9910 default:
9911 PyErr_BadArgument();
9912 return NULL;
9913 }
9914 Py_INCREF(v);
9915 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009916 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00009917
Brian Curtindfc80e32011-08-10 20:28:54 -05009918 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009919}
9920
Alexander Belopolsky40018472011-02-26 01:02:56 +00009921int
9922PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00009923{
Thomas Wouters477c8d52006-05-27 19:21:47 +00009924 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009925 int kind1, kind2, kind;
9926 void *buf1, *buf2;
9927 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009928 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009929
9930 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00009931 sub = PyUnicode_FromObject(element);
9932 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009933 PyErr_Format(PyExc_TypeError,
9934 "'in <string>' requires string as left operand, not %s",
9935 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009936 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009937 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009938 if (PyUnicode_READY(sub) == -1)
9939 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009940
Thomas Wouters477c8d52006-05-27 19:21:47 +00009941 str = PyUnicode_FromObject(container);
Victor Stinnere9a29352011-10-01 02:14:59 +02009942 if (!str || PyUnicode_READY(str) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009943 Py_DECREF(sub);
9944 return -1;
9945 }
9946
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009947 kind1 = PyUnicode_KIND(str);
9948 kind2 = PyUnicode_KIND(sub);
9949 kind = kind1 > kind2 ? kind1 : kind2;
9950 buf1 = PyUnicode_DATA(str);
9951 buf2 = PyUnicode_DATA(sub);
9952 if (kind1 != kind)
9953 buf1 = _PyUnicode_AsKind((PyObject*)str, kind);
9954 if (!buf1) {
9955 Py_DECREF(sub);
9956 return -1;
9957 }
9958 if (kind2 != kind)
9959 buf2 = _PyUnicode_AsKind((PyObject*)sub, kind);
9960 if (!buf2) {
9961 Py_DECREF(sub);
9962 if (kind1 != kind) PyMem_Free(buf1);
9963 return -1;
9964 }
9965 len1 = PyUnicode_GET_LENGTH(str);
9966 len2 = PyUnicode_GET_LENGTH(sub);
9967
9968 switch(kind) {
9969 case PyUnicode_1BYTE_KIND:
9970 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
9971 break;
9972 case PyUnicode_2BYTE_KIND:
9973 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
9974 break;
9975 case PyUnicode_4BYTE_KIND:
9976 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
9977 break;
9978 default:
9979 result = -1;
9980 assert(0);
9981 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009982
9983 Py_DECREF(str);
9984 Py_DECREF(sub);
9985
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009986 if (kind1 != kind)
9987 PyMem_Free(buf1);
9988 if (kind2 != kind)
9989 PyMem_Free(buf2);
9990
Guido van Rossum403d68b2000-03-13 15:55:09 +00009991 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009992}
9993
Guido van Rossumd57fd912000-03-10 22:53:23 +00009994/* Concat to string or Unicode object giving a new Unicode object. */
9995
Alexander Belopolsky40018472011-02-26 01:02:56 +00009996PyObject *
9997PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009998{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009999 PyObject *u = NULL, *v = NULL, *w;
10000 Py_UCS4 maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010001
10002 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010003 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010004 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010005 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010006 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010007 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010008 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010009
10010 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010011 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010012 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010013 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010014 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010015 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010016 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010017 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010018 }
10019
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010020 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinnerff9e50f2011-09-28 22:17:19 +020010021 maxchar = Py_MAX(maxchar, PyUnicode_MAX_CHAR_VALUE(v));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010022
Guido van Rossumd57fd912000-03-10 22:53:23 +000010023 /* Concat the two Unicode strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010024 w = PyUnicode_New(
10025 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
10026 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010027 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010028 goto onError;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010029 if (PyUnicode_CopyCharacters(w, 0, u, 0, PyUnicode_GET_LENGTH(u)) < 0)
10030 goto onError;
Victor Stinner157f83f2011-09-28 21:41:31 +020010031 if (PyUnicode_CopyCharacters(w, PyUnicode_GET_LENGTH(u),
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010032 v, 0,
10033 PyUnicode_GET_LENGTH(v)) < 0)
10034 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010035 Py_DECREF(u);
10036 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010037 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010038
Benjamin Peterson29060642009-01-31 22:14:21 +000010039 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010040 Py_XDECREF(u);
10041 Py_XDECREF(v);
10042 return NULL;
10043}
10044
Victor Stinnerb0923652011-10-04 01:17:31 +020010045static void
10046unicode_append_inplace(PyObject **p_left, PyObject *right)
10047{
10048 Py_ssize_t left_len, right_len, new_len;
10049#ifdef Py_DEBUG
10050 Py_ssize_t copied;
10051#endif
10052
10053 assert(PyUnicode_IS_READY(*p_left));
10054 assert(PyUnicode_IS_READY(right));
10055
10056 left_len = PyUnicode_GET_LENGTH(*p_left);
10057 right_len = PyUnicode_GET_LENGTH(right);
10058 if (left_len > PY_SSIZE_T_MAX - right_len) {
10059 PyErr_SetString(PyExc_OverflowError,
10060 "strings are too large to concat");
10061 goto error;
10062 }
10063 new_len = left_len + right_len;
10064
10065 /* Now we own the last reference to 'left', so we can resize it
10066 * in-place.
10067 */
10068 if (unicode_resize(p_left, new_len) != 0) {
10069 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10070 * deallocated so it cannot be put back into
10071 * 'variable'. The MemoryError is raised when there
10072 * is no value in 'variable', which might (very
10073 * remotely) be a cause of incompatibilities.
10074 */
10075 goto error;
10076 }
10077 /* copy 'right' into the newly allocated area of 'left' */
10078#ifdef Py_DEBUG
10079 copied = PyUnicode_CopyCharacters(*p_left, left_len,
10080 right, 0,
10081 right_len);
10082 assert(0 <= copied);
10083#else
10084 PyUnicode_CopyCharacters(*p_left, left_len, right, 0, right_len);
10085#endif
10086 return;
10087
10088error:
10089 Py_DECREF(*p_left);
10090 *p_left = NULL;
10091}
10092
Walter Dörwald1ab83302007-05-18 17:15:44 +000010093void
Victor Stinner23e56682011-10-03 03:54:37 +020010094PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010095{
Victor Stinner23e56682011-10-03 03:54:37 +020010096 PyObject *left, *res;
10097
10098 if (p_left == NULL) {
10099 if (!PyErr_Occurred())
10100 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010101 return;
10102 }
Victor Stinner23e56682011-10-03 03:54:37 +020010103 left = *p_left;
10104 if (right == NULL || !PyUnicode_Check(left)) {
10105 if (!PyErr_Occurred())
10106 PyErr_BadInternalCall();
10107 goto error;
10108 }
10109
Victor Stinnere1335c72011-10-04 20:53:03 +020010110 if (PyUnicode_READY(left))
10111 goto error;
10112 if (PyUnicode_READY(right))
10113 goto error;
10114
Victor Stinner23e56682011-10-03 03:54:37 +020010115 if (PyUnicode_CheckExact(left) && left != unicode_empty
10116 && PyUnicode_CheckExact(right) && right != unicode_empty
10117 && unicode_resizable(left)
10118 && (_PyUnicode_KIND(right) <= _PyUnicode_KIND(left)
10119 || _PyUnicode_WSTR(left) != NULL))
10120 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010121 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10122 to change the structure size, but characters are stored just after
10123 the structure, and so it requires to move all charactres which is
10124 not so different than duplicating the string. */
10125 if (!(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
Victor Stinner23e56682011-10-03 03:54:37 +020010126 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010127 unicode_append_inplace(p_left, right);
Victor Stinner23e56682011-10-03 03:54:37 +020010128 return;
10129 }
10130 }
10131
10132 res = PyUnicode_Concat(left, right);
10133 if (res == NULL)
10134 goto error;
10135 Py_DECREF(left);
10136 *p_left = res;
10137 return;
10138
10139error:
10140 Py_DECREF(*p_left);
10141 *p_left = NULL;
Walter Dörwald1ab83302007-05-18 17:15:44 +000010142}
10143
10144void
10145PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10146{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010147 PyUnicode_Append(pleft, right);
10148 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010149}
10150
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010151PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010152 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010153\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010154Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010155string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010156interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010157
10158static PyObject *
10159unicode_count(PyUnicodeObject *self, PyObject *args)
10160{
10161 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010162 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010163 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010164 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010165 int kind1, kind2, kind;
10166 void *buf1, *buf2;
10167 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010168
Jesus Ceaac451502011-04-20 17:09:23 +020010169 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10170 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010171 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010172
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010173 kind1 = PyUnicode_KIND(self);
10174 kind2 = PyUnicode_KIND(substring);
10175 kind = kind1 > kind2 ? kind1 : kind2;
10176 buf1 = PyUnicode_DATA(self);
10177 buf2 = PyUnicode_DATA(substring);
10178 if (kind1 != kind)
10179 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
10180 if (!buf1) {
10181 Py_DECREF(substring);
10182 return NULL;
10183 }
10184 if (kind2 != kind)
10185 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
10186 if (!buf2) {
10187 Py_DECREF(substring);
10188 if (kind1 != kind) PyMem_Free(buf1);
10189 return NULL;
10190 }
10191 len1 = PyUnicode_GET_LENGTH(self);
10192 len2 = PyUnicode_GET_LENGTH(substring);
10193
10194 ADJUST_INDICES(start, end, len1);
10195 switch(kind) {
10196 case PyUnicode_1BYTE_KIND:
10197 iresult = ucs1lib_count(
10198 ((Py_UCS1*)buf1) + start, end - start,
10199 buf2, len2, PY_SSIZE_T_MAX
10200 );
10201 break;
10202 case PyUnicode_2BYTE_KIND:
10203 iresult = ucs2lib_count(
10204 ((Py_UCS2*)buf1) + start, end - start,
10205 buf2, len2, PY_SSIZE_T_MAX
10206 );
10207 break;
10208 case PyUnicode_4BYTE_KIND:
10209 iresult = ucs4lib_count(
10210 ((Py_UCS4*)buf1) + start, end - start,
10211 buf2, len2, PY_SSIZE_T_MAX
10212 );
10213 break;
10214 default:
10215 assert(0); iresult = 0;
10216 }
10217
10218 result = PyLong_FromSsize_t(iresult);
10219
10220 if (kind1 != kind)
10221 PyMem_Free(buf1);
10222 if (kind2 != kind)
10223 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010224
10225 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010226
Guido van Rossumd57fd912000-03-10 22:53:23 +000010227 return result;
10228}
10229
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010230PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010231 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010232\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010233Encode S using the codec registered for encoding. Default encoding\n\
10234is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010235handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010236a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10237'xmlcharrefreplace' as well as any other name registered with\n\
10238codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010239
10240static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +000010241unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010242{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010243 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010244 char *encoding = NULL;
10245 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010246
Benjamin Peterson308d6372009-09-18 21:42:35 +000010247 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10248 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010249 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +000010250 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010251}
10252
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010253PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010254 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010255\n\
10256Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010257If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010258
10259static PyObject*
10260unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
10261{
Antoine Pitroue71d5742011-10-04 15:55:09 +020010262 Py_ssize_t i, j, line_pos, src_len, incr;
10263 Py_UCS4 ch;
10264 PyObject *u;
10265 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010266 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010267 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010268 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010269
10270 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010271 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010272
Antoine Pitrou22425222011-10-04 19:10:51 +020010273 if (PyUnicode_READY(self) == -1)
10274 return NULL;
10275
Thomas Wouters7e474022000-07-16 12:04:32 +000010276 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010277 src_len = PyUnicode_GET_LENGTH(self);
10278 i = j = line_pos = 0;
10279 kind = PyUnicode_KIND(self);
10280 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020010281 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010282 for (; i < src_len; i++) {
10283 ch = PyUnicode_READ(kind, src_data, i);
10284 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020010285 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000010286 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010287 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000010288 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010289 goto overflow;
10290 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010291 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010292 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010293 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010294 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000010295 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010296 goto overflow;
10297 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010298 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010299 if (ch == '\n' || ch == '\r')
10300 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010301 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010302 }
Antoine Pitroue19aa382011-10-04 16:04:01 +020010303 if (!found && PyUnicode_CheckExact(self)) {
10304 Py_INCREF((PyObject *) self);
10305 return (PyObject *) self;
10306 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +000010307
Guido van Rossumd57fd912000-03-10 22:53:23 +000010308 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010309 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010310 if (!u)
10311 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010312 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010313
Antoine Pitroue71d5742011-10-04 15:55:09 +020010314 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010315
Antoine Pitroue71d5742011-10-04 15:55:09 +020010316 for (; i < src_len; i++) {
10317 ch = PyUnicode_READ(kind, src_data, i);
10318 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010319 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010320 incr = tabsize - (line_pos % tabsize);
10321 line_pos += incr;
10322 while (incr--) {
10323 PyUnicode_WRITE(kind, dest_data, j, ' ');
10324 j++;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010325 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010326 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010327 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010328 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010329 line_pos++;
10330 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010331 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010332 if (ch == '\n' || ch == '\r')
10333 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010334 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010335 }
10336 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinner17efeed2011-10-04 20:05:46 +020010337#ifndef DONT_MAKE_RESULT_READY
10338 if (_PyUnicode_READY_REPLACE(&u)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010339 Py_DECREF(u);
10340 return NULL;
10341 }
Victor Stinner17efeed2011-10-04 20:05:46 +020010342#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +000010343 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010344
Antoine Pitroue71d5742011-10-04 15:55:09 +020010345 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010346 PyErr_SetString(PyExc_OverflowError, "new string is too long");
10347 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010348}
10349
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010350PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010351 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010352\n\
10353Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010354such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010355arguments start and end are interpreted as in slice notation.\n\
10356\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010357Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010358
10359static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010360unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010361{
Jesus Ceaac451502011-04-20 17:09:23 +020010362 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010363 Py_ssize_t start;
10364 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010365 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010366
Jesus Ceaac451502011-04-20 17:09:23 +020010367 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
10368 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010369 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010370
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010371 if (PyUnicode_READY(self) == -1)
10372 return NULL;
10373 if (PyUnicode_READY(substring) == -1)
10374 return NULL;
10375
10376 result = any_find_slice(
10377 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
10378 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010379 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010380
10381 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010382
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010383 if (result == -2)
10384 return NULL;
10385
Christian Heimes217cfd12007-12-02 14:31:20 +000010386 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010387}
10388
10389static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010390unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010391{
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010392 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
10393 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010394 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010395 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010396}
10397
Guido van Rossumc2504932007-09-18 19:42:40 +000010398/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010010399 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000010400static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +000010401unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010402{
Guido van Rossumc2504932007-09-18 19:42:40 +000010403 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010010404 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010405
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010406 if (_PyUnicode_HASH(self) != -1)
10407 return _PyUnicode_HASH(self);
10408 if (PyUnicode_READY(self) == -1)
10409 return -1;
10410 len = PyUnicode_GET_LENGTH(self);
10411
10412 /* The hash function as a macro, gets expanded three times below. */
10413#define HASH(P) \
10414 x = (Py_uhash_t)*P << 7; \
10415 while (--len >= 0) \
10416 x = (1000003*x) ^ (Py_uhash_t)*P++;
10417
10418 switch (PyUnicode_KIND(self)) {
10419 case PyUnicode_1BYTE_KIND: {
10420 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
10421 HASH(c);
10422 break;
10423 }
10424 case PyUnicode_2BYTE_KIND: {
10425 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
10426 HASH(s);
10427 break;
10428 }
10429 default: {
10430 Py_UCS4 *l;
10431 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
10432 "Impossible switch case in unicode_hash");
10433 l = PyUnicode_4BYTE_DATA(self);
10434 HASH(l);
10435 break;
10436 }
10437 }
10438 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
10439
Guido van Rossumc2504932007-09-18 19:42:40 +000010440 if (x == -1)
10441 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010442 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010443 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010444}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010445#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000010446
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010447PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010448 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010449\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010450Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010451
10452static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010453unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010454{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010455 Py_ssize_t result;
Jesus Ceaac451502011-04-20 17:09:23 +020010456 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010457 Py_ssize_t start;
10458 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010459
Jesus Ceaac451502011-04-20 17:09:23 +020010460 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
10461 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010462 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010463
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010464 if (PyUnicode_READY(self) == -1)
10465 return NULL;
10466 if (PyUnicode_READY(substring) == -1)
10467 return NULL;
10468
10469 result = any_find_slice(
10470 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
10471 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010472 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010473
10474 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010475
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010476 if (result == -2)
10477 return NULL;
10478
Guido van Rossumd57fd912000-03-10 22:53:23 +000010479 if (result < 0) {
10480 PyErr_SetString(PyExc_ValueError, "substring not found");
10481 return NULL;
10482 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010483
Christian Heimes217cfd12007-12-02 14:31:20 +000010484 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010485}
10486
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010487PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010488 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010489\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010490Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010491at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010492
10493static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010494unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010495{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010496 Py_ssize_t i, length;
10497 int kind;
10498 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010499 int cased;
10500
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010501 if (PyUnicode_READY(self) == -1)
10502 return NULL;
10503 length = PyUnicode_GET_LENGTH(self);
10504 kind = PyUnicode_KIND(self);
10505 data = PyUnicode_DATA(self);
10506
Guido van Rossumd57fd912000-03-10 22:53:23 +000010507 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010508 if (length == 1)
10509 return PyBool_FromLong(
10510 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010511
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010512 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010513 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010514 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010515
Guido van Rossumd57fd912000-03-10 22:53:23 +000010516 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010517 for (i = 0; i < length; i++) {
10518 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010519
Benjamin Peterson29060642009-01-31 22:14:21 +000010520 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
10521 return PyBool_FromLong(0);
10522 else if (!cased && Py_UNICODE_ISLOWER(ch))
10523 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010524 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010525 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010526}
10527
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010528PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010529 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010530\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010531Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010532at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010533
10534static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010535unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010536{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010537 Py_ssize_t i, length;
10538 int kind;
10539 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010540 int cased;
10541
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010542 if (PyUnicode_READY(self) == -1)
10543 return NULL;
10544 length = PyUnicode_GET_LENGTH(self);
10545 kind = PyUnicode_KIND(self);
10546 data = PyUnicode_DATA(self);
10547
Guido van Rossumd57fd912000-03-10 22:53:23 +000010548 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010549 if (length == 1)
10550 return PyBool_FromLong(
10551 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010552
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010553 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010554 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010555 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010556
Guido van Rossumd57fd912000-03-10 22:53:23 +000010557 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010558 for (i = 0; i < length; i++) {
10559 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010560
Benjamin Peterson29060642009-01-31 22:14:21 +000010561 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
10562 return PyBool_FromLong(0);
10563 else if (!cased && Py_UNICODE_ISUPPER(ch))
10564 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010565 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010566 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010567}
10568
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010569PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010570 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010571\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010572Return True if S is a titlecased string and there is at least one\n\
10573character in S, i.e. upper- and titlecase characters may only\n\
10574follow uncased characters and lowercase characters only cased ones.\n\
10575Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010576
10577static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010578unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010579{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010580 Py_ssize_t i, length;
10581 int kind;
10582 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010583 int cased, previous_is_cased;
10584
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010585 if (PyUnicode_READY(self) == -1)
10586 return NULL;
10587 length = PyUnicode_GET_LENGTH(self);
10588 kind = PyUnicode_KIND(self);
10589 data = PyUnicode_DATA(self);
10590
Guido van Rossumd57fd912000-03-10 22:53:23 +000010591 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010592 if (length == 1) {
10593 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10594 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
10595 (Py_UNICODE_ISUPPER(ch) != 0));
10596 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010597
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010598 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010599 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010600 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010601
Guido van Rossumd57fd912000-03-10 22:53:23 +000010602 cased = 0;
10603 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010604 for (i = 0; i < length; i++) {
10605 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010606
Benjamin Peterson29060642009-01-31 22:14:21 +000010607 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
10608 if (previous_is_cased)
10609 return PyBool_FromLong(0);
10610 previous_is_cased = 1;
10611 cased = 1;
10612 }
10613 else if (Py_UNICODE_ISLOWER(ch)) {
10614 if (!previous_is_cased)
10615 return PyBool_FromLong(0);
10616 previous_is_cased = 1;
10617 cased = 1;
10618 }
10619 else
10620 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010621 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010622 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010623}
10624
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010625PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010626 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010627\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010628Return True if all characters in S are whitespace\n\
10629and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010630
10631static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010632unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010633{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010634 Py_ssize_t i, length;
10635 int kind;
10636 void *data;
10637
10638 if (PyUnicode_READY(self) == -1)
10639 return NULL;
10640 length = PyUnicode_GET_LENGTH(self);
10641 kind = PyUnicode_KIND(self);
10642 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010643
Guido van Rossumd57fd912000-03-10 22:53:23 +000010644 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010645 if (length == 1)
10646 return PyBool_FromLong(
10647 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010648
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010649 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010650 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010651 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010652
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010653 for (i = 0; i < length; i++) {
10654 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010655 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010656 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010657 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010658 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010659}
10660
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010661PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010662 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010663\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010664Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010665and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010666
10667static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010668unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010669{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010670 Py_ssize_t i, length;
10671 int kind;
10672 void *data;
10673
10674 if (PyUnicode_READY(self) == -1)
10675 return NULL;
10676 length = PyUnicode_GET_LENGTH(self);
10677 kind = PyUnicode_KIND(self);
10678 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010679
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010680 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010681 if (length == 1)
10682 return PyBool_FromLong(
10683 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010684
10685 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010686 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010687 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010688
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010689 for (i = 0; i < length; i++) {
10690 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010691 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010692 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010693 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010694}
10695
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010696PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010697 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010698\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010699Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010700and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010701
10702static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010703unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010704{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010705 int kind;
10706 void *data;
10707 Py_ssize_t len, i;
10708
10709 if (PyUnicode_READY(self) == -1)
10710 return NULL;
10711
10712 kind = PyUnicode_KIND(self);
10713 data = PyUnicode_DATA(self);
10714 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010715
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010716 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010717 if (len == 1) {
10718 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10719 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
10720 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010721
10722 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010723 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010724 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010725
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010726 for (i = 0; i < len; i++) {
10727 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010728 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010729 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010730 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010731 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010732}
10733
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010734PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010735 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010736\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010737Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010738False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010739
10740static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010741unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010742{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010743 Py_ssize_t i, length;
10744 int kind;
10745 void *data;
10746
10747 if (PyUnicode_READY(self) == -1)
10748 return NULL;
10749 length = PyUnicode_GET_LENGTH(self);
10750 kind = PyUnicode_KIND(self);
10751 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010752
Guido van Rossumd57fd912000-03-10 22:53:23 +000010753 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010754 if (length == 1)
10755 return PyBool_FromLong(
10756 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010757
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010758 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010759 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010760 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010761
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010762 for (i = 0; i < length; i++) {
10763 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010764 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010765 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010766 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010767}
10768
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010769PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010770 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010771\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010772Return True if all characters in S are digits\n\
10773and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010774
10775static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010776unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010777{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010778 Py_ssize_t i, length;
10779 int kind;
10780 void *data;
10781
10782 if (PyUnicode_READY(self) == -1)
10783 return NULL;
10784 length = PyUnicode_GET_LENGTH(self);
10785 kind = PyUnicode_KIND(self);
10786 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010787
Guido van Rossumd57fd912000-03-10 22:53:23 +000010788 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010789 if (length == 1) {
10790 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10791 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
10792 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010793
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010794 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010795 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010796 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010797
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010798 for (i = 0; i < length; i++) {
10799 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010800 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010801 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010802 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010803}
10804
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010805PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010806 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010807\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010808Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010809False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010810
10811static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010812unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010813{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010814 Py_ssize_t i, length;
10815 int kind;
10816 void *data;
10817
10818 if (PyUnicode_READY(self) == -1)
10819 return NULL;
10820 length = PyUnicode_GET_LENGTH(self);
10821 kind = PyUnicode_KIND(self);
10822 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010823
Guido van Rossumd57fd912000-03-10 22:53:23 +000010824 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010825 if (length == 1)
10826 return PyBool_FromLong(
10827 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010828
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010829 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010830 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010831 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010832
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010833 for (i = 0; i < length; i++) {
10834 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010835 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010836 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010837 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010838}
10839
Martin v. Löwis47383402007-08-15 07:32:56 +000010840int
10841PyUnicode_IsIdentifier(PyObject *self)
10842{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010843 int kind;
10844 void *data;
10845 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010846 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000010847
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010848 if (PyUnicode_READY(self) == -1) {
10849 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000010850 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010851 }
10852
10853 /* Special case for empty strings */
10854 if (PyUnicode_GET_LENGTH(self) == 0)
10855 return 0;
10856 kind = PyUnicode_KIND(self);
10857 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000010858
10859 /* PEP 3131 says that the first character must be in
10860 XID_Start and subsequent characters in XID_Continue,
10861 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000010862 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000010863 letters, digits, underscore). However, given the current
10864 definition of XID_Start and XID_Continue, it is sufficient
10865 to check just for these, except that _ must be allowed
10866 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010867 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050010868 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000010869 return 0;
10870
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040010871 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010872 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010873 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000010874 return 1;
10875}
10876
10877PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010878 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000010879\n\
10880Return True if S is a valid identifier according\n\
10881to the language definition.");
10882
10883static PyObject*
10884unicode_isidentifier(PyObject *self)
10885{
10886 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
10887}
10888
Georg Brandl559e5d72008-06-11 18:37:52 +000010889PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010890 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000010891\n\
10892Return True if all characters in S are considered\n\
10893printable in repr() or S is empty, False otherwise.");
10894
10895static PyObject*
10896unicode_isprintable(PyObject *self)
10897{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010898 Py_ssize_t i, length;
10899 int kind;
10900 void *data;
10901
10902 if (PyUnicode_READY(self) == -1)
10903 return NULL;
10904 length = PyUnicode_GET_LENGTH(self);
10905 kind = PyUnicode_KIND(self);
10906 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000010907
10908 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010909 if (length == 1)
10910 return PyBool_FromLong(
10911 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000010912
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010913 for (i = 0; i < length; i++) {
10914 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000010915 Py_RETURN_FALSE;
10916 }
10917 }
10918 Py_RETURN_TRUE;
10919}
10920
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010921PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000010922 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010923\n\
10924Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000010925iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010926
10927static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010928unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010929{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010930 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010931}
10932
Martin v. Löwis18e16552006-02-15 17:27:45 +000010933static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +000010934unicode_length(PyUnicodeObject *self)
10935{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010936 if (PyUnicode_READY(self) == -1)
10937 return -1;
10938 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010939}
10940
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010941PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010942 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010943\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000010944Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010945done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010946
10947static PyObject *
10948unicode_ljust(PyUnicodeObject *self, PyObject *args)
10949{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010950 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010951 Py_UCS4 fillchar = ' ';
10952
10953 if (PyUnicode_READY(self) == -1)
10954 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010955
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010956 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010957 return NULL;
10958
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010959 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010960 Py_INCREF(self);
10961 return (PyObject*) self;
10962 }
10963
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010964 return (PyObject*) pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010965}
10966
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010967PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010968 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010969\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010970Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010971
10972static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010973unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010974{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010975 return fixup(self, fixlower);
10976}
10977
/* Strip modes; the values double as indexes into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for a strip mode: the format string with its three-character
   "|O:" prefix skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
10986
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010987/* externally visible for str.strip(unicode) */
10988PyObject *
10989_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
10990{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010991 void *data;
10992 int kind;
10993 Py_ssize_t i, j, len;
10994 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010995
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010996 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
10997 return NULL;
10998
10999 kind = PyUnicode_KIND(self);
11000 data = PyUnicode_DATA(self);
11001 len = PyUnicode_GET_LENGTH(self);
11002 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11003 PyUnicode_DATA(sepobj),
11004 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011005
Benjamin Peterson14339b62009-01-31 16:36:08 +000011006 i = 0;
11007 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011008 while (i < len &&
11009 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011010 i++;
11011 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011012 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011013
Benjamin Peterson14339b62009-01-31 16:36:08 +000011014 j = len;
11015 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011016 do {
11017 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011018 } while (j >= i &&
11019 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000011020 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011021 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011022
Victor Stinner12bab6d2011-10-01 01:53:49 +020011023 return PyUnicode_Substring((PyObject*)self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011024}
11025
11026PyObject*
11027PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11028{
11029 unsigned char *data;
11030 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011031 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011032
Victor Stinnerde636f32011-10-01 03:55:54 +020011033 if (PyUnicode_READY(self) == -1)
11034 return NULL;
11035
11036 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
11037
Victor Stinner12bab6d2011-10-01 01:53:49 +020011038 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011039 {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011040 if (PyUnicode_CheckExact(self)) {
11041 Py_INCREF(self);
11042 return self;
11043 }
11044 else
11045 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011046 }
11047
Victor Stinner12bab6d2011-10-01 01:53:49 +020011048 length = end - start;
11049 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011050 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011051
Victor Stinnerde636f32011-10-01 03:55:54 +020011052 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011053 PyErr_SetString(PyExc_IndexError, "string index out of range");
11054 return NULL;
11055 }
11056
Victor Stinnerb9275c12011-10-05 14:01:42 +020011057 if (PyUnicode_IS_ASCII(self)) {
11058 kind = PyUnicode_KIND(self);
11059 data = PyUnicode_1BYTE_DATA(self);
11060 return unicode_fromascii(data + start, length);
11061 }
11062 else {
11063 kind = PyUnicode_KIND(self);
11064 data = PyUnicode_1BYTE_DATA(self);
11065 return PyUnicode_FromKindAndData(kind,
11066 data + PyUnicode_KIND_SIZE(kind, start),
11067 length);
11068 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011069}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011070
11071static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011072do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011073{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011074 int kind;
11075 void *data;
11076 Py_ssize_t len, i, j;
11077
11078 if (PyUnicode_READY(self) == -1)
11079 return NULL;
11080
11081 kind = PyUnicode_KIND(self);
11082 data = PyUnicode_DATA(self);
11083 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011084
Benjamin Peterson14339b62009-01-31 16:36:08 +000011085 i = 0;
11086 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011087 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011088 i++;
11089 }
11090 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011091
Benjamin Peterson14339b62009-01-31 16:36:08 +000011092 j = len;
11093 if (striptype != LEFTSTRIP) {
11094 do {
11095 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011096 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011097 j++;
11098 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011099
Victor Stinner12bab6d2011-10-01 01:53:49 +020011100 return PyUnicode_Substring((PyObject*)self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011101}
11102
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011103
11104static PyObject *
11105do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
11106{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011107 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011108
Benjamin Peterson14339b62009-01-31 16:36:08 +000011109 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11110 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011111
Benjamin Peterson14339b62009-01-31 16:36:08 +000011112 if (sep != NULL && sep != Py_None) {
11113 if (PyUnicode_Check(sep))
11114 return _PyUnicode_XStrip(self, striptype, sep);
11115 else {
11116 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011117 "%s arg must be None or str",
11118 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011119 return NULL;
11120 }
11121 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011122
Benjamin Peterson14339b62009-01-31 16:36:08 +000011123 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011124}
11125
11126
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011127PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011128 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011129\n\
11130Return a copy of the string S with leading and trailing\n\
11131whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011132If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011133
11134static PyObject *
11135unicode_strip(PyUnicodeObject *self, PyObject *args)
11136{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011137 if (PyTuple_GET_SIZE(args) == 0)
11138 return do_strip(self, BOTHSTRIP); /* Common case */
11139 else
11140 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011141}
11142
11143
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011144PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011145 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011146\n\
11147Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011148If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011149
11150static PyObject *
11151unicode_lstrip(PyUnicodeObject *self, PyObject *args)
11152{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011153 if (PyTuple_GET_SIZE(args) == 0)
11154 return do_strip(self, LEFTSTRIP); /* Common case */
11155 else
11156 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011157}
11158
11159
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011160PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011161 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011162\n\
11163Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011164If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011165
11166static PyObject *
11167unicode_rstrip(PyUnicodeObject *self, PyObject *args)
11168{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011169 if (PyTuple_GET_SIZE(args) == 0)
11170 return do_strip(self, RIGHTSTRIP); /* Common case */
11171 else
11172 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011173}
11174
11175
Guido van Rossumd57fd912000-03-10 22:53:23 +000011176static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +000011177unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011178{
11179 PyUnicodeObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011180 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011181
Georg Brandl222de0f2009-04-12 12:01:50 +000011182 if (len < 1) {
11183 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020011184 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000011185 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011186
Tim Peters7a29bd52001-09-12 03:03:31 +000011187 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011188 /* no repeat, return original string */
11189 Py_INCREF(str);
11190 return (PyObject*) str;
11191 }
Tim Peters8f422462000-09-09 06:13:41 +000011192
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011193 if (PyUnicode_READY(str) == -1)
11194 return NULL;
11195
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011196 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011197 PyErr_SetString(PyExc_OverflowError,
11198 "repeated string is too long");
11199 return NULL;
11200 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011201 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011202
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011203 u = (PyUnicodeObject *)PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011204 if (!u)
11205 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011206 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011207
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011208 if (PyUnicode_GET_LENGTH(str) == 1) {
11209 const int kind = PyUnicode_KIND(str);
11210 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
11211 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011212 if (kind == PyUnicode_1BYTE_KIND)
11213 memset(to, (unsigned char)fill_char, len);
11214 else {
11215 for (n = 0; n < len; ++n)
11216 PyUnicode_WRITE(kind, to, n, fill_char);
11217 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011218 }
11219 else {
11220 /* number of characters copied this far */
11221 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
11222 const Py_ssize_t char_size = PyUnicode_CHARACTER_SIZE(str);
11223 char *to = (char *) PyUnicode_DATA(u);
11224 Py_MEMCPY(to, PyUnicode_DATA(str),
11225 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011226 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011227 n = (done <= nchars-done) ? done : nchars-done;
11228 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011229 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011230 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011231 }
11232
11233 return (PyObject*) u;
11234}
11235
Alexander Belopolsky40018472011-02-26 01:02:56 +000011236PyObject *
11237PyUnicode_Replace(PyObject *obj,
11238 PyObject *subobj,
11239 PyObject *replobj,
11240 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011241{
11242 PyObject *self;
11243 PyObject *str1;
11244 PyObject *str2;
11245 PyObject *result;
11246
11247 self = PyUnicode_FromObject(obj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011248 if (self == NULL || PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011249 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011250 str1 = PyUnicode_FromObject(subobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011251 if (str1 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011252 Py_DECREF(self);
11253 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011254 }
11255 str2 = PyUnicode_FromObject(replobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011256 if (str2 == NULL || PyUnicode_READY(str2)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011257 Py_DECREF(self);
11258 Py_DECREF(str1);
11259 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011260 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011261 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011262 Py_DECREF(self);
11263 Py_DECREF(str1);
11264 Py_DECREF(str2);
11265 return result;
11266}
11267
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011268PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011269 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011270\n\
11271Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011272old replaced by new. If the optional argument count is\n\
11273given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011274
11275static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011276unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011277{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011278 PyObject *str1;
11279 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011280 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011281 PyObject *result;
11282
Martin v. Löwis18e16552006-02-15 17:27:45 +000011283 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011284 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011285 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011286 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011287 str1 = PyUnicode_FromObject(str1);
11288 if (str1 == NULL || PyUnicode_READY(str1) == -1)
11289 return NULL;
11290 str2 = PyUnicode_FromObject(str2);
Victor Stinnere9a29352011-10-01 02:14:59 +020011291 if (str2 == NULL || PyUnicode_READY(str2) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011292 Py_DECREF(str1);
11293 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011294 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011295
11296 result = replace(self, str1, str2, maxcount);
11297
11298 Py_DECREF(str1);
11299 Py_DECREF(str2);
11300 return result;
11301}
11302
Alexander Belopolsky40018472011-02-26 01:02:56 +000011303static PyObject *
11304unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011305{
Walter Dörwald79e913e2007-05-12 11:08:06 +000011306 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011307 Py_ssize_t isize;
11308 Py_ssize_t osize, squote, dquote, i, o;
11309 Py_UCS4 max, quote;
11310 int ikind, okind;
11311 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000011312
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011313 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000011314 return NULL;
11315
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011316 isize = PyUnicode_GET_LENGTH(unicode);
11317 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011318
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011319 /* Compute length of output, quote characters, and
11320 maximum character */
11321 osize = 2; /* quotes */
11322 max = 127;
11323 squote = dquote = 0;
11324 ikind = PyUnicode_KIND(unicode);
11325 for (i = 0; i < isize; i++) {
11326 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11327 switch (ch) {
11328 case '\'': squote++; osize++; break;
11329 case '"': dquote++; osize++; break;
11330 case '\\': case '\t': case '\r': case '\n':
11331 osize += 2; break;
11332 default:
11333 /* Fast-path ASCII */
11334 if (ch < ' ' || ch == 0x7f)
11335 osize += 4; /* \xHH */
11336 else if (ch < 0x7f)
11337 osize++;
11338 else if (Py_UNICODE_ISPRINTABLE(ch)) {
11339 osize++;
11340 max = ch > max ? ch : max;
11341 }
11342 else if (ch < 0x100)
11343 osize += 4; /* \xHH */
11344 else if (ch < 0x10000)
11345 osize += 6; /* \uHHHH */
11346 else
11347 osize += 10; /* \uHHHHHHHH */
11348 }
11349 }
11350
11351 quote = '\'';
11352 if (squote) {
11353 if (dquote)
11354 /* Both squote and dquote present. Use squote,
11355 and escape them */
11356 osize += squote;
11357 else
11358 quote = '"';
11359 }
11360
11361 repr = PyUnicode_New(osize, max);
11362 if (repr == NULL)
11363 return NULL;
11364 okind = PyUnicode_KIND(repr);
11365 odata = PyUnicode_DATA(repr);
11366
11367 PyUnicode_WRITE(okind, odata, 0, quote);
11368 PyUnicode_WRITE(okind, odata, osize-1, quote);
11369
11370 for (i = 0, o = 1; i < isize; i++) {
11371 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011372
11373 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011374 if ((ch == quote) || (ch == '\\')) {
11375 PyUnicode_WRITE(okind, odata, o++, '\\');
11376 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011377 continue;
11378 }
11379
Benjamin Peterson29060642009-01-31 22:14:21 +000011380 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011381 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011382 PyUnicode_WRITE(okind, odata, o++, '\\');
11383 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011384 }
11385 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011386 PyUnicode_WRITE(okind, odata, o++, '\\');
11387 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011388 }
11389 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011390 PyUnicode_WRITE(okind, odata, o++, '\\');
11391 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011392 }
11393
11394 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011395 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011396 PyUnicode_WRITE(okind, odata, o++, '\\');
11397 PyUnicode_WRITE(okind, odata, o++, 'x');
11398 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
11399 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011400 }
11401
Georg Brandl559e5d72008-06-11 18:37:52 +000011402 /* Copy ASCII characters as-is */
11403 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011404 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011405 }
11406
Benjamin Peterson29060642009-01-31 22:14:21 +000011407 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000011408 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011409 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000011410 (categories Z* and C* except ASCII space)
11411 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011412 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011413 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011414 if (ch <= 0xff) {
11415 PyUnicode_WRITE(okind, odata, o++, '\\');
11416 PyUnicode_WRITE(okind, odata, o++, 'x');
11417 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
11418 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011419 }
11420 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011421 else if (ch >= 0x10000) {
11422 PyUnicode_WRITE(okind, odata, o++, '\\');
11423 PyUnicode_WRITE(okind, odata, o++, 'U');
11424 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 28) & 0xF]);
11425 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 24) & 0xF]);
11426 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 20) & 0xF]);
11427 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 16) & 0xF]);
11428 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
11429 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
11430 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
11431 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011432 }
11433 /* Map 16-bit characters to '\uxxxx' */
11434 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011435 PyUnicode_WRITE(okind, odata, o++, '\\');
11436 PyUnicode_WRITE(okind, odata, o++, 'u');
11437 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
11438 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
11439 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
11440 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011441 }
11442 }
11443 /* Copy characters as-is */
11444 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011445 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011446 }
11447 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000011448 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011449 /* Closing quote already added at the beginning */
Walter Dörwald79e913e2007-05-12 11:08:06 +000011450 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011451}
11452
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011453PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011454 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011455\n\
11456Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011457such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011458arguments start and end are interpreted as in slice notation.\n\
11459\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011460Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011461
11462static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011463unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011464{
Jesus Ceaac451502011-04-20 17:09:23 +020011465 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011466 Py_ssize_t start;
11467 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011468 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011469
Jesus Ceaac451502011-04-20 17:09:23 +020011470 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
11471 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011472 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011473
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011474 if (PyUnicode_READY(self) == -1)
11475 return NULL;
11476 if (PyUnicode_READY(substring) == -1)
11477 return NULL;
11478
11479 result = any_find_slice(
11480 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
11481 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011482 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011483
11484 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011485
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011486 if (result == -2)
11487 return NULL;
11488
Christian Heimes217cfd12007-12-02 14:31:20 +000011489 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011490}
11491
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011492PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011493 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011494\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011495Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011496
11497static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011498unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011499{
Jesus Ceaac451502011-04-20 17:09:23 +020011500 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011501 Py_ssize_t start;
11502 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011503 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011504
Jesus Ceaac451502011-04-20 17:09:23 +020011505 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
11506 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011507 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011508
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011509 if (PyUnicode_READY(self) == -1)
11510 return NULL;
11511 if (PyUnicode_READY(substring) == -1)
11512 return NULL;
11513
11514 result = any_find_slice(
11515 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
11516 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011517 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011518
11519 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011520
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011521 if (result == -2)
11522 return NULL;
11523
Guido van Rossumd57fd912000-03-10 22:53:23 +000011524 if (result < 0) {
11525 PyErr_SetString(PyExc_ValueError, "substring not found");
11526 return NULL;
11527 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011528
Christian Heimes217cfd12007-12-02 14:31:20 +000011529 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011530}
11531
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011532PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011533 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011534\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011535Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011536done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011537
11538static PyObject *
11539unicode_rjust(PyUnicodeObject *self, PyObject *args)
11540{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011541 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011542 Py_UCS4 fillchar = ' ';
11543
Victor Stinnere9a29352011-10-01 02:14:59 +020011544 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011545 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011546
Victor Stinnere9a29352011-10-01 02:14:59 +020011547 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011548 return NULL;
11549
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011550 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011551 Py_INCREF(self);
11552 return (PyObject*) self;
11553 }
11554
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011555 return (PyObject*) pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011556}
11557
Alexander Belopolsky40018472011-02-26 01:02:56 +000011558PyObject *
11559PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011560{
11561 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000011562
Guido van Rossumd57fd912000-03-10 22:53:23 +000011563 s = PyUnicode_FromObject(s);
11564 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011565 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011566 if (sep != NULL) {
11567 sep = PyUnicode_FromObject(sep);
11568 if (sep == NULL) {
11569 Py_DECREF(s);
11570 return NULL;
11571 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011572 }
11573
11574 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
11575
11576 Py_DECREF(s);
11577 Py_XDECREF(sep);
11578 return result;
11579}
11580
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011581PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011582 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011583\n\
11584Return a list of the words in S, using sep as the\n\
11585delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000011586splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000011587whitespace string is a separator and empty strings are\n\
11588removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011589
11590static PyObject*
11591unicode_split(PyUnicodeObject *self, PyObject *args)
11592{
11593 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011594 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011595
Martin v. Löwis18e16552006-02-15 17:27:45 +000011596 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011597 return NULL;
11598
11599 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011600 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011601 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000011602 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011603 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011604 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011605}
11606
Thomas Wouters477c8d52006-05-27 19:21:47 +000011607PyObject *
11608PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
11609{
11610 PyObject* str_obj;
11611 PyObject* sep_obj;
11612 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011613 int kind1, kind2, kind;
11614 void *buf1 = NULL, *buf2 = NULL;
11615 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011616
11617 str_obj = PyUnicode_FromObject(str_in);
Victor Stinnere9a29352011-10-01 02:14:59 +020011618 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011619 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011620 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011621 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000011622 Py_DECREF(str_obj);
11623 return NULL;
11624 }
11625
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011626 kind1 = PyUnicode_KIND(str_in);
11627 kind2 = PyUnicode_KIND(sep_obj);
11628 kind = kind1 > kind2 ? kind1 : kind2;
11629 buf1 = PyUnicode_DATA(str_in);
11630 if (kind1 != kind)
11631 buf1 = _PyUnicode_AsKind(str_in, kind);
11632 if (!buf1)
11633 goto onError;
11634 buf2 = PyUnicode_DATA(sep_obj);
11635 if (kind2 != kind)
11636 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11637 if (!buf2)
11638 goto onError;
11639 len1 = PyUnicode_GET_LENGTH(str_obj);
11640 len2 = PyUnicode_GET_LENGTH(sep_obj);
11641
11642 switch(PyUnicode_KIND(str_in)) {
11643 case PyUnicode_1BYTE_KIND:
11644 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11645 break;
11646 case PyUnicode_2BYTE_KIND:
11647 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11648 break;
11649 case PyUnicode_4BYTE_KIND:
11650 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11651 break;
11652 default:
11653 assert(0);
11654 out = 0;
11655 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011656
11657 Py_DECREF(sep_obj);
11658 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011659 if (kind1 != kind)
11660 PyMem_Free(buf1);
11661 if (kind2 != kind)
11662 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011663
11664 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011665 onError:
11666 Py_DECREF(sep_obj);
11667 Py_DECREF(str_obj);
11668 if (kind1 != kind && buf1)
11669 PyMem_Free(buf1);
11670 if (kind2 != kind && buf2)
11671 PyMem_Free(buf2);
11672 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011673}
11674
11675
11676PyObject *
11677PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
11678{
11679 PyObject* str_obj;
11680 PyObject* sep_obj;
11681 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011682 int kind1, kind2, kind;
11683 void *buf1 = NULL, *buf2 = NULL;
11684 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011685
11686 str_obj = PyUnicode_FromObject(str_in);
11687 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000011688 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011689 sep_obj = PyUnicode_FromObject(sep_in);
11690 if (!sep_obj) {
11691 Py_DECREF(str_obj);
11692 return NULL;
11693 }
11694
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011695 kind1 = PyUnicode_KIND(str_in);
11696 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020011697 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011698 buf1 = PyUnicode_DATA(str_in);
11699 if (kind1 != kind)
11700 buf1 = _PyUnicode_AsKind(str_in, kind);
11701 if (!buf1)
11702 goto onError;
11703 buf2 = PyUnicode_DATA(sep_obj);
11704 if (kind2 != kind)
11705 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11706 if (!buf2)
11707 goto onError;
11708 len1 = PyUnicode_GET_LENGTH(str_obj);
11709 len2 = PyUnicode_GET_LENGTH(sep_obj);
11710
11711 switch(PyUnicode_KIND(str_in)) {
11712 case PyUnicode_1BYTE_KIND:
11713 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11714 break;
11715 case PyUnicode_2BYTE_KIND:
11716 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11717 break;
11718 case PyUnicode_4BYTE_KIND:
11719 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11720 break;
11721 default:
11722 assert(0);
11723 out = 0;
11724 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011725
11726 Py_DECREF(sep_obj);
11727 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011728 if (kind1 != kind)
11729 PyMem_Free(buf1);
11730 if (kind2 != kind)
11731 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011732
11733 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011734 onError:
11735 Py_DECREF(sep_obj);
11736 Py_DECREF(str_obj);
11737 if (kind1 != kind && buf1)
11738 PyMem_Free(buf1);
11739 if (kind2 != kind && buf2)
11740 PyMem_Free(buf2);
11741 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011742}
11743
11744PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011745 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011746\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011747Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011748the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011749found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011750
11751static PyObject*
11752unicode_partition(PyUnicodeObject *self, PyObject *separator)
11753{
11754 return PyUnicode_Partition((PyObject *)self, separator);
11755}
11756
11757PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000011758 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011759\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011760Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011761the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011762separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011763
11764static PyObject*
11765unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
11766{
11767 return PyUnicode_RPartition((PyObject *)self, separator);
11768}
11769
Alexander Belopolsky40018472011-02-26 01:02:56 +000011770PyObject *
11771PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011772{
11773 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011774
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011775 s = PyUnicode_FromObject(s);
11776 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011777 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011778 if (sep != NULL) {
11779 sep = PyUnicode_FromObject(sep);
11780 if (sep == NULL) {
11781 Py_DECREF(s);
11782 return NULL;
11783 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011784 }
11785
11786 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
11787
11788 Py_DECREF(s);
11789 Py_XDECREF(sep);
11790 return result;
11791}
11792
11793PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011794 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011795\n\
11796Return a list of the words in S, using sep as the\n\
11797delimiter string, starting at the end of the string and\n\
11798working to the front. If maxsplit is given, at most maxsplit\n\
11799splits are done. If sep is not specified, any whitespace string\n\
11800is a separator.");
11801
11802static PyObject*
11803unicode_rsplit(PyUnicodeObject *self, PyObject *args)
11804{
11805 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011806 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011807
Martin v. Löwis18e16552006-02-15 17:27:45 +000011808 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011809 return NULL;
11810
11811 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011812 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011813 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000011814 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011815 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011816 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011817}
11818
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011819PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011820 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011821\n\
11822Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000011823Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011824is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011825
11826static PyObject*
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011827unicode_splitlines(PyUnicodeObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011828{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011829 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000011830 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011831
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011832 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
11833 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011834 return NULL;
11835
Guido van Rossum86662912000-04-11 15:38:46 +000011836 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011837}
11838
11839static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000011840PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011841{
Walter Dörwald346737f2007-05-31 10:44:43 +000011842 if (PyUnicode_CheckExact(self)) {
11843 Py_INCREF(self);
11844 return self;
11845 } else
11846 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinner034f6cf2011-09-30 02:26:44 +020011847 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011848}
11849
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011850PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011851 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011852\n\
11853Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011854and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011855
11856static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011857unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011858{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011859 return fixup(self, fixswapcase);
11860}
11861
Georg Brandlceee0772007-11-27 23:48:05 +000011862PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011863 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011864\n\
11865Return a translation table usable for str.translate().\n\
11866If there is only one argument, it must be a dictionary mapping Unicode\n\
11867ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011868Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011869If there are two arguments, they must be strings of equal length, and\n\
11870in the resulting dictionary, each character in x will be mapped to the\n\
11871character at the same position in y. If there is a third argument, it\n\
11872must be a string, whose characters will be mapped to None in the result.");
11873
11874static PyObject*
11875unicode_maketrans(PyUnicodeObject *null, PyObject *args)
11876{
11877 PyObject *x, *y = NULL, *z = NULL;
11878 PyObject *new = NULL, *key, *value;
11879 Py_ssize_t i = 0;
11880 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011881
Georg Brandlceee0772007-11-27 23:48:05 +000011882 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
11883 return NULL;
11884 new = PyDict_New();
11885 if (!new)
11886 return NULL;
11887 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011888 int x_kind, y_kind, z_kind;
11889 void *x_data, *y_data, *z_data;
11890
Georg Brandlceee0772007-11-27 23:48:05 +000011891 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000011892 if (!PyUnicode_Check(x)) {
11893 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
11894 "be a string if there is a second argument");
11895 goto err;
11896 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011897 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011898 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
11899 "arguments must have equal length");
11900 goto err;
11901 }
11902 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011903 x_kind = PyUnicode_KIND(x);
11904 y_kind = PyUnicode_KIND(y);
11905 x_data = PyUnicode_DATA(x);
11906 y_data = PyUnicode_DATA(y);
11907 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
11908 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
11909 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011910 if (!key || !value)
11911 goto err;
11912 res = PyDict_SetItem(new, key, value);
11913 Py_DECREF(key);
11914 Py_DECREF(value);
11915 if (res < 0)
11916 goto err;
11917 }
11918 /* create entries for deleting chars in z */
11919 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011920 z_kind = PyUnicode_KIND(z);
11921 z_data = PyUnicode_DATA(z);
Georg Brandlceee0772007-11-27 23:48:05 +000011922 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011923 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011924 if (!key)
11925 goto err;
11926 res = PyDict_SetItem(new, key, Py_None);
11927 Py_DECREF(key);
11928 if (res < 0)
11929 goto err;
11930 }
11931 }
11932 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011933 int kind;
11934 void *data;
11935
Georg Brandlceee0772007-11-27 23:48:05 +000011936 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000011937 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011938 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
11939 "to maketrans it must be a dict");
11940 goto err;
11941 }
11942 /* copy entries into the new dict, converting string keys to int keys */
11943 while (PyDict_Next(x, &i, &key, &value)) {
11944 if (PyUnicode_Check(key)) {
11945 /* convert string keys to integer keys */
11946 PyObject *newkey;
11947 if (PyUnicode_GET_SIZE(key) != 1) {
11948 PyErr_SetString(PyExc_ValueError, "string keys in translate "
11949 "table must be of length 1");
11950 goto err;
11951 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011952 kind = PyUnicode_KIND(key);
11953 data = PyUnicode_DATA(key);
11954 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000011955 if (!newkey)
11956 goto err;
11957 res = PyDict_SetItem(new, newkey, value);
11958 Py_DECREF(newkey);
11959 if (res < 0)
11960 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000011961 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011962 /* just keep integer keys */
11963 if (PyDict_SetItem(new, key, value) < 0)
11964 goto err;
11965 } else {
11966 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
11967 "be strings or integers");
11968 goto err;
11969 }
11970 }
11971 }
11972 return new;
11973 err:
11974 Py_DECREF(new);
11975 return NULL;
11976}
11977
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011978PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011979 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011980\n\
11981Return a copy of the string S, where all characters have been mapped\n\
11982through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011983Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000011984Unmapped characters are left untouched. Characters mapped to None\n\
11985are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011986
11987static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011988unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011989{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011990 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011991}
11992
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011993PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011994 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011995\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011996Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011997
11998static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011999unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012000{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012001 return fixup(self, fixupper);
12002}
12003
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012004PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012005 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012006\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012007Pad a numeric string S with zeros on the left, to fill a field\n\
12008of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012009
12010static PyObject *
12011unicode_zfill(PyUnicodeObject *self, PyObject *args)
12012{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012013 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012014 PyUnicodeObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012015 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012016 int kind;
12017 void *data;
12018 Py_UCS4 chr;
12019
12020 if (PyUnicode_READY(self) == -1)
12021 return NULL;
12022
Martin v. Löwis18e16552006-02-15 17:27:45 +000012023 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012024 return NULL;
12025
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012026 if (PyUnicode_GET_LENGTH(self) >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +000012027 if (PyUnicode_CheckExact(self)) {
12028 Py_INCREF(self);
12029 return (PyObject*) self;
12030 }
12031 else
Victor Stinner2219e0a2011-10-01 01:16:59 +020012032 return PyUnicode_Copy((PyObject*)self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012033 }
12034
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012035 fill = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012036
12037 u = pad(self, fill, 0, '0');
12038
Walter Dörwald068325e2002-04-15 13:36:47 +000012039 if (u == NULL)
12040 return NULL;
12041
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012042 kind = PyUnicode_KIND(u);
12043 data = PyUnicode_DATA(u);
12044 chr = PyUnicode_READ(kind, data, fill);
12045
12046 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012047 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012048 PyUnicode_WRITE(kind, data, 0, chr);
12049 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012050 }
12051
12052 return (PyObject*) u;
12053}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012054
#if 0
/* Compiled out.  Forwards self to
   PyUnicode_TransformDecimalAndSpaceToASCII() and returns its result. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
12062
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012063PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012064 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012065\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012066Return True if S starts with the specified prefix, False otherwise.\n\
12067With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012068With optional end, stop comparing S at that position.\n\
12069prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012070
12071static PyObject *
12072unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012073 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012074{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012075 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012076 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012077 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012078 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012079 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012080
Jesus Ceaac451502011-04-20 17:09:23 +020012081 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012082 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012083 if (PyTuple_Check(subobj)) {
12084 Py_ssize_t i;
12085 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12086 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012087 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012088 if (substring == NULL)
12089 return NULL;
12090 result = tailmatch(self, substring, start, end, -1);
12091 Py_DECREF(substring);
12092 if (result) {
12093 Py_RETURN_TRUE;
12094 }
12095 }
12096 /* nothing matched */
12097 Py_RETURN_FALSE;
12098 }
12099 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012100 if (substring == NULL) {
12101 if (PyErr_ExceptionMatches(PyExc_TypeError))
12102 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12103 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012104 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012105 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012106 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012107 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012108 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012109}
12110
12111
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012112PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012113 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012114\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012115Return True if S ends with the specified suffix, False otherwise.\n\
12116With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012117With optional end, stop comparing S at that position.\n\
12118suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012119
12120static PyObject *
12121unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012122 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012123{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012124 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012125 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012126 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012127 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012128 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012129
Jesus Ceaac451502011-04-20 17:09:23 +020012130 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012131 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012132 if (PyTuple_Check(subobj)) {
12133 Py_ssize_t i;
12134 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12135 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012136 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012137 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012138 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012139 result = tailmatch(self, substring, start, end, +1);
12140 Py_DECREF(substring);
12141 if (result) {
12142 Py_RETURN_TRUE;
12143 }
12144 }
12145 Py_RETURN_FALSE;
12146 }
12147 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012148 if (substring == NULL) {
12149 if (PyErr_ExceptionMatches(PyExc_TypeError))
12150 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12151 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012152 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012153 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012154 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012155 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012156 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012157}
12158
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012159#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000012160
12161PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012162 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012163\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012164Return a formatted version of S, using substitutions from args and kwargs.\n\
12165The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000012166
Eric Smith27bbca62010-11-04 17:06:58 +000012167PyDoc_STRVAR(format_map__doc__,
12168 "S.format_map(mapping) -> str\n\
12169\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012170Return a formatted version of S, using substitutions from mapping.\n\
12171The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000012172
Eric Smith4a7d76d2008-05-30 18:10:19 +000012173static PyObject *
12174unicode__format__(PyObject* self, PyObject* args)
12175{
12176 PyObject *format_spec;
12177
12178 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12179 return NULL;
12180
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012181 return _PyUnicode_FormatAdvanced(self, format_spec, 0,
12182 PyUnicode_GET_LENGTH(format_spec));
Eric Smith4a7d76d2008-05-30 18:10:19 +000012183}
12184
Eric Smith8c663262007-08-25 02:26:07 +000012185PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012186 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012187\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012188Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000012189
12190static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012191unicode__sizeof__(PyUnicodeObject *v)
12192{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012193 Py_ssize_t size;
12194
12195 /* If it's a compact object, account for base structure +
12196 character data. */
12197 if (PyUnicode_IS_COMPACT_ASCII(v))
12198 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12199 else if (PyUnicode_IS_COMPACT(v))
12200 size = sizeof(PyCompactUnicodeObject) +
12201 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_CHARACTER_SIZE(v);
12202 else {
12203 /* If it is a two-block object, account for base object, and
12204 for character block if present. */
12205 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020012206 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012207 size += (PyUnicode_GET_LENGTH(v) + 1) *
12208 PyUnicode_CHARACTER_SIZE(v);
12209 }
12210 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020012211 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020012212 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012213 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020012214 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020012215 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012216
12217 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012218}
12219
12220PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012221 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012222
12223static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020012224unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012225{
Victor Stinner034f6cf2011-09-30 02:26:44 +020012226 PyObject *copy = PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012227 if (!copy)
12228 return NULL;
12229 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012230}
12231
Guido van Rossumd57fd912000-03-10 22:53:23 +000012232static PyMethodDef unicode_methods[] = {
12233
12234 /* Order is according to common usage: often used methods should
12235 appear first, since lookup is done sequentially. */
12236
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000012237 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012238 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
12239 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012240 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012241 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
12242 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
12243 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
12244 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
12245 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
12246 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
12247 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012248 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012249 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
12250 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
12251 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012252 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012253 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
12254 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
12255 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012256 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012257 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012258 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012259 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012260 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
12261 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
12262 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
12263 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
12264 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
12265 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
12266 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
12267 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
12268 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
12269 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
12270 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
12271 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
12272 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
12273 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000012274 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000012275 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012276 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000012277 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000012278 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000012279 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000012280 {"maketrans", (PyCFunction) unicode_maketrans,
12281 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012282 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000012283#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012284 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012285#endif
12286
12287#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012288 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012289 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012290#endif
12291
Benjamin Peterson14339b62009-01-31 16:36:08 +000012292 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012293 {NULL, NULL}
12294};
12295
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012296static PyObject *
12297unicode_mod(PyObject *v, PyObject *w)
12298{
Brian Curtindfc80e32011-08-10 20:28:54 -050012299 if (!PyUnicode_Check(v))
12300 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000012301 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012302}
12303
12304static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012305 0, /*nb_add*/
12306 0, /*nb_subtract*/
12307 0, /*nb_multiply*/
12308 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012309};
12310
Guido van Rossumd57fd912000-03-10 22:53:23 +000012311static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012312 (lenfunc) unicode_length, /* sq_length */
12313 PyUnicode_Concat, /* sq_concat */
12314 (ssizeargfunc) unicode_repeat, /* sq_repeat */
12315 (ssizeargfunc) unicode_getitem, /* sq_item */
12316 0, /* sq_slice */
12317 0, /* sq_ass_item */
12318 0, /* sq_ass_slice */
12319 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012320};
12321
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012322static PyObject*
12323unicode_subscript(PyUnicodeObject* self, PyObject* item)
12324{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012325 if (PyUnicode_READY(self) == -1)
12326 return NULL;
12327
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000012328 if (PyIndex_Check(item)) {
12329 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012330 if (i == -1 && PyErr_Occurred())
12331 return NULL;
12332 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012333 i += PyUnicode_GET_LENGTH(self);
Victor Stinner2fe5ced2011-10-02 00:25:40 +020012334 return unicode_getitem((PyObject*)self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012335 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000012336 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012337 PyObject *result;
12338 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012339 int src_kind, dest_kind;
12340 Py_UCS4 ch, max_char;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012341
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012342 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000012343 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012344 return NULL;
12345 }
12346
12347 if (slicelength <= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012348 return PyUnicode_New(0, 0);
12349 } else if (start == 0 && step == 1 &&
12350 slicelength == PyUnicode_GET_LENGTH(self) &&
Thomas Woutersed03b412007-08-28 21:37:11 +000012351 PyUnicode_CheckExact(self)) {
12352 Py_INCREF(self);
12353 return (PyObject *)self;
12354 } else if (step == 1) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012355 return PyUnicode_Substring((PyObject*)self,
12356 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012357 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012358 /* General case */
12359 max_char = 127;
12360 src_kind = PyUnicode_KIND(self);
12361 src_data = PyUnicode_DATA(self);
12362 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
12363 ch = PyUnicode_READ(src_kind, src_data, cur);
12364 if (ch > max_char)
12365 max_char = ch;
12366 }
12367 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012368 if (result == NULL)
12369 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012370 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012371 dest_data = PyUnicode_DATA(result);
12372
12373 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012374 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
12375 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012376 }
12377 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012378 } else {
12379 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
12380 return NULL;
12381 }
12382}
12383
12384static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012385 (lenfunc)unicode_length, /* mp_length */
12386 (binaryfunc)unicode_subscript, /* mp_subscript */
12387 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012388};
12389
Guido van Rossumd57fd912000-03-10 22:53:23 +000012390
Guido van Rossumd57fd912000-03-10 22:53:23 +000012391/* Helpers for PyUnicode_Format() */
12392
12393static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000012394getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012395{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012396 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012397 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012398 (*p_argidx)++;
12399 if (arglen < 0)
12400 return args;
12401 else
12402 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012403 }
12404 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012405 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012406 return NULL;
12407}
12408
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012409/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012410
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012411static PyObject *
12412formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012413{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012414 char *p;
12415 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012416 double x;
Tim Petersced69f82003-09-16 20:30:58 +000012417
Guido van Rossumd57fd912000-03-10 22:53:23 +000012418 x = PyFloat_AsDouble(v);
12419 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012420 return NULL;
12421
Guido van Rossumd57fd912000-03-10 22:53:23 +000012422 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012423 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000012424
Eric Smith0923d1d2009-04-16 20:16:10 +000012425 p = PyOS_double_to_string(x, type, prec,
12426 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012427 if (p == NULL)
12428 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012429 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000012430 PyMem_Free(p);
12431 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012432}
12433
Tim Peters38fd5b62000-09-21 05:43:11 +000012434static PyObject*
12435formatlong(PyObject *val, int flags, int prec, int type)
12436{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012437 char *buf;
12438 int len;
12439 PyObject *str; /* temporary string object. */
12440 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000012441
Benjamin Peterson14339b62009-01-31 16:36:08 +000012442 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
12443 if (!str)
12444 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012445 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012446 Py_DECREF(str);
12447 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000012448}
12449
Guido van Rossumd57fd912000-03-10 22:53:23 +000012450static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012451formatchar(Py_UCS4 *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012452 size_t buflen,
12453 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012454{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000012455 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012456 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012457 if (PyUnicode_GET_LENGTH(v) == 1) {
12458 buf[0] = PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000012459 buf[1] = '\0';
12460 return 1;
12461 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012462 goto onError;
12463 }
12464 else {
12465 /* Integer input truncated to a character */
12466 long x;
12467 x = PyLong_AsLong(v);
12468 if (x == -1 && PyErr_Occurred())
12469 goto onError;
12470
12471 if (x < 0 || x > 0x10ffff) {
12472 PyErr_SetString(PyExc_OverflowError,
12473 "%c arg not in range(0x110000)");
12474 return -1;
12475 }
12476
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012477 buf[0] = (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012478 buf[1] = '\0';
12479 return 1;
12480 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000012481
Benjamin Peterson29060642009-01-31 22:14:21 +000012482 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012483 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012484 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012485 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012486}
12487
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
   FORMATBUFLEN is the length of the buffer in which chars are formatted.

   The expansion is fully parenthesized so that the macro behaves as a
   single operand wherever it is used (for example "sizeof FORMATBUFLEN"
   would not parse with a bare cast expression).
*/
#define FORMATBUFLEN ((size_t)10)
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012492
Alexander Belopolsky40018472011-02-26 01:02:56 +000012493PyObject *
12494PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012495{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012496 void *fmt;
12497 int fmtkind;
12498 PyObject *result;
12499 Py_UCS4 *res, *res0;
12500 Py_UCS4 max;
12501 int kind;
12502 Py_ssize_t fmtcnt, fmtpos, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012503 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012504 PyObject *dict = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012505 PyUnicodeObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +000012506
Guido van Rossumd57fd912000-03-10 22:53:23 +000012507 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012508 PyErr_BadInternalCall();
12509 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012510 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012511 uformat = (PyUnicodeObject*)PyUnicode_FromObject(format);
12512 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012513 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012514 fmt = PyUnicode_DATA(uformat);
12515 fmtkind = PyUnicode_KIND(uformat);
12516 fmtcnt = PyUnicode_GET_LENGTH(uformat);
12517 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012518
12519 reslen = rescnt = fmtcnt + 100;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012520 res = res0 = PyMem_Malloc(reslen * sizeof(Py_UCS4));
12521 if (res0 == NULL) {
12522 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012523 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012524 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012525
12526 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012527 arglen = PyTuple_Size(args);
12528 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012529 }
12530 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012531 arglen = -1;
12532 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012533 }
Christian Heimes90aa7642007-12-19 02:45:37 +000012534 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000012535 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000012536 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012537
12538 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012539 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012540 if (--rescnt < 0) {
12541 rescnt = fmtcnt + 100;
12542 reslen += rescnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012543 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12544 if (res0 == NULL){
12545 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012546 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012547 }
12548 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000012549 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012550 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012551 *res++ = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012552 }
12553 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012554 /* Got a format specifier */
12555 int flags = 0;
12556 Py_ssize_t width = -1;
12557 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012558 Py_UCS4 c = '\0';
12559 Py_UCS4 fill;
Benjamin Peterson29060642009-01-31 22:14:21 +000012560 int isnumok;
12561 PyObject *v = NULL;
12562 PyObject *temp = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012563 void *pbuf;
12564 Py_ssize_t pindex;
Benjamin Peterson29060642009-01-31 22:14:21 +000012565 Py_UNICODE sign;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012566 Py_ssize_t len, len1;
12567 Py_UCS4 formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012568
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012569 fmtpos++;
12570 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
12571 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000012572 Py_ssize_t keylen;
12573 PyObject *key;
12574 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000012575
Benjamin Peterson29060642009-01-31 22:14:21 +000012576 if (dict == NULL) {
12577 PyErr_SetString(PyExc_TypeError,
12578 "format requires a mapping");
12579 goto onError;
12580 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012581 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012582 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012583 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012584 /* Skip over balanced parentheses */
12585 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012586 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000012587 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012588 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000012589 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012590 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000012591 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012592 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000012593 if (fmtcnt < 0 || pcount > 0) {
12594 PyErr_SetString(PyExc_ValueError,
12595 "incomplete format key");
12596 goto onError;
12597 }
Victor Stinner12bab6d2011-10-01 01:53:49 +020012598 key = PyUnicode_Substring((PyObject*)uformat,
12599 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000012600 if (key == NULL)
12601 goto onError;
12602 if (args_owned) {
12603 Py_DECREF(args);
12604 args_owned = 0;
12605 }
12606 args = PyObject_GetItem(dict, key);
12607 Py_DECREF(key);
12608 if (args == NULL) {
12609 goto onError;
12610 }
12611 args_owned = 1;
12612 arglen = -1;
12613 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012614 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012615 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012616 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012617 case '-': flags |= F_LJUST; continue;
12618 case '+': flags |= F_SIGN; continue;
12619 case ' ': flags |= F_BLANK; continue;
12620 case '#': flags |= F_ALT; continue;
12621 case '0': flags |= F_ZERO; continue;
12622 }
12623 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012624 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012625 if (c == '*') {
12626 v = getnextarg(args, arglen, &argidx);
12627 if (v == NULL)
12628 goto onError;
12629 if (!PyLong_Check(v)) {
12630 PyErr_SetString(PyExc_TypeError,
12631 "* wants int");
12632 goto onError;
12633 }
12634 width = PyLong_AsLong(v);
12635 if (width == -1 && PyErr_Occurred())
12636 goto onError;
12637 if (width < 0) {
12638 flags |= F_LJUST;
12639 width = -width;
12640 }
12641 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012642 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012643 }
12644 else if (c >= '0' && c <= '9') {
12645 width = c - '0';
12646 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012647 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012648 if (c < '0' || c > '9')
12649 break;
12650 if ((width*10) / 10 != width) {
12651 PyErr_SetString(PyExc_ValueError,
12652 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000012653 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000012654 }
12655 width = width*10 + (c - '0');
12656 }
12657 }
12658 if (c == '.') {
12659 prec = 0;
12660 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012661 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012662 if (c == '*') {
12663 v = getnextarg(args, arglen, &argidx);
12664 if (v == NULL)
12665 goto onError;
12666 if (!PyLong_Check(v)) {
12667 PyErr_SetString(PyExc_TypeError,
12668 "* wants int");
12669 goto onError;
12670 }
12671 prec = PyLong_AsLong(v);
12672 if (prec == -1 && PyErr_Occurred())
12673 goto onError;
12674 if (prec < 0)
12675 prec = 0;
12676 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012677 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012678 }
12679 else if (c >= '0' && c <= '9') {
12680 prec = c - '0';
12681 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012682 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012683 if (c < '0' || c > '9')
12684 break;
12685 if ((prec*10) / 10 != prec) {
12686 PyErr_SetString(PyExc_ValueError,
12687 "prec too big");
12688 goto onError;
12689 }
12690 prec = prec*10 + (c - '0');
12691 }
12692 }
12693 } /* prec */
12694 if (fmtcnt >= 0) {
12695 if (c == 'h' || c == 'l' || c == 'L') {
12696 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012697 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012698 }
12699 }
12700 if (fmtcnt < 0) {
12701 PyErr_SetString(PyExc_ValueError,
12702 "incomplete format");
12703 goto onError;
12704 }
12705 if (c != '%') {
12706 v = getnextarg(args, arglen, &argidx);
12707 if (v == NULL)
12708 goto onError;
12709 }
12710 sign = 0;
12711 fill = ' ';
12712 switch (c) {
12713
12714 case '%':
12715 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012716 kind = PyUnicode_4BYTE_KIND;
Benjamin Peterson29060642009-01-31 22:14:21 +000012717 /* presume that buffer length is at least 1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012718 PyUnicode_WRITE(kind, pbuf, 0, '%');
Benjamin Peterson29060642009-01-31 22:14:21 +000012719 len = 1;
12720 break;
12721
12722 case 's':
12723 case 'r':
12724 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000012725 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012726 temp = v;
12727 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012728 }
12729 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012730 if (c == 's')
12731 temp = PyObject_Str(v);
12732 else if (c == 'r')
12733 temp = PyObject_Repr(v);
12734 else
12735 temp = PyObject_ASCII(v);
12736 if (temp == NULL)
12737 goto onError;
12738 if (PyUnicode_Check(temp))
12739 /* nothing to do */;
12740 else {
12741 Py_DECREF(temp);
12742 PyErr_SetString(PyExc_TypeError,
12743 "%s argument has non-string str()");
12744 goto onError;
12745 }
12746 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012747 if (PyUnicode_READY(temp) == -1) {
12748 Py_CLEAR(temp);
12749 goto onError;
12750 }
12751 pbuf = PyUnicode_DATA(temp);
12752 kind = PyUnicode_KIND(temp);
12753 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012754 if (prec >= 0 && len > prec)
12755 len = prec;
12756 break;
12757
12758 case 'i':
12759 case 'd':
12760 case 'u':
12761 case 'o':
12762 case 'x':
12763 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000012764 isnumok = 0;
12765 if (PyNumber_Check(v)) {
12766 PyObject *iobj=NULL;
12767
12768 if (PyLong_Check(v)) {
12769 iobj = v;
12770 Py_INCREF(iobj);
12771 }
12772 else {
12773 iobj = PyNumber_Long(v);
12774 }
12775 if (iobj!=NULL) {
12776 if (PyLong_Check(iobj)) {
12777 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070012778 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000012779 Py_DECREF(iobj);
12780 if (!temp)
12781 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012782 if (PyUnicode_READY(temp) == -1) {
12783 Py_CLEAR(temp);
12784 goto onError;
12785 }
12786 pbuf = PyUnicode_DATA(temp);
12787 kind = PyUnicode_KIND(temp);
12788 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012789 sign = 1;
12790 }
12791 else {
12792 Py_DECREF(iobj);
12793 }
12794 }
12795 }
12796 if (!isnumok) {
12797 PyErr_Format(PyExc_TypeError,
12798 "%%%c format: a number is required, "
12799 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
12800 goto onError;
12801 }
12802 if (flags & F_ZERO)
12803 fill = '0';
12804 break;
12805
12806 case 'e':
12807 case 'E':
12808 case 'f':
12809 case 'F':
12810 case 'g':
12811 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012812 temp = formatfloat(v, flags, prec, c);
12813 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000012814 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012815 if (PyUnicode_READY(temp) == -1) {
12816 Py_CLEAR(temp);
12817 goto onError;
12818 }
12819 pbuf = PyUnicode_DATA(temp);
12820 kind = PyUnicode_KIND(temp);
12821 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012822 sign = 1;
12823 if (flags & F_ZERO)
12824 fill = '0';
12825 break;
12826
12827 case 'c':
12828 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012829 kind = PyUnicode_4BYTE_KIND;
Victor Stinnerb9dcffb2011-09-29 00:39:24 +020012830 len = formatchar(pbuf, Py_ARRAY_LENGTH(formatbuf), v);
Benjamin Peterson29060642009-01-31 22:14:21 +000012831 if (len < 0)
12832 goto onError;
12833 break;
12834
12835 default:
12836 PyErr_Format(PyExc_ValueError,
12837 "unsupported format character '%c' (0x%x) "
12838 "at index %zd",
12839 (31<=c && c<=126) ? (char)c : '?',
12840 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012841 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000012842 goto onError;
12843 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012844 /* pbuf is initialized here. */
12845 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000012846 if (sign) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012847 if (PyUnicode_READ(kind, pbuf, pindex) == '-' ||
12848 PyUnicode_READ(kind, pbuf, pindex) == '+') {
12849 sign = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012850 len--;
12851 }
12852 else if (flags & F_SIGN)
12853 sign = '+';
12854 else if (flags & F_BLANK)
12855 sign = ' ';
12856 else
12857 sign = 0;
12858 }
12859 if (width < len)
12860 width = len;
12861 if (rescnt - (sign != 0) < width) {
12862 reslen -= rescnt;
12863 rescnt = width + fmtcnt + 100;
12864 reslen += rescnt;
12865 if (reslen < 0) {
12866 Py_XDECREF(temp);
12867 PyErr_NoMemory();
12868 goto onError;
12869 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012870 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12871 if (res0 == 0) {
12872 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012873 Py_XDECREF(temp);
12874 goto onError;
12875 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012876 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000012877 }
12878 if (sign) {
12879 if (fill != ' ')
12880 *res++ = sign;
12881 rescnt--;
12882 if (width > len)
12883 width--;
12884 }
12885 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012886 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12887 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000012888 if (fill != ' ') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012889 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12890 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012891 }
12892 rescnt -= 2;
12893 width -= 2;
12894 if (width < 0)
12895 width = 0;
12896 len -= 2;
12897 }
12898 if (width > len && !(flags & F_LJUST)) {
12899 do {
12900 --rescnt;
12901 *res++ = fill;
12902 } while (--width > len);
12903 }
12904 if (fill == ' ') {
12905 if (sign)
12906 *res++ = sign;
12907 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012908 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12909 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
12910 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12911 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012912 }
12913 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012914 /* Copy all characters, preserving len */
12915 len1 = len;
12916 while (len1--) {
12917 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12918 rescnt--;
12919 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012920 while (--width >= len) {
12921 --rescnt;
12922 *res++ = ' ';
12923 }
12924 if (dict && (argidx < arglen) && c != '%') {
12925 PyErr_SetString(PyExc_TypeError,
12926 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +000012927 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012928 goto onError;
12929 }
12930 Py_XDECREF(temp);
12931 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012932 } /* until end */
12933 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012934 PyErr_SetString(PyExc_TypeError,
12935 "not all arguments converted during string formatting");
12936 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012937 }
12938
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012939
12940 for (max=0, res = res0; res < res0+reslen-rescnt; res++)
12941 if (*res > max)
12942 max = *res;
12943 result = PyUnicode_New(reslen - rescnt, max);
12944 if (!result)
Benjamin Peterson29060642009-01-31 22:14:21 +000012945 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012946 kind = PyUnicode_KIND(result);
12947 for (res = res0; res < res0+reslen-rescnt; res++)
12948 PyUnicode_WRITE(kind, PyUnicode_DATA(result), res-res0, *res);
12949 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012950 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012951 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012952 }
12953 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012954 return (PyObject *)result;
12955
Benjamin Peterson29060642009-01-31 22:14:21 +000012956 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012957 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012958 Py_DECREF(uformat);
12959 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012960 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012961 }
12962 return NULL;
12963}
12964
Jeremy Hylton938ace62002-07-17 16:30:39 +000012965static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000012966unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
12967
Tim Peters6d6c1a32001-08-02 04:15:00 +000012968static PyObject *
12969unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12970{
Benjamin Peterson29060642009-01-31 22:14:21 +000012971 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012972 static char *kwlist[] = {"object", "encoding", "errors", 0};
12973 char *encoding = NULL;
12974 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000012975
Benjamin Peterson14339b62009-01-31 16:36:08 +000012976 if (type != &PyUnicode_Type)
12977 return unicode_subtype_new(type, args, kwds);
12978 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000012979 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012980 return NULL;
12981 if (x == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012982 return (PyObject *)PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012983 if (encoding == NULL && errors == NULL)
12984 return PyObject_Str(x);
12985 else
Benjamin Peterson29060642009-01-31 22:14:21 +000012986 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000012987}
12988
Guido van Rossume023fe02001-08-30 03:12:59 +000012989static PyObject *
12990unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12991{
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012992 PyUnicodeObject *unicode, *self;
12993 Py_ssize_t length, char_size;
12994 int share_wstr, share_utf8;
12995 unsigned int kind;
12996 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000012997
Benjamin Peterson14339b62009-01-31 16:36:08 +000012998 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012999
13000 unicode = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
13001 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013002 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013003 assert(_PyUnicode_CHECK(unicode));
Victor Stinnere06e1452011-10-04 20:52:31 +020013004 if (PyUnicode_READY(unicode))
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013005 return NULL;
13006
13007 self = (PyUnicodeObject *) type->tp_alloc(type, 0);
13008 if (self == NULL) {
13009 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013010 return NULL;
13011 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013012 kind = PyUnicode_KIND(unicode);
13013 length = PyUnicode_GET_LENGTH(unicode);
13014
13015 _PyUnicode_LENGTH(self) = length;
13016 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13017 _PyUnicode_STATE(self).interned = 0;
13018 _PyUnicode_STATE(self).kind = kind;
13019 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020013020 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013021 _PyUnicode_STATE(self).ready = 1;
13022 _PyUnicode_WSTR(self) = NULL;
13023 _PyUnicode_UTF8_LENGTH(self) = 0;
13024 _PyUnicode_UTF8(self) = NULL;
13025 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020013026 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013027
13028 share_utf8 = 0;
13029 share_wstr = 0;
13030 if (kind == PyUnicode_1BYTE_KIND) {
13031 char_size = 1;
13032 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
13033 share_utf8 = 1;
13034 }
13035 else if (kind == PyUnicode_2BYTE_KIND) {
13036 char_size = 2;
13037 if (sizeof(wchar_t) == 2)
13038 share_wstr = 1;
13039 }
13040 else {
13041 assert(kind == PyUnicode_4BYTE_KIND);
13042 char_size = 4;
13043 if (sizeof(wchar_t) == 4)
13044 share_wstr = 1;
13045 }
13046
13047 /* Ensure we won't overflow the length. */
13048 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
13049 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013050 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013051 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013052 data = PyObject_MALLOC((length + 1) * char_size);
13053 if (data == NULL) {
13054 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013055 goto onError;
13056 }
13057
Victor Stinnerc3c74152011-10-02 20:39:55 +020013058 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013059 if (share_utf8) {
13060 _PyUnicode_UTF8_LENGTH(self) = length;
13061 _PyUnicode_UTF8(self) = data;
13062 }
13063 if (share_wstr) {
13064 _PyUnicode_WSTR_LENGTH(self) = length;
13065 _PyUnicode_WSTR(self) = (wchar_t *)data;
13066 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013067
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013068 Py_MEMCPY(data, PyUnicode_DATA(unicode),
13069 PyUnicode_KIND_SIZE(kind, length + 1));
13070 Py_DECREF(unicode);
13071 return (PyObject *)self;
13072
13073onError:
13074 Py_DECREF(unicode);
13075 Py_DECREF(self);
13076 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000013077}
13078
/* Docstring text for the str type (defines the unicode_doc variable). */
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013079PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000013080 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000013081\n\
Collin Winterd474ce82007-08-07 19:42:11 +000013082Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000013083encoding defaults to the current default string encoding.\n\
13084errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000013085
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013086static PyObject *unicode_iter(PyObject *seq);
13087
/* Type object for str.  The initializer is positional; the comment on each
   entry names the PyTypeObject slot it fills.  Slots set to 0 are left
   unset here. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013088PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000013089 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013090 "str", /* tp_name */
13091 sizeof(PyUnicodeObject), /* tp_basicsize */
13092 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013093 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013094 (destructor)unicode_dealloc, /* tp_dealloc */
13095 0, /* tp_print */
13096 0, /* tp_getattr */
13097 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000013098 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013099 unicode_repr, /* tp_repr */
13100 &unicode_as_number, /* tp_as_number */
13101 &unicode_as_sequence, /* tp_as_sequence */
13102 &unicode_as_mapping, /* tp_as_mapping */
13103 (hashfunc) unicode_hash, /* tp_hash*/
13104 0, /* tp_call*/
13105 (reprfunc) unicode_str, /* tp_str */
13106 PyObject_GenericGetAttr, /* tp_getattro */
13107 0, /* tp_setattro */
13108 0, /* tp_as_buffer */
13109 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000013110 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013111 unicode_doc, /* tp_doc */
13112 0, /* tp_traverse */
13113 0, /* tp_clear */
13114 PyUnicode_RichCompare, /* tp_richcompare */
13115 0, /* tp_weaklistoffset */
13116 unicode_iter, /* tp_iter */
13117 0, /* tp_iternext */
13118 unicode_methods, /* tp_methods */
13119 0, /* tp_members */
13120 0, /* tp_getset */
13121 &PyBaseObject_Type, /* tp_base */
13122 0, /* tp_dict */
13123 0, /* tp_descr_get */
13124 0, /* tp_descr_set */
13125 0, /* tp_dictoffset */
13126 0, /* tp_init */
13127 0, /* tp_alloc */
13128 unicode_new, /* tp_new */
13129 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013130};
13131
13132/* Initialize the Unicode implementation */
13133
Thomas Wouters78890102000-07-22 19:25:51 +000013134void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013135{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013136 int i;
13137
Thomas Wouters477c8d52006-05-27 19:21:47 +000013138 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013139 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000013140 0x000A, /* LINE FEED */
13141 0x000D, /* CARRIAGE RETURN */
13142 0x001C, /* FILE SEPARATOR */
13143 0x001D, /* GROUP SEPARATOR */
13144 0x001E, /* RECORD SEPARATOR */
13145 0x0085, /* NEXT LINE */
13146 0x2028, /* LINE SEPARATOR */
13147 0x2029, /* PARAGRAPH SEPARATOR */
13148 };
13149
Fred Drakee4315f52000-05-09 19:53:39 +000013150 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020013151 unicode_empty = PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013152 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013153 Py_FatalError("Can't create empty string");
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013154
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013155 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000013156 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000013157 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013158 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000013159
13160 /* initialize the linebreak bloom filter */
13161 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013162 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020013163 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013164
13165 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013166}
13167
13168/* Finalize the Unicode implementation */
13169
/* Free-list clearing entry point.  This implementation does no work and
   always returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int result = 0;

    return result;
}
13175
Guido van Rossumd57fd912000-03-10 22:53:23 +000013176void
Thomas Wouters78890102000-07-22 19:25:51 +000013177_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013178{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013179 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013180
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000013181 Py_XDECREF(unicode_empty);
13182 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000013183
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013184 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013185 if (unicode_latin1[i]) {
13186 Py_DECREF(unicode_latin1[i]);
13187 unicode_latin1[i] = NULL;
13188 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013189 }
Christian Heimesa156e092008-02-16 07:38:31 +000013190 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000013191}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000013192
Walter Dörwald16807132007-05-25 13:52:07 +000013193void
13194PyUnicode_InternInPlace(PyObject **p)
13195{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013196 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
13197 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020013198#ifdef Py_DEBUG
13199 assert(s != NULL);
13200 assert(_PyUnicode_CHECK(s));
13201#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000013202 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020013203 return;
13204#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000013205 /* If it's a subclass, we don't really know what putting
13206 it in the interned dict might do. */
13207 if (!PyUnicode_CheckExact(s))
13208 return;
13209 if (PyUnicode_CHECK_INTERNED(s))
13210 return;
Victor Stinner1b4f9ce2011-10-03 13:28:14 +020013211 if (_PyUnicode_READY_REPLACE(p)) {
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013212 assert(0 && "_PyUnicode_READY_REPLACE fail in PyUnicode_InternInPlace");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013213 return;
13214 }
Victor Stinner1b4f9ce2011-10-03 13:28:14 +020013215 s = (PyUnicodeObject *)(*p);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013216 if (interned == NULL) {
13217 interned = PyDict_New();
13218 if (interned == NULL) {
13219 PyErr_Clear(); /* Don't leave an exception */
13220 return;
13221 }
13222 }
13223 /* It might be that the GetItem call fails even
13224 though the key is present in the dictionary,
13225 namely when this happens during a stack overflow. */
13226 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +000013227 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013228 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000013229
Benjamin Peterson29060642009-01-31 22:14:21 +000013230 if (t) {
13231 Py_INCREF(t);
13232 Py_DECREF(*p);
13233 *p = t;
13234 return;
13235 }
Walter Dörwald16807132007-05-25 13:52:07 +000013236
Benjamin Peterson14339b62009-01-31 16:36:08 +000013237 PyThreadState_GET()->recursion_critical = 1;
13238 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
13239 PyErr_Clear();
13240 PyThreadState_GET()->recursion_critical = 0;
13241 return;
13242 }
13243 PyThreadState_GET()->recursion_critical = 0;
13244 /* The two references in interned are not counted by refcnt.
13245 The deallocator will take care of this */
13246 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013247 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000013248}
13249
13250void
13251PyUnicode_InternImmortal(PyObject **p)
13252{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013253 PyUnicodeObject *u = (PyUnicodeObject *)*p;
13254
Benjamin Peterson14339b62009-01-31 16:36:08 +000013255 PyUnicode_InternInPlace(p);
13256 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013257 _PyUnicode_STATE(u).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013258 Py_INCREF(*p);
13259 }
Walter Dörwald16807132007-05-25 13:52:07 +000013260}
13261
13262PyObject *
13263PyUnicode_InternFromString(const char *cp)
13264{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013265 PyObject *s = PyUnicode_FromString(cp);
13266 if (s == NULL)
13267 return NULL;
13268 PyUnicode_InternInPlace(&s);
13269 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000013270}
13271
Alexander Belopolsky40018472011-02-26 01:02:56 +000013272void
13273_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000013274{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013275 PyObject *keys;
13276 PyUnicodeObject *s;
13277 Py_ssize_t i, n;
13278 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000013279
Benjamin Peterson14339b62009-01-31 16:36:08 +000013280 if (interned == NULL || !PyDict_Check(interned))
13281 return;
13282 keys = PyDict_Keys(interned);
13283 if (keys == NULL || !PyList_Check(keys)) {
13284 PyErr_Clear();
13285 return;
13286 }
Walter Dörwald16807132007-05-25 13:52:07 +000013287
Benjamin Peterson14339b62009-01-31 16:36:08 +000013288 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
13289 detector, interned unicode strings are not forcibly deallocated;
13290 rather, we give them their stolen references back, and then clear
13291 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000013292
Benjamin Peterson14339b62009-01-31 16:36:08 +000013293 n = PyList_GET_SIZE(keys);
13294 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000013295 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013296 for (i = 0; i < n; i++) {
13297 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013298 if (PyUnicode_READY(s) == -1) {
13299 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013300 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013301 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013302 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013303 case SSTATE_NOT_INTERNED:
13304 /* XXX Shouldn't happen */
13305 break;
13306 case SSTATE_INTERNED_IMMORTAL:
13307 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013308 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013309 break;
13310 case SSTATE_INTERNED_MORTAL:
13311 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013312 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013313 break;
13314 default:
13315 Py_FatalError("Inconsistent interned string state.");
13316 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013317 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013318 }
13319 fprintf(stderr, "total size of all interned strings: "
13320 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
13321 "mortal/immortal\n", mortal_size, immortal_size);
13322 Py_DECREF(keys);
13323 PyDict_Clear(interned);
13324 Py_DECREF(interned);
13325 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000013326}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013327
13328
13329/********************* Unicode Iterator **************************/
13330
/* State of an iterator over a str object. */
13331typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013332 PyObject_HEAD
13333 Py_ssize_t it_index; /* Index of the next character to return */
13334 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013335} unicodeiterobject;
13336
13337static void
13338unicodeiter_dealloc(unicodeiterobject *it)
13339{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013340 _PyObject_GC_UNTRACK(it);
13341 Py_XDECREF(it->it_seq);
13342 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013343}
13344
13345static int
13346unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
13347{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013348 Py_VISIT(it->it_seq);
13349 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013350}
13351
13352static PyObject *
13353unicodeiter_next(unicodeiterobject *it)
13354{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013355 PyUnicodeObject *seq;
13356 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013357
Benjamin Peterson14339b62009-01-31 16:36:08 +000013358 assert(it != NULL);
13359 seq = it->it_seq;
13360 if (seq == NULL)
13361 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013362 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013363
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013364 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
13365 int kind = PyUnicode_KIND(seq);
13366 void *data = PyUnicode_DATA(seq);
13367 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
13368 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013369 if (item != NULL)
13370 ++it->it_index;
13371 return item;
13372 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013373
Benjamin Peterson14339b62009-01-31 16:36:08 +000013374 Py_DECREF(seq);
13375 it->it_seq = NULL;
13376 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013377}
13378
13379static PyObject *
13380unicodeiter_len(unicodeiterobject *it)
13381{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013382 Py_ssize_t len = 0;
13383 if (it->it_seq)
13384 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
13385 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013386}
13387
/* Docstring and method table for the str iterator type: the only method
   listed is __length_hint__, implemented by unicodeiter_len(). */
13388PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
13389
13390static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013391 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000013392 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000013393 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013394};
13395
/* Type object for str iterators ("str_iterator").  The initializer is
   positional; the comment on each entry names the slot it fills.  The
   type participates in garbage collection (Py_TPFLAGS_HAVE_GC). */
13396PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013397 PyVarObject_HEAD_INIT(&PyType_Type, 0)
13398 "str_iterator", /* tp_name */
13399 sizeof(unicodeiterobject), /* tp_basicsize */
13400 0, /* tp_itemsize */
13401 /* methods */
13402 (destructor)unicodeiter_dealloc, /* tp_dealloc */
13403 0, /* tp_print */
13404 0, /* tp_getattr */
13405 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000013406 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013407 0, /* tp_repr */
13408 0, /* tp_as_number */
13409 0, /* tp_as_sequence */
13410 0, /* tp_as_mapping */
13411 0, /* tp_hash */
13412 0, /* tp_call */
13413 0, /* tp_str */
13414 PyObject_GenericGetAttr, /* tp_getattro */
13415 0, /* tp_setattro */
13416 0, /* tp_as_buffer */
13417 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
13418 0, /* tp_doc */
13419 (traverseproc)unicodeiter_traverse, /* tp_traverse */
13420 0, /* tp_clear */
13421 0, /* tp_richcompare */
13422 0, /* tp_weaklistoffset */
13423 PyObject_SelfIter, /* tp_iter */
13424 (iternextfunc)unicodeiter_next, /* tp_iternext */
13425 unicodeiter_methods, /* tp_methods */
13426 0, /* tp_members */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013427};
13428
13429static PyObject *
13430unicode_iter(PyObject *seq)
13431{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013432 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013433
Benjamin Peterson14339b62009-01-31 16:36:08 +000013434 if (!PyUnicode_Check(seq)) {
13435 PyErr_BadInternalCall();
13436 return NULL;
13437 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013438 if (PyUnicode_READY(seq) == -1)
13439 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013440 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
13441 if (it == NULL)
13442 return NULL;
13443 it->it_index = 0;
13444 Py_INCREF(seq);
13445 it->it_seq = (PyUnicodeObject *)seq;
13446 _PyObject_GC_TRACK(it);
13447 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013448}
13449
/* Include "uniops.h" twice with different macro settings: first with
   UNIOP(x) expanding to Py_UNICODE_x and UNIOP_t set to Py_UNICODE, then
   with UNIOP(x) expanding to Py_UCS4_x and UNIOP_t set to Py_UCS4.  Both
   macros are undefined again after each inclusion. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013450#define UNIOP(x) Py_UNICODE_##x
13451#define UNIOP_t Py_UNICODE
13452#include "uniops.h"
13453#undef UNIOP
13454#undef UNIOP_t
13455#define UNIOP(x) Py_UCS4_##x
13456#define UNIOP_t Py_UCS4
13457#include "uniops.h"
13458#undef UNIOP
13459#undef UNIOP_t
Victor Stinner331ea922010-08-10 16:37:20 +000013460
Victor Stinner71133ff2010-09-01 23:43:53 +000013461Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000013462PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000013463{
13464 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
13465 Py_UNICODE *copy;
13466 Py_ssize_t size;
13467
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013468 if (!PyUnicode_Check(unicode)) {
13469 PyErr_BadArgument();
13470 return NULL;
13471 }
Victor Stinner71133ff2010-09-01 23:43:53 +000013472 /* Ensure we won't overflow the size. */
13473 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
13474 PyErr_NoMemory();
13475 return NULL;
13476 }
13477 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
13478 size *= sizeof(Py_UNICODE);
13479 copy = PyMem_Malloc(size);
13480 if (copy == NULL) {
13481 PyErr_NoMemory();
13482 return NULL;
13483 }
13484 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
13485 return copy;
13486}
Martin v. Löwis5b222132007-06-10 09:51:05 +000013487
Georg Brandl66c221e2010-10-14 07:04:07 +000013488/* A _string module, to export formatter_parser and formatter_field_name_split
13489 to the string.Formatter class implemented in Python. */
13490
/* Method table of the _string module.  Both entries take a single
   argument (METH_O); the table ends with a NULL sentinel. */
13491static PyMethodDef _string_methods[] = {
13492 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
13493 METH_O, PyDoc_STR("split the argument as a field name")},
13494 {"formatter_parser", (PyCFunction) formatter_parser,
13495 METH_O, PyDoc_STR("parse the argument as a format string")},
13496 {NULL, NULL}
13497};
13498
/* Module definition for "_string"; passed to PyModule_Create() by
   PyInit__string().  The trailing NULL entries leave the remaining
   optional PyModuleDef fields unset. */
13499static struct PyModuleDef _string_module = {
13500 PyModuleDef_HEAD_INIT,
13501 "_string", /* m_name */
13502 PyDoc_STR("string helper module"), /* m_doc */
13503 0, /* m_size */
13504 _string_methods, /* m_methods */
13505 NULL,
13506 NULL,
13507 NULL,
13508 NULL
13509};
13510
13511PyMODINIT_FUNC
13512PyInit__string(void)
13513{
13514 return PyModule_Create(&_string_module);
13515}
13516
13517
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013518#ifdef __cplusplus
13519}
13520#endif