blob: 93459a7223e4a5993cc1553ce389f8e6e507b524 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000044
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000045#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000046#include <windows.h>
47#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000048
Guido van Rossumd57fd912000-03-10 22:53:23 +000049/* Limit for the Unicode object free list */
50
Christian Heimes2202f872008-02-06 14:31:34 +000051#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000052
53/* Limit for the Unicode object free list stay alive optimization.
54
55 The implementation will keep allocated Unicode memory intact for
56 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000057 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000058
Christian Heimes2202f872008-02-06 14:31:34 +000059 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000060 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000061 malloc()-overhead) bytes of unused garbage.
62
63 Setting the limit to 0 effectively turns the feature off.
64
Guido van Rossumfd4b9572000-04-10 13:51:10 +000065 Note: This is an experimental feature ! If you get core dumps when
66 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000067
68*/
69
Guido van Rossumfd4b9572000-04-10 13:51:10 +000070#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000071
72/* Endianness switches; defaults to little endian */
73
74#ifdef WORDS_BIGENDIAN
75# define BYTEORDER_IS_BIG_ENDIAN
76#else
77# define BYTEORDER_IS_LITTLE_ENDIAN
78#endif
79
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000080/* --- Globals ------------------------------------------------------------
81
82 The globals are initialized by the _PyUnicode_Init() API and should
83 not be used before calling that API.
84
85*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000086
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000087
88#ifdef __cplusplus
89extern "C" {
90#endif
91
Victor Stinner910337b2011-10-03 03:20:16 +020092#ifdef Py_DEBUG
93# define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op)
94#else
95# define _PyUnicode_CHECK(op) PyUnicode_Check(op)
96#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020097
Victor Stinnere90fe6a2011-10-01 16:48:13 +020098#define _PyUnicode_UTF8(op) \
99 (((PyCompactUnicodeObject*)(op))->utf8)
100#define PyUnicode_UTF8(op) \
Victor Stinner910337b2011-10-03 03:20:16 +0200101 (assert(_PyUnicode_CHECK(op)), \
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200102 assert(PyUnicode_IS_READY(op)), \
103 PyUnicode_IS_COMPACT_ASCII(op) ? \
104 ((char*)((PyASCIIObject*)(op) + 1)) : \
105 _PyUnicode_UTF8(op))
Victor Stinnerbc8b81b2011-09-29 19:31:34 +0200106#define _PyUnicode_UTF8_LENGTH(op) \
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200107 (((PyCompactUnicodeObject*)(op))->utf8_length)
108#define PyUnicode_UTF8_LENGTH(op) \
Victor Stinner910337b2011-10-03 03:20:16 +0200109 (assert(_PyUnicode_CHECK(op)), \
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200110 assert(PyUnicode_IS_READY(op)), \
111 PyUnicode_IS_COMPACT_ASCII(op) ? \
112 ((PyASCIIObject*)(op))->length : \
113 _PyUnicode_UTF8_LENGTH(op))
Victor Stinnera5f91632011-10-04 01:07:11 +0200114#define _PyUnicode_WSTR(op) \
115 (((PyASCIIObject*)(op))->wstr)
116#define _PyUnicode_WSTR_LENGTH(op) \
117 (((PyCompactUnicodeObject*)(op))->wstr_length)
118#define _PyUnicode_LENGTH(op) \
119 (((PyASCIIObject *)(op))->length)
120#define _PyUnicode_STATE(op) \
121 (((PyASCIIObject *)(op))->state)
122#define _PyUnicode_HASH(op) \
123 (((PyASCIIObject *)(op))->hash)
Victor Stinner910337b2011-10-03 03:20:16 +0200124#define _PyUnicode_KIND(op) \
125 (assert(_PyUnicode_CHECK(op)), \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200126 ((PyASCIIObject *)(op))->state.kind)
Victor Stinner910337b2011-10-03 03:20:16 +0200127#define _PyUnicode_GET_LENGTH(op) \
128 (assert(_PyUnicode_CHECK(op)), \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200129 ((PyASCIIObject *)(op))->length)
Victor Stinnera5f91632011-10-04 01:07:11 +0200130#define _PyUnicode_DATA_ANY(op) \
131 (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200132
Victor Stinner910337b2011-10-03 03:20:16 +0200133#undef PyUnicode_READY
134#define PyUnicode_READY(op) \
135 (assert(_PyUnicode_CHECK(op)), \
136 (PyUnicode_IS_READY(op) ? \
Victor Stinnera5f91632011-10-04 01:07:11 +0200137 0 : \
138 _PyUnicode_Ready((PyObject *)(op))))
Victor Stinner910337b2011-10-03 03:20:16 +0200139
/* Evaluate to 0 when the string *p_obj is already in the ready state;
   otherwise evaluate to the result of _PyUnicode_ReadyReplace(p_obj).
   p_obj is a pointer to the object pointer.  The argument is evaluated
   more than once, so it must not have side effects.  It is parenthesized
   before being dereferenced so that expressions such as "array + 1"
   expand correctly. */
#define _PyUnicode_READY_REPLACE(p_obj)                         \
    (assert(_PyUnicode_CHECK(*(p_obj))),                        \
     (PyUnicode_IS_READY(*(p_obj)) ?                            \
      0 : _PyUnicode_ReadyReplace((PyObject **)(p_obj))))
144
Victor Stinnerc379ead2011-10-03 12:52:27 +0200145#define _PyUnicode_SHARE_UTF8(op) \
146 (assert(_PyUnicode_CHECK(op)), \
147 assert(!PyUnicode_IS_COMPACT_ASCII(op)), \
148 (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True when the wstr pointer of op is the same address as its character
   data, i.e. both representations share one buffer.  The macro refers
   only to its own parameter, so it works whatever the caller's variable
   is called. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
152
Victor Stinner829c0ad2011-10-03 01:08:02 +0200153/* true if the Unicode object has an allocated UTF-8 memory block
154 (not shared with other data) */
Victor Stinner910337b2011-10-03 03:20:16 +0200155#define _PyUnicode_HAS_UTF8_MEMORY(op) \
156 (assert(_PyUnicode_CHECK(op)), \
157 (!PyUnicode_IS_COMPACT_ASCII(op) \
158 && _PyUnicode_UTF8(op) \
Victor Stinner829c0ad2011-10-03 01:08:02 +0200159 && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))
160
Victor Stinner03490912011-10-03 23:45:12 +0200161/* true if the Unicode object has an allocated wstr memory block
162 (not shared with other data) */
163#define _PyUnicode_HAS_WSTR_MEMORY(op) \
164 (assert(_PyUnicode_CHECK(op)), \
165 (_PyUnicode_WSTR(op) && \
166 (!PyUnicode_IS_READY(op) || \
167 _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
168
/* Copy a run of characters while converting each one to another integer
   type.  from_type and to_type must be valid type names.  begin and end
   delimit the source run (half-open, both of type "from_type *"); to
   points at the destination buffer, which is accessed as "to_type *".
   Each source character is cast to to_type, so values that do not fit
   are converted according to the usual C cast rules.  The end expression
   is re-evaluated on every iteration. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        const from_type *iter_ = (begin);               \
        to_type *to_ = (to_type *)(to);                 \
        while (iter_ < (end)) {                         \
            *to_ = (to_type)*iter_;                     \
            ++iter_;                                    \
            ++to_;                                      \
        }                                               \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200183
Victor Stinnerb15d4d82011-09-28 23:59:20 +0200184/* The Unicode string has been modified: reset the hash */
185#define _PyUnicode_DIRTY(op) do { _PyUnicode_HASH(op) = -1; } while (0)
186
Walter Dörwald16807132007-05-25 13:52:07 +0000187/* This dictionary holds all interned unicode strings. Note that references
188 to strings in this dictionary are *not* counted in the string's ob_refcnt.
189 When the interned string reaches a refcnt of 0 the string deallocation
190 function will delete the reference from this dictionary.
191
192 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000193 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000194*/
195static PyObject *interned;
196
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000197/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200198static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000199
200/* Single character Unicode strings in the Latin-1 range are being
201 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200202static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000203
/* Lookup table for fast detection of the most frequent whitespace
   characters.  It has one entry per code point in the range 0x00-0x7F;
   an entry is 1 when the code point is whitespace and 0 otherwise.
   Each row below covers 16 consecutive code points. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
       0x0B LINE TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
234
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200235/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200236static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200237static PyObject* get_latin1_char(unsigned char ch);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200238
Alexander Belopolsky40018472011-02-26 01:02:56 +0000239static PyObject *
240unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000241 PyObject **errorHandler,const char *encoding, const char *reason,
242 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
243 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
244
Alexander Belopolsky40018472011-02-26 01:02:56 +0000245static void
246raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300247 const char *encoding,
248 const Py_UNICODE *unicode, Py_ssize_t size,
249 Py_ssize_t startpos, Py_ssize_t endpos,
250 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000251
/* Lookup table for fast detection of line break characters.  It has one
   entry per code point in the range 0x00-0x7F; an entry is 1 when the
   code point is a line break and 0 otherwise.  Each row below covers 16
   consecutive code points. */
static unsigned char ascii_linebreak[] = {
    /* 0x00-0x0F: 0x0A LINE FEED, 0x0B LINE TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x2F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
279
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300280/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
281 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000282Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000283PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000284{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000285#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000286 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000287#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000288 /* This is actually an illegal character, so it should
289 not be passed to unichr. */
290 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000291#endif
292}
293
#ifdef Py_DEBUG
/* Debug-build sanity check of the internal state of a Unicode object.

   Every invariant is verified with assert(), so a violation aborts at the
   failing check.  The function itself returns 1 unconditionally.

   The checks depend on the state flags stored in the object header:
     - compact ASCII objects must be 1-byte kind and ready;
     - other compact objects must be ready, non-ASCII, of a 1/2/4-byte
       kind, and must not use the character data as their utf8 buffer;
     - non-compact objects of wchar kind must be not ready, hold only a
       wstr buffer and not be interned;
     - other non-compact objects must be ready with a data buffer, which
       doubles as the utf8 buffer exactly when the ascii flag is set.
   For every object that is not compact ASCII, the wstr/utf8 pointers and
   their cached lengths are cross-checked as well. */
static int
_PyUnicode_CheckConsistency(void *op)
{
    PyASCIIObject *base;
    PyCompactUnicodeObject *ext;
    void *chars;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    base = (PyASCIIObject *)op;
    kind = base->state.kind;

    /* Compact ASCII: nothing beyond the two state checks is verified. */
    if (base->state.ascii == 1 && base->state.compact == 1) {
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(base->state.ready == 1);
        return 1;
    }

    ext = (PyCompactUnicodeObject *)op;

    if (base->state.compact == 1) {
        /* Compact, non-ASCII: characters follow the compact header. */
        chars = ext + 1;
        assert(kind == PyUnicode_1BYTE_KIND
               || kind == PyUnicode_2BYTE_KIND
               || kind == PyUnicode_4BYTE_KIND);
        assert(base->state.ascii == 0);
        assert(base->state.ready == 1);
        assert(ext->utf8 != chars);
    }
    else {
        /* Non-compact: characters live behind the data pointer. */
        chars = ((PyUnicodeObject *)op)->data.any;
        if (kind == PyUnicode_WCHAR_KIND) {
            assert(base->state.compact == 0);
            assert(base->state.ascii == 0);
            assert(base->state.ready == 0);
            assert(base->wstr != NULL);
            assert(chars == NULL);
            assert(ext->utf8 == NULL);
            assert(base->state.interned == SSTATE_NOT_INTERNED);
        }
        else {
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(base->state.compact == 0);
            assert(base->state.ready == 1);
            assert(chars != NULL);
            if (base->state.ascii) {
                assert(ext->utf8 == chars);
                assert(ext->utf8_length == base->length);
            }
            else {
                assert(ext->utf8 != chars);
            }
        }
    }

    if (kind != PyUnicode_WCHAR_KIND) {
        /* The kind whose character width equals sizeof(wchar_t). */
#if SIZEOF_WCHAR_T == 2
        const unsigned int wchar_kind = PyUnicode_2BYTE_KIND;
#else
        const unsigned int wchar_kind = PyUnicode_4BYTE_KIND;
#endif
        if (kind == wchar_kind) {
            assert(base->wstr == chars);
            assert(ext->wstr_length == base->length);
        }
        else {
            assert(base->wstr != chars);
        }
    }

    /* A missing buffer must have a zero cached length. */
    if (ext->utf8 == NULL) {
        assert(ext->utf8_length == 0);
    }
    if (base->wstr == NULL) {
        assert(ext->wstr_length == 0);
    }
    return 1;
}
#else
/* Release builds perform no checks; the function always returns 1. */
static int
_PyUnicode_CheckConsistency(void *op)
{
    return 1;
}
#endif
379
Thomas Wouters477c8d52006-05-27 19:21:47 +0000380/* --- Bloom Filters ----------------------------------------------------- */
381
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the low
   log2(BLOOM_WIDTH) bits (5, 6 or 7, depending on LONG_BIT) of each
   Unicode character as the bit index. */
385
386/* the linebreak mask is set up by Unicode_Init below */
387
Antoine Pitrouf068f942010-01-13 14:19:12 +0000388#if LONG_BIT >= 128
389#define BLOOM_WIDTH 128
390#elif LONG_BIT >= 64
391#define BLOOM_WIDTH 64
392#elif LONG_BIT >= 32
393#define BLOOM_WIDTH 32
394#else
395#error "LONG_BIT is smaller than 32"
396#endif
397
Thomas Wouters477c8d52006-05-27 19:21:47 +0000398#define BLOOM_MASK unsigned long
399
400static BLOOM_MASK bloom_linebreak;
401
/* BLOOM_ADD sets, and BLOOM tests, the bit of mask selected by the low
   bits of ch (ch modulo BLOOM_WIDTH).  BLOOM evaluates to a non-zero
   value when the bit is set.  Both arguments are parenthesized so that
   compound expressions (e.g. "a | b") can be passed safely; for
   BLOOM_ADD, mask must be a modifiable lvalue. */
#define BLOOM_ADD(mask, ch) (((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
#define BLOOM(mask, ch)     (((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000404
Benjamin Peterson29060642009-01-31 22:14:21 +0000405#define BLOOM_LINEBREAK(ch) \
406 ((ch) < 128U ? ascii_linebreak[(ch)] : \
407 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000408
Alexander Belopolsky40018472011-02-26 01:02:56 +0000409Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200410make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000411{
412 /* calculate simple bloom-style bitmask for a given unicode string */
413
Antoine Pitrouf068f942010-01-13 14:19:12 +0000414 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000415 Py_ssize_t i;
416
417 mask = 0;
418 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200419 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000420
421 return mask;
422}
423
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200424#define BLOOM_MEMBER(mask, chr, str) \
425 (BLOOM(mask, chr) \
426 && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000427
Guido van Rossumd57fd912000-03-10 22:53:23 +0000428/* --- Unicode Object ----------------------------------------------------- */
429
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200430static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200431fixup(PyUnicodeObject *self, Py_UCS4 (*fixfct)(PyUnicodeObject *s));
432
433Py_LOCAL_INLINE(char *) findchar(void *s, int kind,
434 Py_ssize_t size, Py_UCS4 ch,
435 int direction)
436{
437 /* like wcschr, but doesn't stop at NULL characters */
438 Py_ssize_t i;
439 if (direction == 1) {
440 for(i = 0; i < size; i++)
441 if (PyUnicode_READ(kind, s, i) == ch)
442 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
443 }
444 else {
445 for(i = size-1; i >= 0; i--)
446 if (PyUnicode_READ(kind, s, i) == ch)
447 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
448 }
449 return NULL;
450}
451
Victor Stinnerfe226c02011-10-03 03:52:20 +0200452static PyObject*
453resize_compact(PyObject *unicode, Py_ssize_t length)
454{
455 Py_ssize_t char_size;
456 Py_ssize_t struct_size;
457 Py_ssize_t new_size;
458 int share_wstr;
459
460 assert(PyUnicode_IS_READY(unicode));
461 char_size = PyUnicode_CHARACTER_SIZE(unicode);
462 if (PyUnicode_IS_COMPACT_ASCII(unicode))
463 struct_size = sizeof(PyASCIIObject);
464 else
465 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200466 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200467
468 _Py_DEC_REFTOTAL;
469 _Py_ForgetReference(unicode);
470
471 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
472 PyErr_NoMemory();
473 return NULL;
474 }
475 new_size = (struct_size + (length + 1) * char_size);
476
477 unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
478 if (unicode == NULL) {
479 PyObject_Del(unicode);
480 PyErr_NoMemory();
481 return NULL;
482 }
483 _Py_NewReference(unicode);
484 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200485 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200486 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200487 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
488 _PyUnicode_WSTR_LENGTH(unicode) = length;
489 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200490 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
491 length, 0);
492 return unicode;
493}
494
Alexander Belopolsky40018472011-02-26 01:02:56 +0000495static int
Victor Stinner95663112011-10-04 01:03:50 +0200496resize_inplace(PyUnicodeObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000497{
Victor Stinner95663112011-10-04 01:03:50 +0200498 wchar_t *wstr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200499 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200500 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000501
Victor Stinner95663112011-10-04 01:03:50 +0200502 _PyUnicode_DIRTY(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200503
504 if (PyUnicode_IS_READY(unicode)) {
505 Py_ssize_t char_size;
506 Py_ssize_t new_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200507 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200508 void *data;
509
510 data = _PyUnicode_DATA_ANY(unicode);
511 assert(data != NULL);
512 char_size = PyUnicode_CHARACTER_SIZE(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200513 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
514 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinner95663112011-10-04 01:03:50 +0200515 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
516 {
517 PyObject_DEL(_PyUnicode_UTF8(unicode));
518 _PyUnicode_UTF8(unicode) = NULL;
519 _PyUnicode_UTF8_LENGTH(unicode) = 0;
520 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200521
522 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
523 PyErr_NoMemory();
524 return -1;
525 }
526 new_size = (length + 1) * char_size;
527
528 data = (PyObject *)PyObject_REALLOC(data, new_size);
529 if (data == NULL) {
530 PyErr_NoMemory();
531 return -1;
532 }
533 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200534 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200535 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200536 _PyUnicode_WSTR_LENGTH(unicode) = length;
537 }
538 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200539 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200540 _PyUnicode_UTF8_LENGTH(unicode) = length;
541 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200542 _PyUnicode_LENGTH(unicode) = length;
543 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinner95663112011-10-04 01:03:50 +0200544 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400545 _PyUnicode_CheckConsistency(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200546 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200547 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200548 }
Victor Stinner95663112011-10-04 01:03:50 +0200549 assert(_PyUnicode_WSTR(unicode) != NULL);
550
551 /* check for integer overflow */
552 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
553 PyErr_NoMemory();
554 return -1;
555 }
556 wstr = _PyUnicode_WSTR(unicode);
557 wstr = PyObject_REALLOC(wstr, sizeof(wchar_t) * (length + 1));
558 if (!wstr) {
559 PyErr_NoMemory();
560 return -1;
561 }
562 _PyUnicode_WSTR(unicode) = wstr;
563 _PyUnicode_WSTR(unicode)[length] = 0;
564 _PyUnicode_WSTR_LENGTH(unicode) = length;
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400565 _PyUnicode_CheckConsistency(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000566 return 0;
567}
568
Victor Stinnerfe226c02011-10-03 03:52:20 +0200569static PyObject*
570resize_copy(PyObject *unicode, Py_ssize_t length)
571{
572 Py_ssize_t copy_length;
573 if (PyUnicode_IS_COMPACT(unicode)) {
574 PyObject *copy;
575 assert(PyUnicode_IS_READY(unicode));
576
577 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
578 if (copy == NULL)
579 return NULL;
580
581 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
582 if (PyUnicode_CopyCharacters(copy, 0,
583 unicode, 0,
584 copy_length) < 0)
585 {
586 Py_DECREF(copy);
587 return NULL;
588 }
589 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200590 }
591 else {
Victor Stinner2fd82272011-10-03 04:06:05 +0200592 PyUnicodeObject *w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200593 assert(_PyUnicode_WSTR(unicode) != NULL);
594 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinner2fd82272011-10-03 04:06:05 +0200595 w = _PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200596 if (w == NULL)
597 return NULL;
598 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
599 copy_length = Py_MIN(copy_length, length);
600 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
601 copy_length);
602 return (PyObject*)w;
603 }
604}
605
Guido van Rossumd57fd912000-03-10 22:53:23 +0000606/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000607 Ux0000 terminated; some code (e.g. new_identifier)
608 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000609
610 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000611 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000612
613*/
614
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200615#ifdef Py_DEBUG
616int unicode_old_new_calls = 0;
617#endif
618
Alexander Belopolsky40018472011-02-26 01:02:56 +0000619static PyUnicodeObject *
620_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000621{
622 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200623 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000624
Thomas Wouters477c8d52006-05-27 19:21:47 +0000625 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000626 if (length == 0 && unicode_empty != NULL) {
627 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200628 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000629 }
630
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000631 /* Ensure we won't overflow the size. */
632 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
633 return (PyUnicodeObject *)PyErr_NoMemory();
634 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200635 if (length < 0) {
636 PyErr_SetString(PyExc_SystemError,
637 "Negative size passed to _PyUnicode_New");
638 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000639 }
640
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200641#ifdef Py_DEBUG
642 ++unicode_old_new_calls;
643#endif
644
645 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
646 if (unicode == NULL)
647 return NULL;
648 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
649 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
650 if (!_PyUnicode_WSTR(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000651 PyErr_NoMemory();
652 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000653 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200654
Jeremy Hyltond8082792003-09-16 19:41:39 +0000655 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000656 * the caller fails before initializing str -- unicode_resize()
657 * reads str[0], and the Keep-Alive optimization can keep memory
658 * allocated for str alive across a call to unicode_dealloc(unicode).
659 * We don't want unicode_resize to read uninitialized memory in
660 * that case.
661 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200662 _PyUnicode_WSTR(unicode)[0] = 0;
663 _PyUnicode_WSTR(unicode)[length] = 0;
664 _PyUnicode_WSTR_LENGTH(unicode) = length;
665 _PyUnicode_HASH(unicode) = -1;
666 _PyUnicode_STATE(unicode).interned = 0;
667 _PyUnicode_STATE(unicode).kind = 0;
668 _PyUnicode_STATE(unicode).compact = 0;
669 _PyUnicode_STATE(unicode).ready = 0;
670 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200671 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200672 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200673 _PyUnicode_UTF8(unicode) = NULL;
674 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000675 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000676
Benjamin Peterson29060642009-01-31 22:14:21 +0000677 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000678 /* XXX UNREF/NEWREF interface should be more symmetrical */
679 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000680 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000681 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000682 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000683}
684
Victor Stinnerf42dc442011-10-02 23:33:16 +0200685static const char*
686unicode_kind_name(PyObject *unicode)
687{
Victor Stinner42dfd712011-10-03 14:41:45 +0200688 /* don't check consistency: unicode_kind_name() is called from
689 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200690 if (!PyUnicode_IS_COMPACT(unicode))
691 {
692 if (!PyUnicode_IS_READY(unicode))
693 return "wstr";
694 switch(PyUnicode_KIND(unicode))
695 {
696 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200697 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200698 return "legacy ascii";
699 else
700 return "legacy latin1";
701 case PyUnicode_2BYTE_KIND:
702 return "legacy UCS2";
703 case PyUnicode_4BYTE_KIND:
704 return "legacy UCS4";
705 default:
706 return "<legacy invalid kind>";
707 }
708 }
709 assert(PyUnicode_IS_READY(unicode));
710 switch(PyUnicode_KIND(unicode))
711 {
712 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200713 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200714 return "ascii";
715 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200716 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200717 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200718 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200719 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200720 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200721 default:
722 return "<invalid compact kind>";
723 }
724}
725
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200726#ifdef Py_DEBUG
727int unicode_new_new_calls = 0;
728
729/* Functions wrapping macros for use in debugger */
/* Function form of the PyUnicode_UTF8() macro, callable from a debugger. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
733
/* Function form of the _PyUnicode_COMPACT_DATA() macro, callable from a
   debugger. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *payload = _PyUnicode_COMPACT_DATA(unicode);
    return payload;
}
737void *_PyUnicode_data(void *unicode){
738 printf("obj %p\n", unicode);
739 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
740 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
741 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
742 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
743 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
744 return PyUnicode_DATA(unicode);
745}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200746
747void
748_PyUnicode_Dump(PyObject *op)
749{
750 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200751 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
752 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
753 void *data;
754 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
755 if (ascii->state.compact)
756 data = (compact + 1);
757 else
758 data = unicode->data.any;
759 if (ascii->wstr == data)
760 printf("shared ");
761 printf("wstr=%p", ascii->wstr);
Victor Stinnera3b334d2011-10-03 13:53:37 +0200762 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +0200763 printf(" (%zu), ", compact->wstr_length);
764 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
765 printf("shared ");
766 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200767 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200768 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200769}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200770#endif
771
772PyObject *
773PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
774{
775 PyObject *obj;
776 PyCompactUnicodeObject *unicode;
777 void *data;
778 int kind_state;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200779 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200780 Py_ssize_t char_size;
781 Py_ssize_t struct_size;
782
783 /* Optimization for empty strings */
784 if (size == 0 && unicode_empty != NULL) {
785 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200786 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200787 }
788
789#ifdef Py_DEBUG
790 ++unicode_new_new_calls;
791#endif
792
Victor Stinner9e9d6892011-10-04 01:02:02 +0200793 is_ascii = 0;
794 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200795 struct_size = sizeof(PyCompactUnicodeObject);
796 if (maxchar < 128) {
797 kind_state = PyUnicode_1BYTE_KIND;
798 char_size = 1;
799 is_ascii = 1;
800 struct_size = sizeof(PyASCIIObject);
801 }
802 else if (maxchar < 256) {
803 kind_state = PyUnicode_1BYTE_KIND;
804 char_size = 1;
805 }
806 else if (maxchar < 65536) {
807 kind_state = PyUnicode_2BYTE_KIND;
808 char_size = 2;
809 if (sizeof(wchar_t) == 2)
810 is_sharing = 1;
811 }
812 else {
813 kind_state = PyUnicode_4BYTE_KIND;
814 char_size = 4;
815 if (sizeof(wchar_t) == 4)
816 is_sharing = 1;
817 }
818
819 /* Ensure we won't overflow the size. */
820 if (size < 0) {
821 PyErr_SetString(PyExc_SystemError,
822 "Negative size passed to PyUnicode_New");
823 return NULL;
824 }
825 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
826 return PyErr_NoMemory();
827
828 /* Duplicated allocation code from _PyObject_New() instead of a call to
829 * PyObject_New() so we are able to allocate space for the object and
830 * it's data buffer.
831 */
832 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
833 if (obj == NULL)
834 return PyErr_NoMemory();
835 obj = PyObject_INIT(obj, &PyUnicode_Type);
836 if (obj == NULL)
837 return NULL;
838
839 unicode = (PyCompactUnicodeObject *)obj;
840 if (is_ascii)
841 data = ((PyASCIIObject*)obj) + 1;
842 else
843 data = unicode + 1;
844 _PyUnicode_LENGTH(unicode) = size;
845 _PyUnicode_HASH(unicode) = -1;
846 _PyUnicode_STATE(unicode).interned = 0;
847 _PyUnicode_STATE(unicode).kind = kind_state;
848 _PyUnicode_STATE(unicode).compact = 1;
849 _PyUnicode_STATE(unicode).ready = 1;
850 _PyUnicode_STATE(unicode).ascii = is_ascii;
851 if (is_ascii) {
852 ((char*)data)[size] = 0;
853 _PyUnicode_WSTR(unicode) = NULL;
854 }
855 else if (kind_state == PyUnicode_1BYTE_KIND) {
856 ((char*)data)[size] = 0;
857 _PyUnicode_WSTR(unicode) = NULL;
858 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200859 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200860 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200861 }
862 else {
863 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200864 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200865 if (kind_state == PyUnicode_2BYTE_KIND)
866 ((Py_UCS2*)data)[size] = 0;
867 else /* kind_state == PyUnicode_4BYTE_KIND */
868 ((Py_UCS4*)data)[size] = 0;
869 if (is_sharing) {
870 _PyUnicode_WSTR_LENGTH(unicode) = size;
871 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
872 }
873 else {
874 _PyUnicode_WSTR_LENGTH(unicode) = 0;
875 _PyUnicode_WSTR(unicode) = NULL;
876 }
877 }
878 return obj;
879}
880
881#if SIZEOF_WCHAR_T == 2
882/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
883 will decode surrogate pairs, the other conversions are implemented as macros
884 for efficency.
885
886 This function assumes that unicode can hold one more code point than wstr
887 characters for a terminating null character. */
Victor Stinnerc53be962011-10-02 21:33:54 +0200888static void
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200889unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
890 PyUnicodeObject *unicode)
891{
892 const wchar_t *iter;
893 Py_UCS4 *ucs4_out;
894
Victor Stinner910337b2011-10-03 03:20:16 +0200895 assert(unicode != NULL);
896 assert(_PyUnicode_CHECK(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200897 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
898 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
899
900 for (iter = begin; iter < end; ) {
901 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
902 _PyUnicode_GET_LENGTH(unicode)));
903 if (*iter >= 0xD800 && *iter <= 0xDBFF
904 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
905 {
906 *ucs4_out++ = (((iter[0] & 0x3FF)<<10) | (iter[1] & 0x3FF)) + 0x10000;
907 iter += 2;
908 }
909 else {
910 *ucs4_out++ = *iter;
911 iter++;
912 }
913 }
914 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
915 _PyUnicode_GET_LENGTH(unicode)));
916
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200917}
918#endif
919
Victor Stinnercd9950f2011-10-02 00:34:53 +0200920static int
921_PyUnicode_Dirty(PyObject *unicode)
922{
Victor Stinner910337b2011-10-03 03:20:16 +0200923 assert(_PyUnicode_CHECK(unicode));
Victor Stinnercd9950f2011-10-02 00:34:53 +0200924 if (Py_REFCNT(unicode) != 1) {
Victor Stinner01698042011-10-04 00:04:26 +0200925 PyErr_SetString(PyExc_SystemError,
Victor Stinnercd9950f2011-10-02 00:34:53 +0200926 "Cannot modify a string having more than 1 reference");
927 return -1;
928 }
929 _PyUnicode_DIRTY(unicode);
930 return 0;
931}
932
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200933Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200934PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
935 PyObject *from, Py_ssize_t from_start,
936 Py_ssize_t how_many)
937{
Victor Stinnera0702ab2011-09-29 14:14:38 +0200938 unsigned int from_kind, to_kind;
939 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200940
Victor Stinnerb1536152011-09-30 02:26:10 +0200941 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
942 PyErr_BadInternalCall();
943 return -1;
944 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200945
946 if (PyUnicode_READY(from))
947 return -1;
948 if (PyUnicode_READY(to))
949 return -1;
950
Victor Stinnerff9e50f2011-09-28 22:17:19 +0200951 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200952 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
Victor Stinner01698042011-10-04 00:04:26 +0200953 PyErr_Format(PyExc_SystemError,
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200954 "Cannot write %zi characters at %zi "
955 "in a string of %zi characters",
956 how_many, to_start, PyUnicode_GET_LENGTH(to));
957 return -1;
958 }
Victor Stinnerf5ca1a22011-09-28 23:54:59 +0200959 if (how_many == 0)
960 return 0;
961
Victor Stinnercd9950f2011-10-02 00:34:53 +0200962 if (_PyUnicode_Dirty(to))
Victor Stinnerf5ca1a22011-09-28 23:54:59 +0200963 return -1;
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200964
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200965 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200966 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200967 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200968 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200969
Victor Stinnerf42dc442011-10-02 23:33:16 +0200970 if (from_kind == to_kind
971 /* deny latin1 => ascii */
Victor Stinnerb9275c12011-10-05 14:01:42 +0200972 && !(!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200973 {
Victor Stinnera0702ab2011-09-29 14:14:38 +0200974 Py_MEMCPY((char*)to_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200975 + PyUnicode_KIND_SIZE(to_kind, to_start),
Victor Stinnera0702ab2011-09-29 14:14:38 +0200976 (char*)from_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200977 + PyUnicode_KIND_SIZE(from_kind, from_start),
978 PyUnicode_KIND_SIZE(to_kind, how_many));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200979 }
Victor Stinnera0702ab2011-09-29 14:14:38 +0200980 else if (from_kind == PyUnicode_1BYTE_KIND
981 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200982 {
983 _PyUnicode_CONVERT_BYTES(
984 Py_UCS1, Py_UCS2,
985 PyUnicode_1BYTE_DATA(from) + from_start,
986 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
987 PyUnicode_2BYTE_DATA(to) + to_start
988 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200989 }
Victor Stinner157f83f2011-09-28 21:41:31 +0200990 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200991 && to_kind == PyUnicode_4BYTE_KIND)
992 {
993 _PyUnicode_CONVERT_BYTES(
994 Py_UCS1, Py_UCS4,
995 PyUnicode_1BYTE_DATA(from) + from_start,
996 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
997 PyUnicode_4BYTE_DATA(to) + to_start
998 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200999 }
1000 else if (from_kind == PyUnicode_2BYTE_KIND
1001 && to_kind == PyUnicode_4BYTE_KIND)
1002 {
1003 _PyUnicode_CONVERT_BYTES(
1004 Py_UCS2, Py_UCS4,
1005 PyUnicode_2BYTE_DATA(from) + from_start,
1006 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1007 PyUnicode_4BYTE_DATA(to) + to_start
1008 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001009 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001010 else {
1011 int invalid_kinds;
Victor Stinnerf42dc442011-10-02 23:33:16 +02001012
1013 /* check if max_char(from substring) <= max_char(to) */
1014 if (from_kind > to_kind
1015 /* latin1 => ascii */
Victor Stinnerb9275c12011-10-05 14:01:42 +02001016 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001017 {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001018 /* slow path to check for character overflow */
1019 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1020 Py_UCS4 ch, maxchar;
1021 Py_ssize_t i;
1022
1023 maxchar = 0;
1024 invalid_kinds = 0;
1025 for (i=0; i < how_many; i++) {
1026 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1027 if (ch > maxchar) {
1028 maxchar = ch;
1029 if (maxchar > to_maxchar) {
1030 invalid_kinds = 1;
1031 break;
1032 }
1033 }
1034 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1035 }
1036 }
1037 else
1038 invalid_kinds = 1;
1039 if (invalid_kinds) {
Victor Stinner01698042011-10-04 00:04:26 +02001040 PyErr_Format(PyExc_SystemError,
Victor Stinnerf42dc442011-10-02 23:33:16 +02001041 "Cannot copy %s characters "
1042 "into a string of %s characters",
1043 unicode_kind_name(from),
1044 unicode_kind_name(to));
Victor Stinnera0702ab2011-09-29 14:14:38 +02001045 return -1;
1046 }
1047 }
1048 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001049}
1050
Victor Stinner17222162011-09-28 22:15:37 +02001051/* Find the maximum code point and count the number of surrogate pairs so a
1052 correct string length can be computed before converting a string to UCS4.
1053 This function counts single surrogates as a character and not as a pair.
1054
1055 Return 0 on success, or -1 on error. */
1056static int
1057find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1058 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001059{
1060 const wchar_t *iter;
1061
Victor Stinnerc53be962011-10-02 21:33:54 +02001062 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001063 *num_surrogates = 0;
1064 *maxchar = 0;
1065
1066 for (iter = begin; iter < end; ) {
Victor Stinnerae864852011-10-05 14:02:44 +02001067 if (*iter > *maxchar) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001068 *maxchar = *iter;
Victor Stinnerae864852011-10-05 14:02:44 +02001069#if SIZEOF_WCHAR_T != 2
1070 if (*maxchar >= 0x10000)
1071 return 0;
1072#endif
1073 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001074#if SIZEOF_WCHAR_T == 2
1075 if (*iter >= 0xD800 && *iter <= 0xDBFF
1076 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
1077 {
1078 Py_UCS4 surrogate_val;
1079 surrogate_val = (((iter[0] & 0x3FF)<<10)
1080 | (iter[1] & 0x3FF)) + 0x10000;
1081 ++(*num_surrogates);
1082 if (surrogate_val > *maxchar)
1083 *maxchar = surrogate_val;
1084 iter += 2;
1085 }
1086 else
1087 iter++;
1088#else
1089 iter++;
1090#endif
1091 }
1092 return 0;
1093}
1094
1095#ifdef Py_DEBUG
1096int unicode_ready_calls = 0;
1097#endif
1098
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001099static int
1100unicode_ready(PyObject **p_obj, int replace)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001101{
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001102 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001103 wchar_t *end;
1104 Py_UCS4 maxchar = 0;
1105 Py_ssize_t num_surrogates;
1106#if SIZEOF_WCHAR_T == 2
1107 Py_ssize_t length_wo_surrogates;
1108#endif
1109
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001110 assert(p_obj != NULL);
1111 unicode = (PyUnicodeObject *)*p_obj;
1112
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001113 /* _PyUnicode_Ready() is only intented for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001114 strings were created using _PyObject_New() and where no canonical
1115 representation (the str field) has been set yet aka strings
1116 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001117 assert(_PyUnicode_CHECK(unicode));
1118 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001119 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001120 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001121 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001122 /* Actually, it should neither be interned nor be anything else: */
1123 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001124
1125#ifdef Py_DEBUG
1126 ++unicode_ready_calls;
1127#endif
1128
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001129#ifdef Py_DEBUG
1130 assert(!replace || Py_REFCNT(unicode) == 1);
1131#else
1132 if (replace && Py_REFCNT(unicode) != 1)
1133 replace = 0;
1134#endif
1135 if (replace) {
1136 Py_ssize_t len = _PyUnicode_WSTR_LENGTH(unicode);
1137 wchar_t *wstr = _PyUnicode_WSTR(unicode);
1138 /* Optimization for empty strings */
1139 if (len == 0) {
1140 Py_INCREF(unicode_empty);
1141 Py_DECREF(*p_obj);
1142 *p_obj = unicode_empty;
1143 return 0;
1144 }
1145 if (len == 1 && wstr[0] < 256) {
1146 PyObject *latin1_char = get_latin1_char((unsigned char)wstr[0]);
1147 if (latin1_char == NULL)
1148 return -1;
1149 Py_DECREF(*p_obj);
1150 *p_obj = latin1_char;
1151 return 0;
1152 }
1153 }
1154
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001155 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001156 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001157 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001158 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001159
1160 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001161 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1162 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001163 PyErr_NoMemory();
1164 return -1;
1165 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001166 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001167 _PyUnicode_WSTR(unicode), end,
1168 PyUnicode_1BYTE_DATA(unicode));
1169 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1170 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1171 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1172 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001173 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001174 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001175 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001176 }
1177 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001178 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001179 _PyUnicode_UTF8(unicode) = NULL;
1180 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001181 }
1182 PyObject_FREE(_PyUnicode_WSTR(unicode));
1183 _PyUnicode_WSTR(unicode) = NULL;
1184 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1185 }
1186 /* In this case we might have to convert down from 4-byte native
1187 wchar_t to 2-byte unicode. */
1188 else if (maxchar < 65536) {
1189 assert(num_surrogates == 0 &&
1190 "FindMaxCharAndNumSurrogatePairs() messed up");
1191
Victor Stinner506f5922011-09-28 22:34:18 +02001192#if SIZEOF_WCHAR_T == 2
1193 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001194 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001195 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1196 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1197 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001198 _PyUnicode_UTF8(unicode) = NULL;
1199 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001200#else
1201 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001202 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001203 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001204 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001205 PyErr_NoMemory();
1206 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001207 }
Victor Stinner506f5922011-09-28 22:34:18 +02001208 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1209 _PyUnicode_WSTR(unicode), end,
1210 PyUnicode_2BYTE_DATA(unicode));
1211 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1212 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1213 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001214 _PyUnicode_UTF8(unicode) = NULL;
1215 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001216 PyObject_FREE(_PyUnicode_WSTR(unicode));
1217 _PyUnicode_WSTR(unicode) = NULL;
1218 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1219#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001220 }
1221 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1222 else {
1223#if SIZEOF_WCHAR_T == 2
1224 /* in case the native representation is 2-bytes, we need to allocate a
1225 new normalized 4-byte version. */
1226 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001227 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1228 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001229 PyErr_NoMemory();
1230 return -1;
1231 }
1232 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1233 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001234 _PyUnicode_UTF8(unicode) = NULL;
1235 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001236 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1237 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001238 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001239 PyObject_FREE(_PyUnicode_WSTR(unicode));
1240 _PyUnicode_WSTR(unicode) = NULL;
1241 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1242#else
1243 assert(num_surrogates == 0);
1244
Victor Stinnerc3c74152011-10-02 20:39:55 +02001245 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001246 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001247 _PyUnicode_UTF8(unicode) = NULL;
1248 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001249 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1250#endif
1251 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1252 }
1253 _PyUnicode_STATE(unicode).ready = 1;
1254 return 0;
1255}
1256
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001257int
1258_PyUnicode_ReadyReplace(PyObject **op)
1259{
1260 return unicode_ready(op, 1);
1261}
1262
1263int
1264_PyUnicode_Ready(PyObject *op)
1265{
1266 return unicode_ready(&op, 0);
1267}
1268
Alexander Belopolsky40018472011-02-26 01:02:56 +00001269static void
1270unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001271{
Walter Dörwald16807132007-05-25 13:52:07 +00001272 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001273 case SSTATE_NOT_INTERNED:
1274 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001275
Benjamin Peterson29060642009-01-31 22:14:21 +00001276 case SSTATE_INTERNED_MORTAL:
1277 /* revive dead object temporarily for DelItem */
1278 Py_REFCNT(unicode) = 3;
1279 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
1280 Py_FatalError(
1281 "deletion of interned string failed");
1282 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001283
Benjamin Peterson29060642009-01-31 22:14:21 +00001284 case SSTATE_INTERNED_IMMORTAL:
1285 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001286
Benjamin Peterson29060642009-01-31 22:14:21 +00001287 default:
1288 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001289 }
1290
Victor Stinner03490912011-10-03 23:45:12 +02001291 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001292 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001293 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001294 PyObject_DEL(_PyUnicode_UTF8(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001295
1296 if (PyUnicode_IS_COMPACT(unicode)) {
1297 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001298 }
1299 else {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001300 if (_PyUnicode_DATA_ANY(unicode))
1301 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Benjamin Peterson29060642009-01-31 22:14:21 +00001302 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001303 }
1304}
1305
Alexander Belopolsky40018472011-02-26 01:02:56 +00001306static int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001307unicode_resizable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001308{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001309 if (Py_REFCNT(unicode) != 1)
1310 return 0;
1311 if (PyUnicode_CHECK_INTERNED(unicode))
1312 return 0;
Benjamin Peterson7f3140e2011-10-03 19:37:29 -04001313 assert(unicode != unicode_empty);
Victor Stinner77bb47b2011-10-03 20:06:05 +02001314#ifdef Py_DEBUG
1315 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND
1316 && PyUnicode_GET_LENGTH(unicode) == 1)
1317 {
1318 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001319 if (ch < 256 && unicode_latin1[ch] == unicode)
1320 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001321 }
Victor Stinner77bb47b2011-10-03 20:06:05 +02001322#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001323 return 1;
1324}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001325
Victor Stinnerfe226c02011-10-03 03:52:20 +02001326static int
1327unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1328{
1329 PyObject *unicode;
1330 Py_ssize_t old_length;
1331
1332 assert(p_unicode != NULL);
1333 unicode = *p_unicode;
1334
1335 assert(unicode != NULL);
1336 assert(PyUnicode_Check(unicode));
1337 assert(0 <= length);
1338
Victor Stinner910337b2011-10-03 03:20:16 +02001339 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001340 old_length = PyUnicode_WSTR_LENGTH(unicode);
1341 else
1342 old_length = PyUnicode_GET_LENGTH(unicode);
1343 if (old_length == length)
1344 return 0;
1345
Victor Stinnerfe226c02011-10-03 03:52:20 +02001346 if (!unicode_resizable(unicode)) {
1347 PyObject *copy = resize_copy(unicode, length);
1348 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001349 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001350 Py_DECREF(*p_unicode);
1351 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001352 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001353 }
1354
Victor Stinnerfe226c02011-10-03 03:52:20 +02001355 if (PyUnicode_IS_COMPACT(unicode)) {
1356 *p_unicode = resize_compact(unicode, length);
1357 if (*p_unicode == NULL)
1358 return -1;
Benjamin Petersonccc51c12011-10-03 19:34:12 -04001359 _PyUnicode_CheckConsistency(*p_unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001360 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001361 }
1362 return resize_inplace((PyUnicodeObject*)unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001363}
1364
Alexander Belopolsky40018472011-02-26 01:02:56 +00001365int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001366PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001367{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001368 PyObject *unicode;
1369 if (p_unicode == NULL) {
1370 PyErr_BadInternalCall();
1371 return -1;
1372 }
1373 unicode = *p_unicode;
1374 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0
1375 || _PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND)
1376 {
1377 PyErr_BadInternalCall();
1378 return -1;
1379 }
1380 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001381}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001382
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001383static PyObject*
1384get_latin1_char(unsigned char ch)
1385{
Victor Stinnera464fc12011-10-02 20:39:30 +02001386 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001387 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001388 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001389 if (!unicode)
1390 return NULL;
1391 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
1392 unicode_latin1[ch] = unicode;
1393 }
1394 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001395 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001396}
1397
Alexander Belopolsky40018472011-02-26 01:02:56 +00001398PyObject *
1399PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001400{
1401 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001402 Py_UCS4 maxchar = 0;
1403 Py_ssize_t num_surrogates;
1404
1405 if (u == NULL)
1406 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001407
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001408 /* If the Unicode data is known at construction time, we can apply
1409 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001410
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001411 /* Optimization for empty strings */
1412 if (size == 0 && unicode_empty != NULL) {
1413 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001414 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001415 }
Tim Petersced69f82003-09-16 20:30:58 +00001416
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001417 /* Single character Unicode objects in the Latin-1 range are
1418 shared when using this constructor */
1419 if (size == 1 && *u < 256)
1420 return get_latin1_char((unsigned char)*u);
1421
1422 /* If not empty and not single character, copy the Unicode data
1423 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001424 if (find_maxchar_surrogates(u, u + size,
1425 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001426 return NULL;
1427
1428 unicode = (PyUnicodeObject *) PyUnicode_New(size - num_surrogates,
1429 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001430 if (!unicode)
1431 return NULL;
1432
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001433 switch (PyUnicode_KIND(unicode)) {
1434 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001435 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001436 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1437 break;
1438 case PyUnicode_2BYTE_KIND:
1439#if Py_UNICODE_SIZE == 2
1440 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1441#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001442 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001443 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1444#endif
1445 break;
1446 case PyUnicode_4BYTE_KIND:
1447#if SIZEOF_WCHAR_T == 2
1448 /* This is the only case which has to process surrogates, thus
1449 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001450 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001451#else
1452 assert(num_surrogates == 0);
1453 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1454#endif
1455 break;
1456 default:
1457 assert(0 && "Impossible state");
1458 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001459
1460 return (PyObject *)unicode;
1461}
1462
Alexander Belopolsky40018472011-02-26 01:02:56 +00001463PyObject *
1464PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001465{
1466 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +00001467
Benjamin Peterson14339b62009-01-31 16:36:08 +00001468 if (size < 0) {
1469 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001470 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001471 return NULL;
1472 }
Christian Heimes33fe8092008-04-13 13:53:33 +00001473
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001474 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +00001475 some optimizations which share commonly used objects.
1476 Also, this means the input must be UTF-8, so fall back to the
1477 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001478 if (u != NULL) {
1479
Benjamin Peterson29060642009-01-31 22:14:21 +00001480 /* Optimization for empty strings */
1481 if (size == 0 && unicode_empty != NULL) {
1482 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001483 return unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001484 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001485
1486 /* Single characters are shared when using this constructor.
1487 Restrict to ASCII, since the input must be UTF-8. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001488 if (size == 1 && Py_CHARMASK(*u) < 128)
1489 return get_latin1_char(Py_CHARMASK(*u));
Martin v. Löwis9c121062007-08-05 20:26:11 +00001490
1491 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001492 }
1493
Walter Dörwald55507312007-05-18 13:12:10 +00001494 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001495 if (!unicode)
1496 return NULL;
1497
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001498 return (PyObject *)unicode;
1499}
1500
Alexander Belopolsky40018472011-02-26 01:02:56 +00001501PyObject *
1502PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001503{
1504 size_t size = strlen(u);
1505 if (size > PY_SSIZE_T_MAX) {
1506 PyErr_SetString(PyExc_OverflowError, "input too long");
1507 return NULL;
1508 }
1509
1510 return PyUnicode_FromStringAndSize(u, size);
1511}
1512
Victor Stinnere57b1c02011-09-28 22:20:48 +02001513static PyObject*
Victor Stinner702c7342011-10-05 13:50:52 +02001514unicode_fromascii(const unsigned char* u, Py_ssize_t size)
1515{
1516 PyObject *res = PyUnicode_New(size, 127);
1517 if (!res)
1518 return NULL;
1519 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
1520 return res;
1521}
1522
1523static PyObject*
Victor Stinnere57b1c02011-09-28 22:20:48 +02001524_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001525{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001526 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001527 unsigned char max_char = 127;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001528 Py_ssize_t i;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001529
1530 assert(size >= 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001531 for (i = 0; i < size; i++) {
1532 if (u[i] & 0x80) {
Victor Stinnerb9275c12011-10-05 14:01:42 +02001533 max_char = 255;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001534 break;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001535 }
1536 }
Victor Stinnerb9275c12011-10-05 14:01:42 +02001537 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001538 if (!res)
1539 return NULL;
1540 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
1541 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001542}
1543
Victor Stinnere57b1c02011-09-28 22:20:48 +02001544static PyObject*
1545_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001546{
1547 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001548 Py_UCS2 max_char = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001549 Py_ssize_t i;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001550
1551 assert(size >= 0);
1552 for (i = 0; i < size; i++) {
1553 if (u[i] > max_char) {
1554 max_char = u[i];
1555 if (max_char >= 256)
1556 break;
1557 }
1558 }
1559 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001560 if (!res)
1561 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001562 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001563 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
1564 else
1565 for (i = 0; i < size; i++)
1566 PyUnicode_1BYTE_DATA(res)[i] = (Py_UCS1)u[i];
1567 return res;
1568}
1569
Victor Stinnere57b1c02011-09-28 22:20:48 +02001570static PyObject*
1571_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001572{
1573 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001574 Py_UCS4 max_char = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001575 Py_ssize_t i;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001576
1577 assert(size >= 0);
1578 for (i = 0; i < size; i++) {
1579 if (u[i] > max_char) {
1580 max_char = u[i];
1581 if (max_char >= 0x10000)
1582 break;
1583 }
1584 }
1585 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001586 if (!res)
1587 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001588 if (max_char >= 0x10000)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001589 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
1590 else {
1591 int kind = PyUnicode_KIND(res);
1592 void *data = PyUnicode_DATA(res);
1593 for (i = 0; i < size; i++)
1594 PyUnicode_WRITE(kind, data, i, u[i]);
1595 }
1596 return res;
1597}
1598
1599PyObject*
1600PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1601{
1602 switch(kind) {
1603 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001604 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001605 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001606 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001607 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001608 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001609 default:
1610 assert(0 && "invalid kind");
1611 PyErr_SetString(PyExc_SystemError, "invalid kind");
1612 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001613 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001614}
1615
Victor Stinner034f6cf2011-09-30 02:26:44 +02001616PyObject*
1617PyUnicode_Copy(PyObject *unicode)
1618{
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001619 Py_ssize_t size;
1620 PyObject *copy;
1621 void *data;
1622
Victor Stinner034f6cf2011-09-30 02:26:44 +02001623 if (!PyUnicode_Check(unicode)) {
1624 PyErr_BadInternalCall();
1625 return NULL;
1626 }
1627 if (PyUnicode_READY(unicode))
1628 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001629
1630 size = PyUnicode_GET_LENGTH(unicode);
1631 copy = PyUnicode_New(size, PyUnicode_MAX_CHAR_VALUE(unicode));
1632 if (!copy)
1633 return NULL;
1634 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1635
1636 data = PyUnicode_DATA(unicode);
1637 switch (PyUnicode_KIND(unicode))
1638 {
1639 case PyUnicode_1BYTE_KIND:
1640 memcpy(PyUnicode_1BYTE_DATA(copy), data, size);
1641 break;
1642 case PyUnicode_2BYTE_KIND:
1643 memcpy(PyUnicode_2BYTE_DATA(copy), data, sizeof(Py_UCS2) * size);
1644 break;
1645 case PyUnicode_4BYTE_KIND:
1646 memcpy(PyUnicode_4BYTE_DATA(copy), data, sizeof(Py_UCS4) * size);
1647 break;
1648 default:
1649 assert(0);
1650 break;
1651 }
1652 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02001653}
1654
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001655
Victor Stinnerbc603d12011-10-02 01:00:40 +02001656/* Widen Unicode objects to larger buffers. Don't write terminating null
1657 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001658
1659void*
1660_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1661{
Victor Stinnerbc603d12011-10-02 01:00:40 +02001662 Py_ssize_t len;
1663 void *result;
1664 unsigned int skind;
1665
1666 if (PyUnicode_READY(s))
1667 return NULL;
1668
1669 len = PyUnicode_GET_LENGTH(s);
1670 skind = PyUnicode_KIND(s);
1671 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02001672 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001673 return NULL;
1674 }
1675 switch(kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02001676 case PyUnicode_2BYTE_KIND:
1677 result = PyMem_Malloc(len * sizeof(Py_UCS2));
1678 if (!result)
1679 return PyErr_NoMemory();
1680 assert(skind == PyUnicode_1BYTE_KIND);
1681 _PyUnicode_CONVERT_BYTES(
1682 Py_UCS1, Py_UCS2,
1683 PyUnicode_1BYTE_DATA(s),
1684 PyUnicode_1BYTE_DATA(s) + len,
1685 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001686 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001687 case PyUnicode_4BYTE_KIND:
1688 result = PyMem_Malloc(len * sizeof(Py_UCS4));
1689 if (!result)
1690 return PyErr_NoMemory();
1691 if (skind == PyUnicode_2BYTE_KIND) {
1692 _PyUnicode_CONVERT_BYTES(
1693 Py_UCS2, Py_UCS4,
1694 PyUnicode_2BYTE_DATA(s),
1695 PyUnicode_2BYTE_DATA(s) + len,
1696 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001697 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02001698 else {
1699 assert(skind == PyUnicode_1BYTE_KIND);
1700 _PyUnicode_CONVERT_BYTES(
1701 Py_UCS1, Py_UCS4,
1702 PyUnicode_1BYTE_DATA(s),
1703 PyUnicode_1BYTE_DATA(s) + len,
1704 result);
1705 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001706 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001707 default:
1708 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001709 }
Victor Stinner01698042011-10-04 00:04:26 +02001710 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001711 return NULL;
1712}
1713
1714static Py_UCS4*
1715as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1716 int copy_null)
1717{
1718 int kind;
1719 void *data;
1720 Py_ssize_t len, targetlen;
1721 if (PyUnicode_READY(string) == -1)
1722 return NULL;
1723 kind = PyUnicode_KIND(string);
1724 data = PyUnicode_DATA(string);
1725 len = PyUnicode_GET_LENGTH(string);
1726 targetlen = len;
1727 if (copy_null)
1728 targetlen++;
1729 if (!target) {
1730 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
1731 PyErr_NoMemory();
1732 return NULL;
1733 }
1734 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
1735 if (!target) {
1736 PyErr_NoMemory();
1737 return NULL;
1738 }
1739 }
1740 else {
1741 if (targetsize < targetlen) {
1742 PyErr_Format(PyExc_SystemError,
1743 "string is longer than the buffer");
1744 if (copy_null && 0 < targetsize)
1745 target[0] = 0;
1746 return NULL;
1747 }
1748 }
1749 if (kind != PyUnicode_4BYTE_KIND) {
1750 Py_ssize_t i;
1751 for (i = 0; i < len; i++)
1752 target[i] = PyUnicode_READ(kind, data, i);
1753 }
1754 else
1755 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
1756 if (copy_null)
1757 target[len] = 0;
1758 return target;
1759}
1760
1761Py_UCS4*
1762PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1763 int copy_null)
1764{
1765 if (target == NULL || targetsize < 1) {
1766 PyErr_BadInternalCall();
1767 return NULL;
1768 }
1769 return as_ucs4(string, target, targetsize, copy_null);
1770}
1771
1772Py_UCS4*
1773PyUnicode_AsUCS4Copy(PyObject *string)
1774{
1775 return as_ucs4(string, NULL, 0, 1);
1776}
1777
1778#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00001779
Alexander Belopolsky40018472011-02-26 01:02:56 +00001780PyObject *
1781PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001782{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001783 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +00001784 if (size == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001785 return PyUnicode_New(0, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00001786 PyErr_BadInternalCall();
1787 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001788 }
1789
Martin v. Löwis790465f2008-04-05 20:41:37 +00001790 if (size == -1) {
1791 size = wcslen(w);
1792 }
1793
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001794 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001795}
1796
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001797#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00001798
Walter Dörwald346737f2007-05-31 10:44:43 +00001799static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001800makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
1801 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00001802{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001803 *fmt++ = '%';
1804 if (width) {
1805 if (zeropad)
1806 *fmt++ = '0';
1807 fmt += sprintf(fmt, "%d", width);
1808 }
1809 if (precision)
1810 fmt += sprintf(fmt, ".%d", precision);
1811 if (longflag)
1812 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001813 else if (longlongflag) {
1814 /* longlongflag should only ever be nonzero on machines with
1815 HAVE_LONG_LONG defined */
1816#ifdef HAVE_LONG_LONG
1817 char *f = PY_FORMAT_LONG_LONG;
1818 while (*f)
1819 *fmt++ = *f++;
1820#else
1821 /* we shouldn't ever get here */
1822 assert(0);
1823 *fmt++ = 'l';
1824#endif
1825 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001826 else if (size_tflag) {
1827 char *f = PY_FORMAT_SIZE_T;
1828 while (*f)
1829 *fmt++ = *f++;
1830 }
1831 *fmt++ = c;
1832 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00001833}
1834
Victor Stinner96865452011-03-01 23:44:09 +00001835/* helper for PyUnicode_FromFormatV() */
1836
1837static const char*
1838parse_format_flags(const char *f,
1839 int *p_width, int *p_precision,
1840 int *p_longflag, int *p_longlongflag, int *p_size_tflag)
1841{
1842 int width, precision, longflag, longlongflag, size_tflag;
1843
1844 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
1845 f++;
1846 width = 0;
1847 while (Py_ISDIGIT((unsigned)*f))
1848 width = (width*10) + *f++ - '0';
1849 precision = 0;
1850 if (*f == '.') {
1851 f++;
1852 while (Py_ISDIGIT((unsigned)*f))
1853 precision = (precision*10) + *f++ - '0';
1854 if (*f == '%') {
1855 /* "%.3%s" => f points to "3" */
1856 f--;
1857 }
1858 }
1859 if (*f == '\0') {
1860 /* bogus format "%.1" => go backward, f points to "1" */
1861 f--;
1862 }
1863 if (p_width != NULL)
1864 *p_width = width;
1865 if (p_precision != NULL)
1866 *p_precision = precision;
1867
1868 /* Handle %ld, %lu, %lld and %llu. */
1869 longflag = 0;
1870 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00001871 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00001872
1873 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00001874 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00001875 longflag = 1;
1876 ++f;
1877 }
1878#ifdef HAVE_LONG_LONG
1879 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00001880 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00001881 longlongflag = 1;
1882 f += 2;
1883 }
1884#endif
1885 }
1886 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00001887 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00001888 size_tflag = 1;
1889 ++f;
1890 }
1891 if (p_longflag != NULL)
1892 *p_longflag = longflag;
1893 if (p_longlongflag != NULL)
1894 *p_longlongflag = longlongflag;
1895 if (p_size_tflag != NULL)
1896 *p_size_tflag = size_tflag;
1897 return f;
1898}
1899
/* Upper bound on the characters produced when formatting with %ld:
   21 characters cover a 64-bit integer in decimal plus an optional
   sign. */
#define MAX_LONG_CHARS 21
/* Upper bound on the characters produced when formatting with %lld.
   A long long needs at most ceil(log10(256)*SIZEOF_LONG_LONG) decimal
   digits; 53/22 is an upper bound for log10(256), and
   1 + (SIZEOF_LONG_LONG*53-1)/22 is that quotient rounded up in integer
   arithmetic.  The remaining 1 accounts for the sign. */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
1907
Walter Dörwaldd2034312007-05-18 16:29:38 +00001908PyObject *
1909PyUnicode_FromFormatV(const char *format, va_list vargs)
1910{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001911 va_list count;
1912 Py_ssize_t callcount = 0;
1913 PyObject **callresults = NULL;
1914 PyObject **callresult = NULL;
1915 Py_ssize_t n = 0;
1916 int width = 0;
1917 int precision = 0;
1918 int zeropad;
1919 const char* f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001920 PyUnicodeObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001921 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001922 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001923 Py_UCS4 maxchar = 127; /* result is ASCII by default */
1924 Py_UCS4 argmaxchar;
1925 Py_ssize_t numbersize = 0;
1926 char *numberresults = NULL;
1927 char *numberresult = NULL;
1928 Py_ssize_t i;
1929 int kind;
1930 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001931
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001932 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001933 /* step 1: count the number of %S/%R/%A/%s format specifications
1934 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
1935 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001936 * result in an array)
1937 * also esimate a upper bound for all the number formats in the string,
1938 * numbers will be formated in step 3 and be keept in a '\0'-separated
1939 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00001940 for (f = format; *f; f++) {
1941 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00001942 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001943 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
1944 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
1945 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
1946 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001947
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001948 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001949#ifdef HAVE_LONG_LONG
1950 if (longlongflag) {
1951 if (width < MAX_LONG_LONG_CHARS)
1952 width = MAX_LONG_LONG_CHARS;
1953 }
1954 else
1955#endif
1956 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
1957 including sign. Decimal takes the most space. This
1958 isn't enough for octal. If a width is specified we
1959 need more (which we allocate later). */
1960 if (width < MAX_LONG_CHARS)
1961 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001962
1963 /* account for the size + '\0' to separate numbers
1964 inside of the numberresults buffer */
1965 numbersize += (width + 1);
1966 }
1967 }
1968 else if ((unsigned char)*f > 127) {
1969 PyErr_Format(PyExc_ValueError,
1970 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
1971 "string, got a non-ASCII byte: 0x%02x",
1972 (unsigned char)*f);
1973 return NULL;
1974 }
1975 }
1976 /* step 2: allocate memory for the results of
1977 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
1978 if (callcount) {
1979 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
1980 if (!callresults) {
1981 PyErr_NoMemory();
1982 return NULL;
1983 }
1984 callresult = callresults;
1985 }
1986 /* step 2.5: allocate memory for the results of formating numbers */
1987 if (numbersize) {
1988 numberresults = PyObject_Malloc(numbersize);
1989 if (!numberresults) {
1990 PyErr_NoMemory();
1991 goto fail;
1992 }
1993 numberresult = numberresults;
1994 }
1995
1996 /* step 3: format numbers and figure out how large a buffer we need */
1997 for (f = format; *f; f++) {
1998 if (*f == '%') {
1999 const char* p;
2000 int longflag;
2001 int longlongflag;
2002 int size_tflag;
2003 int numprinted;
2004
2005 p = f;
2006 zeropad = (f[1] == '0');
2007 f = parse_format_flags(f, &width, &precision,
2008 &longflag, &longlongflag, &size_tflag);
2009 switch (*f) {
2010 case 'c':
2011 {
2012 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002013 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002014 n++;
2015 break;
2016 }
2017 case '%':
2018 n++;
2019 break;
2020 case 'i':
2021 case 'd':
2022 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2023 width, precision, *f);
2024 if (longflag)
2025 numprinted = sprintf(numberresult, fmt,
2026 va_arg(count, long));
2027#ifdef HAVE_LONG_LONG
2028 else if (longlongflag)
2029 numprinted = sprintf(numberresult, fmt,
2030 va_arg(count, PY_LONG_LONG));
2031#endif
2032 else if (size_tflag)
2033 numprinted = sprintf(numberresult, fmt,
2034 va_arg(count, Py_ssize_t));
2035 else
2036 numprinted = sprintf(numberresult, fmt,
2037 va_arg(count, int));
2038 n += numprinted;
2039 /* advance by +1 to skip over the '\0' */
2040 numberresult += (numprinted + 1);
2041 assert(*(numberresult - 1) == '\0');
2042 assert(*(numberresult - 2) != '\0');
2043 assert(numprinted >= 0);
2044 assert(numberresult <= numberresults + numbersize);
2045 break;
2046 case 'u':
2047 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2048 width, precision, 'u');
2049 if (longflag)
2050 numprinted = sprintf(numberresult, fmt,
2051 va_arg(count, unsigned long));
2052#ifdef HAVE_LONG_LONG
2053 else if (longlongflag)
2054 numprinted = sprintf(numberresult, fmt,
2055 va_arg(count, unsigned PY_LONG_LONG));
2056#endif
2057 else if (size_tflag)
2058 numprinted = sprintf(numberresult, fmt,
2059 va_arg(count, size_t));
2060 else
2061 numprinted = sprintf(numberresult, fmt,
2062 va_arg(count, unsigned int));
2063 n += numprinted;
2064 numberresult += (numprinted + 1);
2065 assert(*(numberresult - 1) == '\0');
2066 assert(*(numberresult - 2) != '\0');
2067 assert(numprinted >= 0);
2068 assert(numberresult <= numberresults + numbersize);
2069 break;
2070 case 'x':
2071 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2072 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2073 n += numprinted;
2074 numberresult += (numprinted + 1);
2075 assert(*(numberresult - 1) == '\0');
2076 assert(*(numberresult - 2) != '\0');
2077 assert(numprinted >= 0);
2078 assert(numberresult <= numberresults + numbersize);
2079 break;
2080 case 'p':
2081 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2082 /* %p is ill-defined: ensure leading 0x. */
2083 if (numberresult[1] == 'X')
2084 numberresult[1] = 'x';
2085 else if (numberresult[1] != 'x') {
2086 memmove(numberresult + 2, numberresult,
2087 strlen(numberresult) + 1);
2088 numberresult[0] = '0';
2089 numberresult[1] = 'x';
2090 numprinted += 2;
2091 }
2092 n += numprinted;
2093 numberresult += (numprinted + 1);
2094 assert(*(numberresult - 1) == '\0');
2095 assert(*(numberresult - 2) != '\0');
2096 assert(numprinted >= 0);
2097 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002098 break;
2099 case 's':
2100 {
2101 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002102 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002103 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
2104 if (!str)
2105 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002106 /* since PyUnicode_DecodeUTF8 returns already flexible
2107 unicode objects, there is no need to call ready on them */
2108 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002109 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002110 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002111 /* Remember the str and switch to the next slot */
2112 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002113 break;
2114 }
2115 case 'U':
2116 {
2117 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002118 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002119 if (PyUnicode_READY(obj) == -1)
2120 goto fail;
2121 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002122 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002123 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002124 break;
2125 }
2126 case 'V':
2127 {
2128 PyObject *obj = va_arg(count, PyObject *);
2129 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002130 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002131 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002132 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002133 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002134 if (PyUnicode_READY(obj) == -1)
2135 goto fail;
2136 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002137 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002138 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002139 *callresult++ = NULL;
2140 }
2141 else {
2142 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
2143 if (!str_obj)
2144 goto fail;
Victor Stinnere1335c72011-10-04 20:53:03 +02002145 if (PyUnicode_READY(str_obj)) {
2146 Py_DECREF(str_obj);
2147 goto fail;
2148 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002149 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002150 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002151 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002152 *callresult++ = str_obj;
2153 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002154 break;
2155 }
2156 case 'S':
2157 {
2158 PyObject *obj = va_arg(count, PyObject *);
2159 PyObject *str;
2160 assert(obj);
2161 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002162 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002163 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002164 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002165 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002166 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002167 /* Remember the str and switch to the next slot */
2168 *callresult++ = str;
2169 break;
2170 }
2171 case 'R':
2172 {
2173 PyObject *obj = va_arg(count, PyObject *);
2174 PyObject *repr;
2175 assert(obj);
2176 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002177 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002178 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002179 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002180 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002181 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002182 /* Remember the repr and switch to the next slot */
2183 *callresult++ = repr;
2184 break;
2185 }
2186 case 'A':
2187 {
2188 PyObject *obj = va_arg(count, PyObject *);
2189 PyObject *ascii;
2190 assert(obj);
2191 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002192 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002193 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002194 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002195 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002196 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002197 /* Remember the repr and switch to the next slot */
2198 *callresult++ = ascii;
2199 break;
2200 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002201 default:
2202 /* if we stumble upon an unknown
2203 formatting code, copy the rest of
2204 the format string to the output
2205 string. (we cannot just skip the
2206 code, since there's no way to know
2207 what's in the argument list) */
2208 n += strlen(p);
2209 goto expand;
2210 }
2211 } else
2212 n++;
2213 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002214 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002215 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002216 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002217 we don't have to resize the string.
2218 There can be no errors beyond this point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002219 string = (PyUnicodeObject *)PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002220 if (!string)
2221 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002222 kind = PyUnicode_KIND(string);
2223 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002224 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002225 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002226
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002227 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002228 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002229 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002230
2231 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002232 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2233 /* checking for == because the last argument could be a empty
2234 string, which causes i to point to end, the assert at the end of
2235 the loop */
2236 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002237
Benjamin Peterson14339b62009-01-31 16:36:08 +00002238 switch (*f) {
2239 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002240 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002241 const int ordinal = va_arg(vargs, int);
2242 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002243 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002244 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002245 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002246 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002247 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002248 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002249 case 'p':
2250 /* unused, since we already have the result */
2251 if (*f == 'p')
2252 (void) va_arg(vargs, void *);
2253 else
2254 (void) va_arg(vargs, int);
2255 /* extract the result from numberresults and append. */
2256 for (; *numberresult; ++i, ++numberresult)
2257 PyUnicode_WRITE(kind, data, i, *numberresult);
2258 /* skip over the separating '\0' */
2259 assert(*numberresult == '\0');
2260 numberresult++;
2261 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002262 break;
2263 case 's':
2264 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002265 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002266 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002267 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002268 size = PyUnicode_GET_LENGTH(*callresult);
2269 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002270 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2271 *callresult, 0,
2272 size) < 0)
2273 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002274 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002275 /* We're done with the unicode()/repr() => forget it */
2276 Py_DECREF(*callresult);
2277 /* switch to next unicode()/repr() result */
2278 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002279 break;
2280 }
2281 case 'U':
2282 {
2283 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002284 Py_ssize_t size;
2285 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2286 size = PyUnicode_GET_LENGTH(obj);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002287 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2288 obj, 0,
2289 size) < 0)
2290 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002291 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002292 break;
2293 }
2294 case 'V':
2295 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002296 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002297 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002298 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002299 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002300 size = PyUnicode_GET_LENGTH(obj);
2301 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002302 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2303 obj, 0,
2304 size) < 0)
2305 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002306 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002307 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002308 size = PyUnicode_GET_LENGTH(*callresult);
2309 assert(PyUnicode_KIND(*callresult) <=
2310 PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002311 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2312 *callresult,
2313 0, size) < 0)
2314 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002315 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002316 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002317 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002318 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002319 break;
2320 }
2321 case 'S':
2322 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002323 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002324 {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002325 /* unused, since we already have the result */
2326 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002327 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002328 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2329 *callresult, 0,
2330 PyUnicode_GET_LENGTH(*callresult)) < 0)
2331 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002332 i += PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002333 /* We're done with the unicode()/repr() => forget it */
2334 Py_DECREF(*callresult);
2335 /* switch to next unicode()/repr() result */
2336 ++callresult;
2337 break;
2338 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002339 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002340 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002341 break;
2342 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002343 for (; *p; ++p, ++i)
2344 PyUnicode_WRITE(kind, data, i, *p);
2345 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002346 goto end;
2347 }
Victor Stinner1205f272010-09-11 00:54:47 +00002348 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002349 else {
2350 assert(i < PyUnicode_GET_LENGTH(string));
2351 PyUnicode_WRITE(kind, data, i++, *f);
2352 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002353 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002354 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002355
Benjamin Peterson29060642009-01-31 22:14:21 +00002356 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002357 if (callresults)
2358 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002359 if (numberresults)
2360 PyObject_Free(numberresults);
2361 return (PyObject *)string;
Benjamin Peterson29060642009-01-31 22:14:21 +00002362 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002363 if (callresults) {
2364 PyObject **callresult2 = callresults;
2365 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002366 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002367 ++callresult2;
2368 }
2369 PyObject_Free(callresults);
2370 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002371 if (numberresults)
2372 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002373 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002374}
2375
Walter Dörwaldd2034312007-05-18 16:29:38 +00002376PyObject *
2377PyUnicode_FromFormat(const char *format, ...)
2378{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002379 PyObject* ret;
2380 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002381
2382#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002383 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002384#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002385 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002386#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002387 ret = PyUnicode_FromFormatV(format, vargs);
2388 va_end(vargs);
2389 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002390}
2391
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002392#ifdef HAVE_WCHAR_H
2393
Victor Stinner5593d8a2010-10-02 11:11:27 +00002394/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2395 convert a Unicode object to a wide character string.
2396
Victor Stinnerd88d9832011-09-06 02:00:05 +02002397 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002398 character) required to convert the unicode object. Ignore size argument.
2399
Victor Stinnerd88d9832011-09-06 02:00:05 +02002400 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002401 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002402 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002403static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00002404unicode_aswidechar(PyUnicodeObject *unicode,
2405 wchar_t *w,
2406 Py_ssize_t size)
2407{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002408 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002409 const wchar_t *wstr;
2410
2411 wstr = PyUnicode_AsUnicodeAndSize((PyObject *)unicode, &res);
2412 if (wstr == NULL)
2413 return -1;
2414
Victor Stinner5593d8a2010-10-02 11:11:27 +00002415 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002416 if (size > res)
2417 size = res + 1;
2418 else
2419 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002420 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002421 return res;
2422 }
2423 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002424 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002425}
2426
2427Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002428PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002429 wchar_t *w,
2430 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002431{
2432 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002433 PyErr_BadInternalCall();
2434 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002435 }
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002436 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002437}
2438
Victor Stinner137c34c2010-09-29 10:25:54 +00002439wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002440PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002441 Py_ssize_t *size)
2442{
2443 wchar_t* buffer;
2444 Py_ssize_t buflen;
2445
2446 if (unicode == NULL) {
2447 PyErr_BadInternalCall();
2448 return NULL;
2449 }
2450
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002451 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002452 if (buflen == -1)
2453 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002454 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002455 PyErr_NoMemory();
2456 return NULL;
2457 }
2458
Victor Stinner137c34c2010-09-29 10:25:54 +00002459 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2460 if (buffer == NULL) {
2461 PyErr_NoMemory();
2462 return NULL;
2463 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002464 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002465 if (buflen == -1)
2466 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002467 if (size != NULL)
2468 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002469 return buffer;
2470}
2471
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002472#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002473
Alexander Belopolsky40018472011-02-26 01:02:56 +00002474PyObject *
2475PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002476{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002477 PyObject *v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002478 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002479 PyErr_SetString(PyExc_ValueError,
2480 "chr() arg not in range(0x110000)");
2481 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002482 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002483
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002484 if (ordinal < 256)
2485 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002486
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002487 v = PyUnicode_New(1, ordinal);
2488 if (v == NULL)
2489 return NULL;
2490 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
2491 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002492}
2493
Alexander Belopolsky40018472011-02-26 01:02:56 +00002494PyObject *
2495PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002496{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002497 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002498 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002499 if (PyUnicode_CheckExact(obj)) {
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002500 if (PyUnicode_READY(obj))
2501 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002502 Py_INCREF(obj);
2503 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002504 }
2505 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002506 /* For a Unicode subtype that's not a Unicode object,
2507 return a true Unicode object with the same data. */
Victor Stinner2219e0a2011-10-01 01:16:59 +02002508 return PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002509 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002510 PyErr_Format(PyExc_TypeError,
2511 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002512 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002513 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002514}
2515
Alexander Belopolsky40018472011-02-26 01:02:56 +00002516PyObject *
2517PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002518 const char *encoding,
2519 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002520{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002521 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002522 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002523
Guido van Rossumd57fd912000-03-10 22:53:23 +00002524 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002525 PyErr_BadInternalCall();
2526 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002527 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002528
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002529 /* Decoding bytes objects is the most common case and should be fast */
2530 if (PyBytes_Check(obj)) {
2531 if (PyBytes_GET_SIZE(obj) == 0) {
2532 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002533 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002534 }
2535 else {
2536 v = PyUnicode_Decode(
2537 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2538 encoding, errors);
2539 }
2540 return v;
2541 }
2542
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002543 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002544 PyErr_SetString(PyExc_TypeError,
2545 "decoding str is not supported");
2546 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002547 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002548
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002549 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2550 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2551 PyErr_Format(PyExc_TypeError,
2552 "coercing to str: need bytes, bytearray "
2553 "or buffer-like object, %.80s found",
2554 Py_TYPE(obj)->tp_name);
2555 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002556 }
Tim Petersced69f82003-09-16 20:30:58 +00002557
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002558 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002559 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002560 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002561 }
Tim Petersced69f82003-09-16 20:30:58 +00002562 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002563 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002564
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002565 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002566 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002567}
2568
/* Copy encoding into lower, folding upper case letters to lower case and
   turning '_' into '-' so that spellings such as UTF_8 are recognised.
   lower has room for lower_len chars including the terminating null.
   Return 1 on success, 0 if encoding does not fit (it is longer than
   lower_len-1); in that case lower is left without a terminating null. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src = encoding;
    char *dst = lower;
    /* Last slot of the output buffer, reserved for the null character. */
    char *const limit = lower + (lower_len - 1);

    for (; *src != '\0'; src++) {
        char ch;

        if (dst == limit)
            return 0;

        if (Py_ISUPPER(*src))
            ch = Py_TOLOWER(*src);
        else if (*src == '_')
            ch = '-';
        else
            ch = *src;
        *dst++ = ch;
    }
    *dst = '\0';
    return 1;
}
2601
Alexander Belopolsky40018472011-02-26 01:02:56 +00002602PyObject *
2603PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002604 Py_ssize_t size,
2605 const char *encoding,
2606 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002607{
2608 PyObject *buffer = NULL, *unicode;
2609 Py_buffer info;
2610 char lower[11]; /* Enough for any encoding shortcut */
2611
2612 if (encoding == NULL)
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002613 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00002614
2615 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002616 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002617 if ((strcmp(lower, "utf-8") == 0) ||
2618 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002619 return PyUnicode_DecodeUTF8(s, size, errors);
2620 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002621 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002622 (strcmp(lower, "iso-8859-1") == 0))
2623 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002624#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002625 else if (strcmp(lower, "mbcs") == 0)
2626 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002627#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002628 else if (strcmp(lower, "ascii") == 0)
2629 return PyUnicode_DecodeASCII(s, size, errors);
2630 else if (strcmp(lower, "utf-16") == 0)
2631 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2632 else if (strcmp(lower, "utf-32") == 0)
2633 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2634 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002635
2636 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002637 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002638 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002639 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002640 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002641 if (buffer == NULL)
2642 goto onError;
2643 unicode = PyCodec_Decode(buffer, encoding, errors);
2644 if (unicode == NULL)
2645 goto onError;
2646 if (!PyUnicode_Check(unicode)) {
2647 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002648 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002649 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002650 Py_DECREF(unicode);
2651 goto onError;
2652 }
2653 Py_DECREF(buffer);
Victor Stinner17efeed2011-10-04 20:05:46 +02002654#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02002655 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002656 Py_DECREF(unicode);
2657 return NULL;
2658 }
Victor Stinner17efeed2011-10-04 20:05:46 +02002659#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002660 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002661
Benjamin Peterson29060642009-01-31 22:14:21 +00002662 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002663 Py_XDECREF(buffer);
2664 return NULL;
2665}
2666
Alexander Belopolsky40018472011-02-26 01:02:56 +00002667PyObject *
2668PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002669 const char *encoding,
2670 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002671{
2672 PyObject *v;
2673
2674 if (!PyUnicode_Check(unicode)) {
2675 PyErr_BadArgument();
2676 goto onError;
2677 }
2678
2679 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002680 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002681
2682 /* Decode via the codec registry */
2683 v = PyCodec_Decode(unicode, encoding, errors);
2684 if (v == NULL)
2685 goto onError;
2686 return v;
2687
Benjamin Peterson29060642009-01-31 22:14:21 +00002688 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002689 return NULL;
2690}
2691
Alexander Belopolsky40018472011-02-26 01:02:56 +00002692PyObject *
2693PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002694 const char *encoding,
2695 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002696{
2697 PyObject *v;
2698
2699 if (!PyUnicode_Check(unicode)) {
2700 PyErr_BadArgument();
2701 goto onError;
2702 }
2703
2704 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002705 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002706
2707 /* Decode via the codec registry */
2708 v = PyCodec_Decode(unicode, encoding, errors);
2709 if (v == NULL)
2710 goto onError;
2711 if (!PyUnicode_Check(v)) {
2712 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002713 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002714 Py_TYPE(v)->tp_name);
2715 Py_DECREF(v);
2716 goto onError;
2717 }
2718 return v;
2719
Benjamin Peterson29060642009-01-31 22:14:21 +00002720 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002721 return NULL;
2722}
2723
Alexander Belopolsky40018472011-02-26 01:02:56 +00002724PyObject *
2725PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002726 Py_ssize_t size,
2727 const char *encoding,
2728 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002729{
2730 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002731
Guido van Rossumd57fd912000-03-10 22:53:23 +00002732 unicode = PyUnicode_FromUnicode(s, size);
2733 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002734 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002735 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
2736 Py_DECREF(unicode);
2737 return v;
2738}
2739
Alexander Belopolsky40018472011-02-26 01:02:56 +00002740PyObject *
2741PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002742 const char *encoding,
2743 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002744{
2745 PyObject *v;
2746
2747 if (!PyUnicode_Check(unicode)) {
2748 PyErr_BadArgument();
2749 goto onError;
2750 }
2751
2752 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002753 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002754
2755 /* Encode via the codec registry */
2756 v = PyCodec_Encode(unicode, encoding, errors);
2757 if (v == NULL)
2758 goto onError;
2759 return v;
2760
Benjamin Peterson29060642009-01-31 22:14:21 +00002761 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002762 return NULL;
2763}
2764
Victor Stinnerad158722010-10-27 00:25:46 +00002765PyObject *
2766PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00002767{
Victor Stinner99b95382011-07-04 14:23:54 +02002768#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002769 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2770 PyUnicode_GET_SIZE(unicode),
2771 NULL);
2772#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002773 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00002774#else
Victor Stinner793b5312011-04-27 00:24:21 +02002775 PyInterpreterState *interp = PyThreadState_GET()->interp;
2776 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2777 cannot use it to encode and decode filenames before it is loaded. Load
2778 the Python codec requires to encode at least its own filename. Use the C
2779 version of the locale codec until the codec registry is initialized and
2780 the Python codec is loaded.
2781
2782 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2783 cannot only rely on it: check also interp->fscodec_initialized for
2784 subinterpreters. */
2785 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00002786 return PyUnicode_AsEncodedString(unicode,
2787 Py_FileSystemDefaultEncoding,
2788 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00002789 }
2790 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002791 /* locale encoding with surrogateescape */
2792 wchar_t *wchar;
2793 char *bytes;
2794 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00002795 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002796
2797 wchar = PyUnicode_AsWideCharString(unicode, NULL);
2798 if (wchar == NULL)
2799 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002800 bytes = _Py_wchar2char(wchar, &error_pos);
2801 if (bytes == NULL) {
2802 if (error_pos != (size_t)-1) {
2803 char *errmsg = strerror(errno);
2804 PyObject *exc = NULL;
2805 if (errmsg == NULL)
2806 errmsg = "Py_wchar2char() failed";
2807 raise_encode_exception(&exc,
2808 "filesystemencoding",
2809 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
2810 error_pos, error_pos+1,
2811 errmsg);
2812 Py_XDECREF(exc);
2813 }
2814 else
2815 PyErr_NoMemory();
2816 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002817 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002818 }
2819 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002820
2821 bytes_obj = PyBytes_FromString(bytes);
2822 PyMem_Free(bytes);
2823 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00002824 }
Victor Stinnerad158722010-10-27 00:25:46 +00002825#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00002826}
2827
Alexander Belopolsky40018472011-02-26 01:02:56 +00002828PyObject *
2829PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002830 const char *encoding,
2831 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002832{
2833 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00002834 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00002835
Guido van Rossumd57fd912000-03-10 22:53:23 +00002836 if (!PyUnicode_Check(unicode)) {
2837 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002838 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002839 }
Fred Drakee4315f52000-05-09 19:53:39 +00002840
Victor Stinner2f283c22011-03-02 01:21:46 +00002841 if (encoding == NULL) {
2842 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002843 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002844 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002845 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner2f283c22011-03-02 01:21:46 +00002846 }
Fred Drakee4315f52000-05-09 19:53:39 +00002847
2848 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002849 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002850 if ((strcmp(lower, "utf-8") == 0) ||
2851 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00002852 {
Victor Stinner2f283c22011-03-02 01:21:46 +00002853 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002854 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002855 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002856 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00002857 }
Victor Stinner37296e82010-06-10 13:36:23 +00002858 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002859 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002860 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002861 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002862#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002863 else if (strcmp(lower, "mbcs") == 0)
2864 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2865 PyUnicode_GET_SIZE(unicode),
2866 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002867#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002868 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002869 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00002870 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002871
2872 /* Encode via the codec registry */
2873 v = PyCodec_Encode(unicode, encoding, errors);
2874 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002875 return NULL;
2876
2877 /* The normal path */
2878 if (PyBytes_Check(v))
2879 return v;
2880
2881 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002882 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002883 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002884 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002885
2886 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
2887 "encoder %s returned bytearray instead of bytes",
2888 encoding);
2889 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002890 Py_DECREF(v);
2891 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002892 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002893
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002894 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
2895 Py_DECREF(v);
2896 return b;
2897 }
2898
2899 PyErr_Format(PyExc_TypeError,
2900 "encoder did not return a bytes object (type=%.400s)",
2901 Py_TYPE(v)->tp_name);
2902 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002903 return NULL;
2904}
2905
Alexander Belopolsky40018472011-02-26 01:02:56 +00002906PyObject *
2907PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002908 const char *encoding,
2909 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002910{
2911 PyObject *v;
2912
2913 if (!PyUnicode_Check(unicode)) {
2914 PyErr_BadArgument();
2915 goto onError;
2916 }
2917
2918 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002919 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002920
2921 /* Encode via the codec registry */
2922 v = PyCodec_Encode(unicode, encoding, errors);
2923 if (v == NULL)
2924 goto onError;
2925 if (!PyUnicode_Check(v)) {
2926 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002927 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002928 Py_TYPE(v)->tp_name);
2929 Py_DECREF(v);
2930 goto onError;
2931 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002932 return v;
Tim Petersced69f82003-09-16 20:30:58 +00002933
Benjamin Peterson29060642009-01-31 22:14:21 +00002934 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002935 return NULL;
2936}
2937
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002938PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00002939PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002940 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00002941 return PyUnicode_DecodeFSDefaultAndSize(s, size);
2942}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002943
Christian Heimes5894ba72007-11-04 11:43:14 +00002944PyObject*
2945PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
2946{
Victor Stinner99b95382011-07-04 14:23:54 +02002947#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002948 return PyUnicode_DecodeMBCS(s, size, NULL);
2949#elif defined(__APPLE__)
2950 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
2951#else
Victor Stinner793b5312011-04-27 00:24:21 +02002952 PyInterpreterState *interp = PyThreadState_GET()->interp;
2953 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2954 cannot use it to encode and decode filenames before it is loaded. Load
2955 the Python codec requires to encode at least its own filename. Use the C
2956 version of the locale codec until the codec registry is initialized and
2957 the Python codec is loaded.
2958
2959 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2960 cannot only rely on it: check also interp->fscodec_initialized for
2961 subinterpreters. */
2962 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002963 return PyUnicode_Decode(s, size,
2964 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00002965 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002966 }
2967 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002968 /* locale encoding with surrogateescape */
2969 wchar_t *wchar;
2970 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00002971 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002972
2973 if (s[size] != '\0' || size != strlen(s)) {
2974 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
2975 return NULL;
2976 }
2977
Victor Stinner168e1172010-10-16 23:16:16 +00002978 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002979 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00002980 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002981
Victor Stinner168e1172010-10-16 23:16:16 +00002982 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002983 PyMem_Free(wchar);
2984 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002985 }
Victor Stinnerad158722010-10-27 00:25:46 +00002986#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002987}
2988
Martin v. Löwis011e8422009-05-05 04:43:17 +00002989
2990int
2991PyUnicode_FSConverter(PyObject* arg, void* addr)
2992{
2993 PyObject *output = NULL;
2994 Py_ssize_t size;
2995 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00002996 if (arg == NULL) {
2997 Py_DECREF(*(PyObject**)addr);
2998 return 1;
2999 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003000 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003001 output = arg;
3002 Py_INCREF(output);
3003 }
3004 else {
3005 arg = PyUnicode_FromObject(arg);
3006 if (!arg)
3007 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003008 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003009 Py_DECREF(arg);
3010 if (!output)
3011 return 0;
3012 if (!PyBytes_Check(output)) {
3013 Py_DECREF(output);
3014 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3015 return 0;
3016 }
3017 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003018 size = PyBytes_GET_SIZE(output);
3019 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003020 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003021 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003022 Py_DECREF(output);
3023 return 0;
3024 }
3025 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003026 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003027}
3028
3029
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003030int
3031PyUnicode_FSDecoder(PyObject* arg, void* addr)
3032{
3033 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003034 if (arg == NULL) {
3035 Py_DECREF(*(PyObject**)addr);
3036 return 1;
3037 }
3038 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003039 if (PyUnicode_READY(arg))
3040 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003041 output = arg;
3042 Py_INCREF(output);
3043 }
3044 else {
3045 arg = PyBytes_FromObject(arg);
3046 if (!arg)
3047 return 0;
3048 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3049 PyBytes_GET_SIZE(arg));
3050 Py_DECREF(arg);
3051 if (!output)
3052 return 0;
3053 if (!PyUnicode_Check(output)) {
3054 Py_DECREF(output);
3055 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3056 return 0;
3057 }
3058 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003059 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
3060 PyUnicode_GET_LENGTH(output), 0, 1)) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003061 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3062 Py_DECREF(output);
3063 return 0;
3064 }
3065 *(PyObject**)addr = output;
3066 return Py_CLEANUP_SUPPORTED;
3067}
3068
3069
Martin v. Löwis5b222132007-06-10 09:51:05 +00003070char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003071PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003072{
Christian Heimesf3863112007-11-22 07:46:41 +00003073 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003074 PyUnicodeObject *u = (PyUnicodeObject *)unicode;
3075
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003076 if (!PyUnicode_Check(unicode)) {
3077 PyErr_BadArgument();
3078 return NULL;
3079 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003080 if (PyUnicode_READY(u) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003081 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003082
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003083 if (PyUnicode_UTF8(unicode) == NULL) {
3084 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003085 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3086 if (bytes == NULL)
3087 return NULL;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003088 _PyUnicode_UTF8(u) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3089 if (_PyUnicode_UTF8(u) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003090 Py_DECREF(bytes);
3091 return NULL;
3092 }
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003093 _PyUnicode_UTF8_LENGTH(u) = PyBytes_GET_SIZE(bytes);
3094 Py_MEMCPY(_PyUnicode_UTF8(u), PyBytes_AS_STRING(bytes), _PyUnicode_UTF8_LENGTH(u) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003095 Py_DECREF(bytes);
3096 }
3097
3098 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003099 *psize = PyUnicode_UTF8_LENGTH(unicode);
3100 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003101}
3102
3103char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003104PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003105{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003106 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3107}
3108
3109#ifdef Py_DEBUG
3110int unicode_as_unicode_calls = 0;
3111#endif
3112
3113
3114Py_UNICODE *
3115PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3116{
3117 PyUnicodeObject *u;
3118 const unsigned char *one_byte;
3119#if SIZEOF_WCHAR_T == 4
3120 const Py_UCS2 *two_bytes;
3121#else
3122 const Py_UCS4 *four_bytes;
3123 const Py_UCS4 *ucs4_end;
3124 Py_ssize_t num_surrogates;
3125#endif
3126 wchar_t *w;
3127 wchar_t *wchar_end;
3128
3129 if (!PyUnicode_Check(unicode)) {
3130 PyErr_BadArgument();
3131 return NULL;
3132 }
3133 u = (PyUnicodeObject*)unicode;
3134 if (_PyUnicode_WSTR(u) == NULL) {
3135 /* Non-ASCII compact unicode object */
3136 assert(_PyUnicode_KIND(u) != 0);
3137 assert(PyUnicode_IS_READY(u));
3138
3139#ifdef Py_DEBUG
3140 ++unicode_as_unicode_calls;
3141#endif
3142
3143 if (PyUnicode_KIND(u) == PyUnicode_4BYTE_KIND) {
3144#if SIZEOF_WCHAR_T == 2
3145 four_bytes = PyUnicode_4BYTE_DATA(u);
3146 ucs4_end = four_bytes + _PyUnicode_LENGTH(u);
3147 num_surrogates = 0;
3148
3149 for (; four_bytes < ucs4_end; ++four_bytes) {
3150 if (*four_bytes > 0xFFFF)
3151 ++num_surrogates;
3152 }
3153
3154 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(
3155 sizeof(wchar_t) * (_PyUnicode_LENGTH(u) + 1 + num_surrogates));
3156 if (!_PyUnicode_WSTR(u)) {
3157 PyErr_NoMemory();
3158 return NULL;
3159 }
3160 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u) + num_surrogates;
3161
3162 w = _PyUnicode_WSTR(u);
3163 wchar_end = w + _PyUnicode_WSTR_LENGTH(u);
3164 four_bytes = PyUnicode_4BYTE_DATA(u);
3165 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3166 if (*four_bytes > 0xFFFF) {
3167 /* encode surrogate pair in this case */
3168 *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10);
3169 *w = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF);
3170 }
3171 else
3172 *w = *four_bytes;
3173
3174 if (w > wchar_end) {
3175 assert(0 && "Miscalculated string end");
3176 }
3177 }
3178 *w = 0;
3179#else
3180 /* sizeof(wchar_t) == 4 */
3181 Py_FatalError("Impossible unicode object state, wstr and str "
3182 "should share memory already.");
3183 return NULL;
3184#endif
3185 }
3186 else {
3187 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3188 (_PyUnicode_LENGTH(u) + 1));
3189 if (!_PyUnicode_WSTR(u)) {
3190 PyErr_NoMemory();
3191 return NULL;
3192 }
3193 if (!PyUnicode_IS_COMPACT_ASCII(u))
3194 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u);
3195 w = _PyUnicode_WSTR(u);
3196 wchar_end = w + _PyUnicode_LENGTH(u);
3197
3198 if (PyUnicode_KIND(u) == PyUnicode_1BYTE_KIND) {
3199 one_byte = PyUnicode_1BYTE_DATA(u);
3200 for (; w < wchar_end; ++one_byte, ++w)
3201 *w = *one_byte;
3202 /* null-terminate the wstr */
3203 *w = 0;
3204 }
3205 else if (PyUnicode_KIND(u) == PyUnicode_2BYTE_KIND) {
3206#if SIZEOF_WCHAR_T == 4
3207 two_bytes = PyUnicode_2BYTE_DATA(u);
3208 for (; w < wchar_end; ++two_bytes, ++w)
3209 *w = *two_bytes;
3210 /* null-terminate the wstr */
3211 *w = 0;
3212#else
3213 /* sizeof(wchar_t) == 2 */
3214 PyObject_FREE(_PyUnicode_WSTR(u));
3215 _PyUnicode_WSTR(u) = NULL;
3216 Py_FatalError("Impossible unicode object state, wstr "
3217 "and str should share memory already.");
3218 return NULL;
3219#endif
3220 }
3221 else {
3222 assert(0 && "This should never happen.");
3223 }
3224 }
3225 }
3226 if (size != NULL)
3227 *size = PyUnicode_WSTR_LENGTH(u);
3228 return _PyUnicode_WSTR(u);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003229}
3230
Alexander Belopolsky40018472011-02-26 01:02:56 +00003231Py_UNICODE *
3232PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003233{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003234 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003235}
3236
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003237
Alexander Belopolsky40018472011-02-26 01:02:56 +00003238Py_ssize_t
3239PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003240{
3241 if (!PyUnicode_Check(unicode)) {
3242 PyErr_BadArgument();
3243 goto onError;
3244 }
3245 return PyUnicode_GET_SIZE(unicode);
3246
Benjamin Peterson29060642009-01-31 22:14:21 +00003247 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003248 return -1;
3249}
3250
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003251Py_ssize_t
3252PyUnicode_GetLength(PyObject *unicode)
3253{
Victor Stinner5a706cf2011-10-02 00:36:53 +02003254 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003255 PyErr_BadArgument();
3256 return -1;
3257 }
3258
3259 return PyUnicode_GET_LENGTH(unicode);
3260}
3261
3262Py_UCS4
3263PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3264{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003265 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3266 PyErr_BadArgument();
3267 return (Py_UCS4)-1;
3268 }
3269 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3270 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003271 return (Py_UCS4)-1;
3272 }
3273 return PyUnicode_READ_CHAR(unicode, index);
3274}
3275
3276int
3277PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3278{
3279 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003280 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003281 return -1;
3282 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02003283 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3284 PyErr_SetString(PyExc_IndexError, "string index out of range");
3285 return -1;
3286 }
3287 if (_PyUnicode_Dirty(unicode))
3288 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003289 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3290 index, ch);
3291 return 0;
3292}
3293
/* Return the name of the default encoding, which is always "utf-8".
   The returned string has static storage and must not be modified. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3299
Victor Stinner554f3f02010-06-16 23:33:54 +00003300/* create or adjust a UnicodeDecodeError */
3301static void
3302make_decode_exception(PyObject **exceptionObject,
3303 const char *encoding,
3304 const char *input, Py_ssize_t length,
3305 Py_ssize_t startpos, Py_ssize_t endpos,
3306 const char *reason)
3307{
3308 if (*exceptionObject == NULL) {
3309 *exceptionObject = PyUnicodeDecodeError_Create(
3310 encoding, input, length, startpos, endpos, reason);
3311 }
3312 else {
3313 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3314 goto onError;
3315 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3316 goto onError;
3317 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3318 goto onError;
3319 }
3320 return;
3321
3322onError:
3323 Py_DECREF(*exceptionObject);
3324 *exceptionObject = NULL;
3325}
3326
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003327/* error handling callback helper:
3328 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00003329 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003330 and adjust various state variables.
3331 return 0 on success, -1 on error
3332*/
3333
Alexander Belopolsky40018472011-02-26 01:02:56 +00003334static int
3335unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003336 const char *encoding, const char *reason,
3337 const char **input, const char **inend, Py_ssize_t *startinpos,
3338 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
3339 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003340{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003341 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003342
3343 PyObject *restuple = NULL;
3344 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003345 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003346 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003347 Py_ssize_t requiredsize;
3348 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003349 const Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003350 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003351 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003352 int res = -1;
3353
3354 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003355 *errorHandler = PyCodec_LookupError(errors);
3356 if (*errorHandler == NULL)
3357 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003358 }
3359
Victor Stinner554f3f02010-06-16 23:33:54 +00003360 make_decode_exception(exceptionObject,
3361 encoding,
3362 *input, *inend - *input,
3363 *startinpos, *endinpos,
3364 reason);
3365 if (*exceptionObject == NULL)
3366 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003367
3368 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3369 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003370 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003371 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00003372 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00003373 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003374 }
3375 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003376 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003377
3378 /* Copy back the bytes variables, which might have been modified by the
3379 callback */
3380 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3381 if (!inputobj)
3382 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00003383 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003384 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00003385 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003386 *input = PyBytes_AS_STRING(inputobj);
3387 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003388 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00003389 /* we can DECREF safely, as the exception has another reference,
3390 so the object won't go away. */
3391 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003392
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003393 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003394 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003395 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003396 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3397 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003398 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003399
3400 /* need more space? (at least enough for what we
3401 have+the replacement+the rest of the string (starting
3402 at the new input position), so we won't have to check space
3403 when there are no errors in the rest of the string) */
3404 repptr = PyUnicode_AS_UNICODE(repunicode);
3405 repsize = PyUnicode_GET_SIZE(repunicode);
3406 requiredsize = *outpos + repsize + insize-newpos;
3407 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003408 if (requiredsize<2*outsize)
3409 requiredsize = 2*outsize;
Victor Stinnerfe226c02011-10-03 03:52:20 +02003410 if (PyUnicode_Resize((PyObject**)output, requiredsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003411 goto onError;
3412 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003413 }
3414 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003415 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003416 Py_UNICODE_COPY(*outptr, repptr, repsize);
3417 *outptr += repsize;
3418 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003419
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003420 /* we made it! */
3421 res = 0;
3422
Benjamin Peterson29060642009-01-31 22:14:21 +00003423 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003424 Py_XDECREF(restuple);
3425 return res;
3426}
3427
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three simple macros defining base-64.
   Note: these macros evaluate their argument more than once, so the
   argument must not have side effects. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')

/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, hence const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * The range test comes first so that utf7_category is never indexed
 * outside 1..127.  directO and directWS are parenthesized so that
 * compound expressions (e.g. `a || b`) can be passed safely. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003508
Alexander Belopolsky40018472011-02-26 01:02:56 +00003509PyObject *
3510PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003511 Py_ssize_t size,
3512 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003513{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003514 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3515}
3516
Antoine Pitrou244651a2009-05-04 18:56:13 +00003517/* The decoder. The only state we preserve is our read position,
3518 * i.e. how many characters we have consumed. So if we end in the
3519 * middle of a shift sequence we have to back off the read position
3520 * and the output to the beginning of the sequence, otherwise we lose
3521 * all the shift state (seen bits, number of bits seen, high
3522 * surrogate). */
3523
Alexander Belopolsky40018472011-02-26 01:02:56 +00003524PyObject *
3525PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003526 Py_ssize_t size,
3527 const char *errors,
3528 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003529{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003530 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003531 Py_ssize_t startinpos;
3532 Py_ssize_t endinpos;
3533 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003534 const char *e;
3535 PyUnicodeObject *unicode;
3536 Py_UNICODE *p;
3537 const char *errmsg = "";
3538 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003539 Py_UNICODE *shiftOutStart;
3540 unsigned int base64bits = 0;
3541 unsigned long base64buffer = 0;
3542 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003543 PyObject *errorHandler = NULL;
3544 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003545
3546 unicode = _PyUnicode_New(size);
3547 if (!unicode)
3548 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003549 if (size == 0) {
3550 if (consumed)
3551 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003552 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003553 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003554
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003555 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitrou244651a2009-05-04 18:56:13 +00003556 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003557 e = s + size;
3558
3559 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003560 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00003561 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00003562 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003563
Antoine Pitrou244651a2009-05-04 18:56:13 +00003564 if (inShift) { /* in a base-64 section */
3565 if (IS_BASE64(ch)) { /* consume a base-64 character */
3566 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3567 base64bits += 6;
3568 s++;
3569 if (base64bits >= 16) {
3570 /* we have enough bits for a UTF-16 value */
3571 Py_UNICODE outCh = (Py_UNICODE)
3572 (base64buffer >> (base64bits-16));
3573 base64bits -= 16;
3574 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3575 if (surrogate) {
3576 /* expecting a second surrogate */
3577 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3578#ifdef Py_UNICODE_WIDE
3579 *p++ = (((surrogate & 0x3FF)<<10)
3580 | (outCh & 0x3FF)) + 0x10000;
3581#else
3582 *p++ = surrogate;
3583 *p++ = outCh;
3584#endif
3585 surrogate = 0;
3586 }
3587 else {
3588 surrogate = 0;
3589 errmsg = "second surrogate missing";
3590 goto utf7Error;
3591 }
3592 }
3593 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
3594 /* first surrogate */
3595 surrogate = outCh;
3596 }
3597 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3598 errmsg = "unexpected second surrogate";
3599 goto utf7Error;
3600 }
3601 else {
3602 *p++ = outCh;
3603 }
3604 }
3605 }
3606 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003607 inShift = 0;
3608 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003609 if (surrogate) {
3610 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00003611 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003612 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003613 if (base64bits > 0) { /* left-over bits */
3614 if (base64bits >= 6) {
3615 /* We've seen at least one base-64 character */
3616 errmsg = "partial character in shift sequence";
3617 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003618 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003619 else {
3620 /* Some bits remain; they should be zero */
3621 if (base64buffer != 0) {
3622 errmsg = "non-zero padding bits in shift sequence";
3623 goto utf7Error;
3624 }
3625 }
3626 }
3627 if (ch != '-') {
3628 /* '-' is absorbed; other terminating
3629 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003630 *p++ = ch;
3631 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003632 }
3633 }
3634 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003635 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003636 s++; /* consume '+' */
3637 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003638 s++;
3639 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00003640 }
3641 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003642 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003643 shiftOutStart = p;
3644 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003645 }
3646 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003647 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003648 *p++ = ch;
3649 s++;
3650 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003651 else {
3652 startinpos = s-starts;
3653 s++;
3654 errmsg = "unexpected special character";
3655 goto utf7Error;
3656 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003657 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003658utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003659 outpos = p-PyUnicode_AS_UNICODE(unicode);
3660 endinpos = s-starts;
3661 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003662 errors, &errorHandler,
3663 "utf7", errmsg,
3664 &starts, &e, &startinpos, &endinpos, &exc, &s,
3665 &unicode, &outpos, &p))
3666 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003667 }
3668
Antoine Pitrou244651a2009-05-04 18:56:13 +00003669 /* end of string */
3670
3671 if (inShift && !consumed) { /* in shift sequence, no more to follow */
3672 /* if we're in an inconsistent state, that's an error */
3673 if (surrogate ||
3674 (base64bits >= 6) ||
3675 (base64bits > 0 && base64buffer != 0)) {
3676 outpos = p-PyUnicode_AS_UNICODE(unicode);
3677 endinpos = size;
3678 if (unicode_decode_call_errorhandler(
3679 errors, &errorHandler,
3680 "utf7", "unterminated shift sequence",
3681 &starts, &e, &startinpos, &endinpos, &exc, &s,
3682 &unicode, &outpos, &p))
3683 goto onError;
3684 if (s < e)
3685 goto restart;
3686 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003687 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003688
3689 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003690 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003691 if (inShift) {
3692 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003693 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003694 }
3695 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003696 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003697 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003698 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003699
Victor Stinnerfe226c02011-10-03 03:52:20 +02003700 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003701 goto onError;
3702
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003703 Py_XDECREF(errorHandler);
3704 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02003705#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02003706 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003707 Py_DECREF(unicode);
3708 return NULL;
3709 }
Victor Stinner17efeed2011-10-04 20:05:46 +02003710#endif
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003711 return (PyObject *)unicode;
3712
Benjamin Peterson29060642009-01-31 22:14:21 +00003713 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003714 Py_XDECREF(errorHandler);
3715 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003716 Py_DECREF(unicode);
3717 return NULL;
3718}
3719
3720
Alexander Belopolsky40018472011-02-26 01:02:56 +00003721PyObject *
3722PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003723 Py_ssize_t size,
3724 int base64SetO,
3725 int base64WhiteSpace,
3726 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003727{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003728 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003729 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003730 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003731 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003732 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003733 unsigned int base64bits = 0;
3734 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003735 char * out;
3736 char * start;
3737
3738 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003739 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003740
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003741 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003742 return PyErr_NoMemory();
3743
Antoine Pitrou244651a2009-05-04 18:56:13 +00003744 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003745 if (v == NULL)
3746 return NULL;
3747
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003748 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003749 for (;i < size; ++i) {
3750 Py_UNICODE ch = s[i];
3751
Antoine Pitrou244651a2009-05-04 18:56:13 +00003752 if (inShift) {
3753 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3754 /* shifting out */
3755 if (base64bits) { /* output remaining bits */
3756 *out++ = TO_BASE64(base64buffer << (6-base64bits));
3757 base64buffer = 0;
3758 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003759 }
3760 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003761 /* Characters not in the BASE64 set implicitly unshift the sequence
3762 so no '-' is required, except if the character is itself a '-' */
3763 if (IS_BASE64(ch) || ch == '-') {
3764 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003765 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003766 *out++ = (char) ch;
3767 }
3768 else {
3769 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00003770 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003771 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003772 else { /* not in a shift sequence */
3773 if (ch == '+') {
3774 *out++ = '+';
3775 *out++ = '-';
3776 }
3777 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3778 *out++ = (char) ch;
3779 }
3780 else {
3781 *out++ = '+';
3782 inShift = 1;
3783 goto encode_char;
3784 }
3785 }
3786 continue;
3787encode_char:
3788#ifdef Py_UNICODE_WIDE
3789 if (ch >= 0x10000) {
3790 /* code first surrogate */
3791 base64bits += 16;
3792 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
3793 while (base64bits >= 6) {
3794 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3795 base64bits -= 6;
3796 }
3797 /* prepare second surrogate */
3798 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
3799 }
3800#endif
3801 base64bits += 16;
3802 base64buffer = (base64buffer << 16) | ch;
3803 while (base64bits >= 6) {
3804 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3805 base64bits -= 6;
3806 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00003807 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003808 if (base64bits)
3809 *out++= TO_BASE64(base64buffer << (6-base64bits) );
3810 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003811 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003812 if (_PyBytes_Resize(&v, out - start) < 0)
3813 return NULL;
3814 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003815}
3816
Antoine Pitrou244651a2009-05-04 18:56:13 +00003817#undef IS_BASE64
3818#undef FROM_BASE64
3819#undef TO_BASE64
3820#undef DECODE_DIRECT
3821#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003822
Guido van Rossumd57fd912000-03-10 22:53:23 +00003823/* --- UTF-8 Codec -------------------------------------------------------- */
3824
/* Map UTF-8 encoded prefix byte to sequence length.  Zero means
   illegal prefix.  See RFC 3629 for details.
   The table is only ever read, so it is declared const. */
static const char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
3846
Alexander Belopolsky40018472011-02-26 01:02:56 +00003847PyObject *
3848PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003849 Py_ssize_t size,
3850 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003851{
Walter Dörwald69652032004-09-07 20:24:22 +00003852 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3853}
3854
Antoine Pitrouab868312009-01-10 15:40:25 +00003855/* Mask to check or force alignment of a pointer to C 'long' boundaries */
3856#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
3857
3858/* Mask to quickly check whether a C 'long' contains a
3859 non-ASCII, UTF8-encoded char. */
3860#if (SIZEOF_LONG == 8)
3861# define ASCII_CHAR_MASK 0x8080808080808080L
3862#elif (SIZEOF_LONG == 4)
3863# define ASCII_CHAR_MASK 0x80808080L
3864#else
3865# error C 'long' size should be either 4 or 8!
3866#endif
3867
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003868/* Scans a UTF-8 string and returns the maximum character to be expected,
3869 the size of the decoded unicode string and if any major errors were
3870 encountered.
3871
3872 This function does check basic UTF-8 sanity, it does however NOT CHECK
3873 if the string contains surrogates, and if all continuation bytes are
3874 within the correct ranges, these checks are performed in
3875 PyUnicode_DecodeUTF8Stateful.
3876
3877 If it sets has_errors to 1, it means the value of unicode_size and max_char
3878 will be bogus and you should not rely on useful information in them.
3879 */
3880static Py_UCS4
3881utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
3882 Py_ssize_t *unicode_size, Py_ssize_t* consumed,
3883 int *has_errors)
3884{
3885 Py_ssize_t n;
3886 Py_ssize_t char_count = 0;
3887 Py_UCS4 max_char = 127, new_max;
3888 Py_UCS4 upper_bound;
3889 const unsigned char *p = (const unsigned char *)s;
3890 const unsigned char *end = p + string_size;
3891 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
3892 int err = 0;
3893
3894 for (; p < end && !err; ++p, ++char_count) {
3895 /* Only check value if it's not a ASCII char... */
3896 if (*p < 0x80) {
3897 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
3898 an explanation. */
3899 if (!((size_t) p & LONG_PTR_MASK)) {
3900 /* Help register allocation */
3901 register const unsigned char *_p = p;
3902 while (_p < aligned_end) {
3903 unsigned long value = *(unsigned long *) _p;
3904 if (value & ASCII_CHAR_MASK)
3905 break;
3906 _p += SIZEOF_LONG;
3907 char_count += SIZEOF_LONG;
3908 }
3909 p = _p;
3910 if (p == end)
3911 break;
3912 }
3913 }
3914 if (*p >= 0x80) {
3915 n = utf8_code_length[*p];
3916 new_max = max_char;
3917 switch (n) {
3918 /* invalid start byte */
3919 case 0:
3920 err = 1;
3921 break;
3922 case 2:
3923 /* Code points between 0x00FF and 0x07FF inclusive.
3924 Approximate the upper bound of the code point,
3925 if this flips over 255 we can be sure it will be more
3926 than 255 and the string will need 2 bytes per code coint,
3927 if it stays under or equal to 255, we can be sure 1 byte
3928 is enough.
3929 ((*p & 0b00011111) << 6) | 0b00111111 */
3930 upper_bound = ((*p & 0x1F) << 6) | 0x3F;
3931 if (max_char < upper_bound)
3932 new_max = upper_bound;
3933 /* Ensure we track at least that we left ASCII space. */
3934 if (new_max < 128)
3935 new_max = 128;
3936 break;
3937 case 3:
3938 /* Between 0x0FFF and 0xFFFF inclusive, so values are
3939 always > 255 and <= 65535 and will always need 2 bytes. */
3940 if (max_char < 65535)
3941 new_max = 65535;
3942 break;
3943 case 4:
3944 /* Code point will be above 0xFFFF for sure in this case. */
3945 new_max = 65537;
3946 break;
3947 /* Internal error, this should be caught by the first if */
3948 case 1:
3949 default:
3950 assert(0 && "Impossible case in utf8_max_char_and_size");
3951 err = 1;
3952 }
3953 /* Instead of number of overall bytes for this code point,
3954 n containts the number of following bytes: */
3955 --n;
3956 /* Check if the follow up chars are all valid continuation bytes */
3957 if (n >= 1) {
3958 const unsigned char *cont;
3959 if ((p + n) >= end) {
3960 if (consumed == 0)
3961 /* incomplete data, non-incremental decoding */
3962 err = 1;
3963 break;
3964 }
3965 for (cont = p + 1; cont < (p + n); ++cont) {
3966 if ((*cont & 0xc0) != 0x80) {
3967 err = 1;
3968 break;
3969 }
3970 }
3971 p += n;
3972 }
3973 else
3974 err = 1;
3975 max_char = new_max;
3976 }
3977 }
3978
3979 if (unicode_size)
3980 *unicode_size = char_count;
3981 if (has_errors)
3982 *has_errors = err;
3983 return max_char;
3984}
3985
/* Store `value` at position `index` of the character buffer `buf`, using
   the element type selected by `kind`.  Works like PyUnicode_WRITE but
   additionally accepts PyUnicode_WCHAR_KIND, i.e. the wstr field of the
   legacy unicode representation (Py_UNICODE elements).  Any kind that is
   not WCHAR, 1BYTE or 2BYTE is stored as Py_UCS4.  Exactly one store is
   executed, so `index` and `value` are evaluated once. */
#define WRITE_FLEXIBLE_OR_WSTR(kind, buf, index, value)                   \
    do {                                                                  \
        switch ((int)(kind)) {                                            \
        case PyUnicode_WCHAR_KIND:                                        \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value);         \
            break;                                                        \
        case PyUnicode_1BYTE_KIND:                                        \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value);   \
            break;                                                        \
        case PyUnicode_2BYTE_KIND:                                        \
            ((Py_UCS2 *)(buf))[(index)] = (Py_UCS2)(value);               \
            break;                                                        \
        default:                                                          \
            ((Py_UCS4 *)(buf))[(index)] = (Py_UCS4)(value);               \
            break;                                                        \
        }                                                                 \
    } while (0)
4000
Alexander Belopolsky40018472011-02-26 01:02:56 +00004001PyObject *
4002PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004003 Py_ssize_t size,
4004 const char *errors,
4005 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00004006{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004007 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004008 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00004009 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004010 Py_ssize_t startinpos;
4011 Py_ssize_t endinpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00004012 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004013 PyUnicodeObject *unicode;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004014 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004015 PyObject *errorHandler = NULL;
4016 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004017 Py_UCS4 maxchar = 0;
4018 Py_ssize_t unicode_size;
4019 Py_ssize_t i;
4020 int kind;
4021 void *data;
4022 int has_errors;
4023 Py_UNICODE *error_outptr;
4024#if SIZEOF_WCHAR_T == 2
4025 Py_ssize_t wchar_offset = 0;
4026#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00004027
Walter Dörwald69652032004-09-07 20:24:22 +00004028 if (size == 0) {
4029 if (consumed)
4030 *consumed = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004031 return (PyObject *)PyUnicode_New(0, 0);
Walter Dörwald69652032004-09-07 20:24:22 +00004032 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004033 maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size,
4034 consumed, &has_errors);
4035 if (has_errors) {
4036 unicode = _PyUnicode_New(size);
4037 if (!unicode)
4038 return NULL;
4039 kind = PyUnicode_WCHAR_KIND;
4040 data = PyUnicode_AS_UNICODE(unicode);
4041 assert(data != NULL);
4042 }
4043 else {
4044 unicode = (PyUnicodeObject *)PyUnicode_New(unicode_size, maxchar);
4045 if (!unicode)
4046 return NULL;
4047 /* When the string is ASCII only, just use memcpy and return.
4048 unicode_size may be != size if there is an incomplete UTF-8
4049 sequence at the end of the ASCII block. */
4050 if (maxchar < 128 && size == unicode_size) {
4051 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
4052 return (PyObject *)unicode;
4053 }
4054 kind = PyUnicode_KIND(unicode);
4055 data = PyUnicode_DATA(unicode);
4056 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004057 /* Unpack UTF-8 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004058 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004059 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00004060 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004061
4062 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004063 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004064
4065 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00004066 /* Fast path for runs of ASCII characters. Given that common UTF-8
4067 input will consist of an overwhelming majority of ASCII
4068 characters, we try to optimize for this case by checking
4069 as many characters as a C 'long' can contain.
4070 First, check if we can do an aligned read, as most CPUs have
4071 a penalty for unaligned reads.
4072 */
4073 if (!((size_t) s & LONG_PTR_MASK)) {
4074 /* Help register allocation */
4075 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004076 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004077 while (_s < aligned_end) {
4078 /* Read a whole long at a time (either 4 or 8 bytes),
4079 and do a fast unrolled copy if it only contains ASCII
4080 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004081 unsigned long value = *(unsigned long *) _s;
4082 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00004083 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004084 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+0, _s[0]);
4085 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+1, _s[1]);
4086 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+2, _s[2]);
4087 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004088#if (SIZEOF_LONG == 8)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004089 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+4, _s[4]);
4090 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+5, _s[5]);
4091 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+6, _s[6]);
4092 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004093#endif
4094 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004095 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00004096 }
4097 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004098 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004099 if (s == e)
4100 break;
4101 ch = (unsigned char)*s;
4102 }
4103 }
4104
4105 if (ch < 0x80) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004106 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004107 s++;
4108 continue;
4109 }
4110
4111 n = utf8_code_length[ch];
4112
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004113 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004114 if (consumed)
4115 break;
4116 else {
4117 errmsg = "unexpected end of data";
4118 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004119 endinpos = startinpos+1;
4120 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
4121 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004122 goto utf8Error;
4123 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004124 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004125
4126 switch (n) {
4127
4128 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00004129 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004130 startinpos = s-starts;
4131 endinpos = startinpos+1;
4132 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004133
4134 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004135 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00004136 startinpos = s-starts;
4137 endinpos = startinpos+1;
4138 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004139
4140 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004141 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00004142 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004143 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004144 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00004145 goto utf8Error;
4146 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004147 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004148 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004149 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004150 break;
4151
4152 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00004153 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4154 will result in surrogates in range d800-dfff. Surrogates are
4155 not valid UTF-8 so they are rejected.
4156 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4157 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00004158 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004159 (s[2] & 0xc0) != 0x80 ||
4160 ((unsigned char)s[0] == 0xE0 &&
4161 (unsigned char)s[1] < 0xA0) ||
4162 ((unsigned char)s[0] == 0xED &&
4163 (unsigned char)s[1] > 0x9F)) {
4164 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004165 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004166 endinpos = startinpos + 1;
4167
4168 /* if s[1] first two bits are 1 and 0, then the invalid
4169 continuation byte is s[2], so increment endinpos by 1,
4170 if not, s[1] is invalid and endinpos doesn't need to
4171 be incremented. */
4172 if ((s[1] & 0xC0) == 0x80)
4173 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004174 goto utf8Error;
4175 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004176 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004177 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004178 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004179 break;
4180
4181 case 4:
4182 if ((s[1] & 0xc0) != 0x80 ||
4183 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004184 (s[3] & 0xc0) != 0x80 ||
4185 ((unsigned char)s[0] == 0xF0 &&
4186 (unsigned char)s[1] < 0x90) ||
4187 ((unsigned char)s[0] == 0xF4 &&
4188 (unsigned char)s[1] > 0x8F)) {
4189 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004190 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004191 endinpos = startinpos + 1;
4192 if ((s[1] & 0xC0) == 0x80) {
4193 endinpos++;
4194 if ((s[2] & 0xC0) == 0x80)
4195 endinpos++;
4196 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004197 goto utf8Error;
4198 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004199 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00004200 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4201 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4202
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004203 /* If the string is flexible or we have native UCS-4, write
4204 directly.. */
4205 if (sizeof(Py_UNICODE) > 2 || kind != PyUnicode_WCHAR_KIND)
4206 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Tim Petersced69f82003-09-16 20:30:58 +00004207
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004208 else {
4209 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00004210
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004211 /* translate from 10000..10FFFF to 0..FFFF */
4212 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00004213
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004214 /* high surrogate = top 10 bits added to D800 */
4215 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4216 (Py_UNICODE)(0xD800 + (ch >> 10)));
4217
4218 /* low surrogate = bottom 10 bits added to DC00 */
4219 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4220 (Py_UNICODE)(0xDC00 + (ch & 0x03FF)));
4221 }
4222#if SIZEOF_WCHAR_T == 2
4223 wchar_offset++;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004224#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00004225 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004226 }
4227 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00004228 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004229
Benjamin Peterson29060642009-01-31 22:14:21 +00004230 utf8Error:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004231 /* If this is not yet a resizable string, make it one.. */
4232 if (kind != PyUnicode_WCHAR_KIND) {
4233 const Py_UNICODE *u;
4234 PyUnicodeObject *new_unicode = _PyUnicode_New(size);
4235 if (!new_unicode)
4236 goto onError;
4237 u = PyUnicode_AsUnicode((PyObject *)unicode);
4238 if (!u)
4239 goto onError;
4240#if SIZEOF_WCHAR_T == 2
4241 i += wchar_offset;
4242#endif
4243 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(new_unicode), u, i);
4244 Py_DECREF(unicode);
4245 unicode = new_unicode;
4246 kind = 0;
4247 data = PyUnicode_AS_UNICODE(new_unicode);
4248 assert(data != NULL);
4249 }
4250 error_outptr = PyUnicode_AS_UNICODE(unicode) + i;
Benjamin Peterson29060642009-01-31 22:14:21 +00004251 if (unicode_decode_call_errorhandler(
4252 errors, &errorHandler,
4253 "utf8", errmsg,
4254 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004255 &unicode, &i, &error_outptr))
Benjamin Peterson29060642009-01-31 22:14:21 +00004256 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004257 /* Update data because unicode_decode_call_errorhandler might have
4258 re-created or resized the unicode object. */
4259 data = PyUnicode_AS_UNICODE(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00004260 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004261 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004262 /* Ensure the unicode_size calculation above was correct: */
4263 assert(kind == PyUnicode_WCHAR_KIND || i == unicode_size);
4264
Walter Dörwald69652032004-09-07 20:24:22 +00004265 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004266 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004267
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004268 /* Adjust length and ready string when it contained errors and
4269 is of the old resizable kind. */
4270 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004271 if (PyUnicode_Resize((PyObject**)&unicode, i) < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004272 goto onError;
4273 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004274
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004275 Py_XDECREF(errorHandler);
4276 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02004277#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004278 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004279 Py_DECREF(unicode);
4280 return NULL;
4281 }
Victor Stinner17efeed2011-10-04 20:05:46 +02004282#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00004283 return (PyObject *)unicode;
4284
Benjamin Peterson29060642009-01-31 22:14:21 +00004285 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004286 Py_XDECREF(errorHandler);
4287 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004288 Py_DECREF(unicode);
4289 return NULL;
4290}
4291
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004292#undef WRITE_FLEXIBLE_OR_WSTR
Antoine Pitrouab868312009-01-10 15:40:25 +00004293
#ifdef __APPLE__

/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   Returns a PyMem_Malloc()ed, L'\0'-terminated wchar_t string, or NULL if
   the buffer size would overflow or the allocation fails.  A byte that
   does not start a valid sequence is written as 0xDC00 + byte and decoding
   resumes at the following byte. */
wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    const char *end;
    wchar_t *result, *out;
    int seqlen;

    /* Note: size will always be longer than the resulting Unicode
       character count */
    if ((size + 1) > PY_SSIZE_T_MAX / sizeof(wchar_t)) {
        PyErr_NoMemory();
        return NULL;
    }
    result = PyMem_Malloc((size + 1) * sizeof(wchar_t));
    if (result == NULL)
        return NULL;

    /* Unpack UTF-8 encoded data */
    out = result;
    end = s + size;
    while (s < end) {
        const Py_UCS4 lead = (unsigned char)*s;
        Py_UCS4 ch = 0;
        int ok = 0;

        /* ASCII is copied through unchanged */
        if (lead < 0x80) {
            *out++ = (wchar_t)lead;
            s++;
            continue;
        }

        /* utf8_code_length yields 0 for an illegal lead byte; lengths 0
           and 1 match no branch below and therefore stay invalid, as does
           a sequence that would run past the end of the input. */
        seqlen = utf8_code_length[lead];
        if (end - s >= seqlen) {
            if (seqlen == 2) {
                ok = ((s[1] & 0xc0) == 0x80);
                if (ok) {
                    ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
                    assert ((ch > 0x007F) && (ch <= 0x07FF));
                }
            }
            else if (seqlen == 3) {
                /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
                   will result in surrogates in range d800-dfff. Surrogates are
                   not valid UTF-8 so they are rejected.
                   See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
                   (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
                const unsigned char b0 = (unsigned char)s[0];
                const unsigned char b1 = (unsigned char)s[1];

                ok = ((s[1] & 0xc0) == 0x80 &&
                      (s[2] & 0xc0) == 0x80 &&
                      !(b0 == 0xE0 && b1 < 0xA0) &&
                      !(b0 == 0xED && b1 > 0x9F));
                if (ok) {
                    ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
                    assert ((ch > 0x07FF) && (ch <= 0xFFFF));
                }
            }
            else if (seqlen == 4) {
                const unsigned char b0 = (unsigned char)s[0];
                const unsigned char b1 = (unsigned char)s[1];

                ok = ((s[1] & 0xc0) == 0x80 &&
                      (s[2] & 0xc0) == 0x80 &&
                      (s[3] & 0xc0) == 0x80 &&
                      !(b0 == 0xF0 && b1 < 0x90) &&
                      !(b0 == 0xF4 && b1 > 0x8F));
                if (ok) {
                    ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
                         ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
                    assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
                }
            }
        }

        if (!ok) {
            /* surrogateescape: escape the lead byte only and retry from
               the next byte */
            *out++ = 0xDC00 + lead;
            s++;
            continue;
        }

#if SIZEOF_WCHAR_T == 4
        *out++ = (wchar_t)ch;
#else
        if (seqlen == 4) {
            /* compute and append the two surrogates: */

            /* translate from 10000..10FFFF to 0..FFFF */
            ch -= 0x10000;

            /* high surrogate = top 10 bits added to D800 */
            *out++ = (wchar_t)(0xD800 + (ch >> 10));

            /* low surrogate = bottom 10 bits added to DC00 */
            *out++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
        }
        else {
            *out++ = (wchar_t)ch;
        }
#endif
        s += seqlen;
    }
    *out = L'\0';
    return result;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004408
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004409/* Primary internal function which creates utf8 encoded bytes objects.
4410
4411 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004412 and allocate exactly as much space needed at the end. Else allocate the
4413 maximum possible needed (4 result bytes per Unicode character), and return
4414 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004415*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004416PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004417_PyUnicode_AsUTF8String(PyObject *obj, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004418{
Tim Peters602f7402002-04-27 18:03:26 +00004419#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00004420
Guido van Rossum98297ee2007-11-06 21:34:58 +00004421 Py_ssize_t i; /* index into s of next input byte */
4422 PyObject *result; /* result string object */
4423 char *p; /* next free byte in output buffer */
4424 Py_ssize_t nallocated; /* number of result bytes allocated */
4425 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00004426 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004427 PyObject *errorHandler = NULL;
4428 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004429 int kind;
4430 void *data;
4431 Py_ssize_t size;
4432 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
4433#if SIZEOF_WCHAR_T == 2
4434 Py_ssize_t wchar_offset = 0;
4435#endif
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004436
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004437 if (!PyUnicode_Check(unicode)) {
4438 PyErr_BadArgument();
4439 return NULL;
4440 }
4441
4442 if (PyUnicode_READY(unicode) == -1)
4443 return NULL;
4444
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004445 if (PyUnicode_UTF8(unicode))
4446 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4447 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004448
4449 kind = PyUnicode_KIND(unicode);
4450 data = PyUnicode_DATA(unicode);
4451 size = PyUnicode_GET_LENGTH(unicode);
4452
Tim Peters602f7402002-04-27 18:03:26 +00004453 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004454
Tim Peters602f7402002-04-27 18:03:26 +00004455 if (size <= MAX_SHORT_UNICHARS) {
4456 /* Write into the stack buffer; nallocated can't overflow.
4457 * At the end, we'll allocate exactly as much heap space as it
4458 * turns out we need.
4459 */
4460 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004461 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00004462 p = stackbuf;
4463 }
4464 else {
4465 /* Overallocate on the heap, and give the excess back at the end. */
4466 nallocated = size * 4;
4467 if (nallocated / 4 != size) /* overflow! */
4468 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00004469 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004470 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00004471 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00004472 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004473 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004474
Tim Peters602f7402002-04-27 18:03:26 +00004475 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004476 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004477
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004478 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00004479 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004480 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004481
Guido van Rossumd57fd912000-03-10 22:53:23 +00004482 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00004483 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00004484 *p++ = (char)(0xc0 | (ch >> 6));
4485 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00004486 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004487 Py_ssize_t newpos;
4488 PyObject *rep;
4489 Py_ssize_t repsize, k, startpos;
4490 startpos = i-1;
4491#if SIZEOF_WCHAR_T == 2
4492 startpos += wchar_offset;
Victor Stinner445a6232010-04-22 20:01:57 +00004493#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004494 rep = unicode_encode_call_errorhandler(
4495 errors, &errorHandler, "utf-8", "surrogates not allowed",
4496 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
4497 &exc, startpos, startpos+1, &newpos);
4498 if (!rep)
4499 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004500
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004501 if (PyBytes_Check(rep))
4502 repsize = PyBytes_GET_SIZE(rep);
4503 else
4504 repsize = PyUnicode_GET_SIZE(rep);
4505
4506 if (repsize > 4) {
4507 Py_ssize_t offset;
4508
4509 if (result == NULL)
4510 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004511 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004512 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004513
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004514 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4515 /* integer overflow */
4516 PyErr_NoMemory();
4517 goto error;
4518 }
4519 nallocated += repsize - 4;
4520 if (result != NULL) {
4521 if (_PyBytes_Resize(&result, nallocated) < 0)
4522 goto error;
4523 } else {
4524 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004525 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004526 goto error;
4527 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4528 }
4529 p = PyBytes_AS_STRING(result) + offset;
4530 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004531
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004532 if (PyBytes_Check(rep)) {
4533 char *prep = PyBytes_AS_STRING(rep);
4534 for(k = repsize; k > 0; k--)
4535 *p++ = *prep++;
4536 } else /* rep is unicode */ {
4537 const Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
4538 Py_UNICODE c;
4539
4540 for(k=0; k<repsize; k++) {
4541 c = prep[k];
4542 if (0x80 <= c) {
4543 raise_encode_exception(&exc, "utf-8",
4544 PyUnicode_AS_UNICODE(unicode),
4545 size, i-1, i,
4546 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004547 goto error;
4548 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004549 *p++ = (char)prep[k];
Victor Stinner31be90b2010-04-22 19:38:16 +00004550 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004551 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004552 Py_DECREF(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004553 } else if (ch < 0x10000) {
4554 *p++ = (char)(0xe0 | (ch >> 12));
4555 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4556 *p++ = (char)(0x80 | (ch & 0x3f));
4557 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00004558 /* Encode UCS4 Unicode ordinals */
4559 *p++ = (char)(0xf0 | (ch >> 18));
4560 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4561 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4562 *p++ = (char)(0x80 | (ch & 0x3f));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004563#if SIZEOF_WCHAR_T == 2
4564 wchar_offset++;
4565#endif
Tim Peters602f7402002-04-27 18:03:26 +00004566 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004567 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004568
Guido van Rossum98297ee2007-11-06 21:34:58 +00004569 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004570 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004571 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004572 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004573 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004574 }
4575 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004576 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004577 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004578 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004579 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004580 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004581
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004582 Py_XDECREF(errorHandler);
4583 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004584 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004585 error:
4586 Py_XDECREF(errorHandler);
4587 Py_XDECREF(exc);
4588 Py_XDECREF(result);
4589 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004590
Tim Peters602f7402002-04-27 18:03:26 +00004591#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004592}
4593
Alexander Belopolsky40018472011-02-26 01:02:56 +00004594PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004595PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4596 Py_ssize_t size,
4597 const char *errors)
4598{
4599 PyObject *v, *unicode;
4600
4601 unicode = PyUnicode_FromUnicode(s, size);
4602 if (unicode == NULL)
4603 return NULL;
4604 v = _PyUnicode_AsUTF8String(unicode, errors);
4605 Py_DECREF(unicode);
4606 return v;
4607}
4608
4609PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004610PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004611{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004612 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004613}
4614
Walter Dörwald41980ca2007-08-16 21:55:45 +00004615/* --- UTF-32 Codec ------------------------------------------------------- */
4616
4617PyObject *
4618PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004619 Py_ssize_t size,
4620 const char *errors,
4621 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004622{
4623 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4624}
4625
4626PyObject *
4627PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004628 Py_ssize_t size,
4629 const char *errors,
4630 int *byteorder,
4631 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004632{
4633 const char *starts = s;
4634 Py_ssize_t startinpos;
4635 Py_ssize_t endinpos;
4636 Py_ssize_t outpos;
4637 PyUnicodeObject *unicode;
4638 Py_UNICODE *p;
4639#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004640 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004641 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004642#else
4643 const int pairs = 0;
4644#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00004645 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004646 int bo = 0; /* assume native ordering by default */
4647 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004648 /* Offsets from q for retrieving bytes in the right order. */
4649#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4650 int iorder[] = {0, 1, 2, 3};
4651#else
4652 int iorder[] = {3, 2, 1, 0};
4653#endif
4654 PyObject *errorHandler = NULL;
4655 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004656
Walter Dörwald41980ca2007-08-16 21:55:45 +00004657 q = (unsigned char *)s;
4658 e = q + size;
4659
4660 if (byteorder)
4661 bo = *byteorder;
4662
4663 /* Check for BOM marks (U+FEFF) in the input and adjust current
4664 byte order setting accordingly. In native mode, the leading BOM
4665 mark is skipped, in all other modes, it is copied to the output
4666 stream as-is (giving a ZWNBSP character). */
4667 if (bo == 0) {
4668 if (size >= 4) {
4669 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004670 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004671#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004672 if (bom == 0x0000FEFF) {
4673 q += 4;
4674 bo = -1;
4675 }
4676 else if (bom == 0xFFFE0000) {
4677 q += 4;
4678 bo = 1;
4679 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004680#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004681 if (bom == 0x0000FEFF) {
4682 q += 4;
4683 bo = 1;
4684 }
4685 else if (bom == 0xFFFE0000) {
4686 q += 4;
4687 bo = -1;
4688 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004689#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004690 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004691 }
4692
4693 if (bo == -1) {
4694 /* force LE */
4695 iorder[0] = 0;
4696 iorder[1] = 1;
4697 iorder[2] = 2;
4698 iorder[3] = 3;
4699 }
4700 else if (bo == 1) {
4701 /* force BE */
4702 iorder[0] = 3;
4703 iorder[1] = 2;
4704 iorder[2] = 1;
4705 iorder[3] = 0;
4706 }
4707
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004708 /* On narrow builds we split characters outside the BMP into two
4709 codepoints => count how much extra space we need. */
4710#ifndef Py_UNICODE_WIDE
4711 for (qq = q; qq < e; qq += 4)
4712 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
4713 pairs++;
4714#endif
4715
4716 /* This might be one to much, because of a BOM */
4717 unicode = _PyUnicode_New((size+3)/4+pairs);
4718 if (!unicode)
4719 return NULL;
4720 if (size == 0)
4721 return (PyObject *)unicode;
4722
4723 /* Unpack UTF-32 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004724 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004725
Walter Dörwald41980ca2007-08-16 21:55:45 +00004726 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004727 Py_UCS4 ch;
4728 /* remaining bytes at the end? (size should be divisible by 4) */
4729 if (e-q<4) {
4730 if (consumed)
4731 break;
4732 errmsg = "truncated data";
4733 startinpos = ((const char *)q)-starts;
4734 endinpos = ((const char *)e)-starts;
4735 goto utf32Error;
4736 /* The remaining input chars are ignored if the callback
4737 chooses to skip the input */
4738 }
4739 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4740 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004741
Benjamin Peterson29060642009-01-31 22:14:21 +00004742 if (ch >= 0x110000)
4743 {
4744 errmsg = "codepoint not in range(0x110000)";
4745 startinpos = ((const char *)q)-starts;
4746 endinpos = startinpos+4;
4747 goto utf32Error;
4748 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004749#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004750 if (ch >= 0x10000)
4751 {
4752 *p++ = 0xD800 | ((ch-0x10000) >> 10);
4753 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
4754 }
4755 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00004756#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004757 *p++ = ch;
4758 q += 4;
4759 continue;
4760 utf32Error:
4761 outpos = p-PyUnicode_AS_UNICODE(unicode);
4762 if (unicode_decode_call_errorhandler(
4763 errors, &errorHandler,
4764 "utf32", errmsg,
4765 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
4766 &unicode, &outpos, &p))
4767 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004768 }
4769
4770 if (byteorder)
4771 *byteorder = bo;
4772
4773 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004774 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004775
4776 /* Adjust length */
Victor Stinnerfe226c02011-10-03 03:52:20 +02004777 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004778 goto onError;
4779
4780 Py_XDECREF(errorHandler);
4781 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02004782#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004783 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004784 Py_DECREF(unicode);
4785 return NULL;
4786 }
Victor Stinner17efeed2011-10-04 20:05:46 +02004787#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00004788 return (PyObject *)unicode;
4789
Benjamin Peterson29060642009-01-31 22:14:21 +00004790 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00004791 Py_DECREF(unicode);
4792 Py_XDECREF(errorHandler);
4793 Py_XDECREF(exc);
4794 return NULL;
4795}
4796
4797PyObject *
4798PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004799 Py_ssize_t size,
4800 const char *errors,
4801 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004802{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004803 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004804 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004805 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004806#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004807 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004808#else
4809 const int pairs = 0;
4810#endif
4811 /* Offsets from p for storing byte pairs in the right order. */
4812#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4813 int iorder[] = {0, 1, 2, 3};
4814#else
4815 int iorder[] = {3, 2, 1, 0};
4816#endif
4817
Benjamin Peterson29060642009-01-31 22:14:21 +00004818#define STORECHAR(CH) \
4819 do { \
4820 p[iorder[3]] = ((CH) >> 24) & 0xff; \
4821 p[iorder[2]] = ((CH) >> 16) & 0xff; \
4822 p[iorder[1]] = ((CH) >> 8) & 0xff; \
4823 p[iorder[0]] = (CH) & 0xff; \
4824 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00004825 } while(0)
4826
4827 /* In narrow builds we can output surrogate pairs as one codepoint,
4828 so we need less space. */
4829#ifndef Py_UNICODE_WIDE
4830 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00004831 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
4832 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
4833 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004834#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004835 nsize = (size - pairs + (byteorder == 0));
4836 bytesize = nsize * 4;
4837 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004838 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004839 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004840 if (v == NULL)
4841 return NULL;
4842
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004843 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004844 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004845 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004846 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004847 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004848
4849 if (byteorder == -1) {
4850 /* force LE */
4851 iorder[0] = 0;
4852 iorder[1] = 1;
4853 iorder[2] = 2;
4854 iorder[3] = 3;
4855 }
4856 else if (byteorder == 1) {
4857 /* force BE */
4858 iorder[0] = 3;
4859 iorder[1] = 2;
4860 iorder[2] = 1;
4861 iorder[3] = 0;
4862 }
4863
4864 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004865 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004866#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004867 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
4868 Py_UCS4 ch2 = *s;
4869 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
4870 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
4871 s++;
4872 size--;
4873 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004874 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004875#endif
4876 STORECHAR(ch);
4877 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004878
4879 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004880 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004881#undef STORECHAR
4882}
4883
Alexander Belopolsky40018472011-02-26 01:02:56 +00004884PyObject *
4885PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004886{
4887 if (!PyUnicode_Check(unicode)) {
4888 PyErr_BadArgument();
4889 return NULL;
4890 }
4891 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004892 PyUnicode_GET_SIZE(unicode),
4893 NULL,
4894 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004895}
4896
Guido van Rossumd57fd912000-03-10 22:53:23 +00004897/* --- UTF-16 Codec ------------------------------------------------------- */
4898
Tim Peters772747b2001-08-09 22:21:55 +00004899PyObject *
4900PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004901 Py_ssize_t size,
4902 const char *errors,
4903 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004904{
Walter Dörwald69652032004-09-07 20:24:22 +00004905 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
4906}
4907
Antoine Pitrouab868312009-01-10 15:40:25 +00004908/* Two masks for fast checking of whether a C 'long' may contain
4909 UTF16-encoded surrogate characters. This is an efficient heuristic,
4910 assuming that non-surrogate characters with a code point >= 0x8000 are
4911 rare in most input.
4912 FAST_CHAR_MASK is used when the input is in native byte ordering,
4913 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00004914*/
Antoine Pitrouab868312009-01-10 15:40:25 +00004915#if (SIZEOF_LONG == 8)
4916# define FAST_CHAR_MASK 0x8000800080008000L
4917# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
4918#elif (SIZEOF_LONG == 4)
4919# define FAST_CHAR_MASK 0x80008000L
4920# define SWAPPED_FAST_CHAR_MASK 0x00800080L
4921#else
4922# error C 'long' size should be either 4 or 8!
4923#endif
4924
Walter Dörwald69652032004-09-07 20:24:22 +00004925PyObject *
4926PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004927 Py_ssize_t size,
4928 const char *errors,
4929 int *byteorder,
4930 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00004931{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004932 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004933 Py_ssize_t startinpos;
4934 Py_ssize_t endinpos;
4935 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004936 PyUnicodeObject *unicode;
4937 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004938 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00004939 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00004940 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004941 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00004942 /* Offsets from q for retrieving byte pairs in the right order. */
4943#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4944 int ihi = 1, ilo = 0;
4945#else
4946 int ihi = 0, ilo = 1;
4947#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004948 PyObject *errorHandler = NULL;
4949 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004950
4951 /* Note: size will always be longer than the resulting Unicode
4952 character count */
4953 unicode = _PyUnicode_New(size);
4954 if (!unicode)
4955 return NULL;
4956 if (size == 0)
4957 return (PyObject *)unicode;
4958
4959 /* Unpack UTF-16 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004960 p = PyUnicode_AS_UNICODE(unicode);
Tim Peters772747b2001-08-09 22:21:55 +00004961 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00004962 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004963
4964 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00004965 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004966
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004967 /* Check for BOM marks (U+FEFF) in the input and adjust current
4968 byte order setting accordingly. In native mode, the leading BOM
4969 mark is skipped, in all other modes, it is copied to the output
4970 stream as-is (giving a ZWNBSP character). */
4971 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00004972 if (size >= 2) {
4973 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004974#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004975 if (bom == 0xFEFF) {
4976 q += 2;
4977 bo = -1;
4978 }
4979 else if (bom == 0xFFFE) {
4980 q += 2;
4981 bo = 1;
4982 }
Tim Petersced69f82003-09-16 20:30:58 +00004983#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004984 if (bom == 0xFEFF) {
4985 q += 2;
4986 bo = 1;
4987 }
4988 else if (bom == 0xFFFE) {
4989 q += 2;
4990 bo = -1;
4991 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004992#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004993 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004994 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004995
Tim Peters772747b2001-08-09 22:21:55 +00004996 if (bo == -1) {
4997 /* force LE */
4998 ihi = 1;
4999 ilo = 0;
5000 }
5001 else if (bo == 1) {
5002 /* force BE */
5003 ihi = 0;
5004 ilo = 1;
5005 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005006#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5007 native_ordering = ilo < ihi;
5008#else
5009 native_ordering = ilo > ihi;
5010#endif
Tim Peters772747b2001-08-09 22:21:55 +00005011
Antoine Pitrouab868312009-01-10 15:40:25 +00005012 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00005013 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005014 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00005015 /* First check for possible aligned read of a C 'long'. Unaligned
5016 reads are more expensive, better to defer to another iteration. */
5017 if (!((size_t) q & LONG_PTR_MASK)) {
5018 /* Fast path for runs of non-surrogate chars. */
5019 register const unsigned char *_q = q;
5020 Py_UNICODE *_p = p;
5021 if (native_ordering) {
5022 /* Native ordering is simple: as long as the input cannot
5023 possibly contain a surrogate char, do an unrolled copy
5024 of several 16-bit code points to the target object.
5025 The non-surrogate check is done on several input bytes
5026 at a time (as many as a C 'long' can contain). */
5027 while (_q < aligned_end) {
5028 unsigned long data = * (unsigned long *) _q;
5029 if (data & FAST_CHAR_MASK)
5030 break;
5031 _p[0] = ((unsigned short *) _q)[0];
5032 _p[1] = ((unsigned short *) _q)[1];
5033#if (SIZEOF_LONG == 8)
5034 _p[2] = ((unsigned short *) _q)[2];
5035 _p[3] = ((unsigned short *) _q)[3];
5036#endif
5037 _q += SIZEOF_LONG;
5038 _p += SIZEOF_LONG / 2;
5039 }
5040 }
5041 else {
5042 /* Byteswapped ordering is similar, but we must decompose
5043 the copy bytewise, and take care of zero'ing out the
5044 upper bytes if the target object is in 32-bit units
5045 (that is, in UCS-4 builds). */
5046 while (_q < aligned_end) {
5047 unsigned long data = * (unsigned long *) _q;
5048 if (data & SWAPPED_FAST_CHAR_MASK)
5049 break;
5050 /* Zero upper bytes in UCS-4 builds */
5051#if (Py_UNICODE_SIZE > 2)
5052 _p[0] = 0;
5053 _p[1] = 0;
5054#if (SIZEOF_LONG == 8)
5055 _p[2] = 0;
5056 _p[3] = 0;
5057#endif
5058#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00005059 /* Issue #4916; UCS-4 builds on big endian machines must
5060 fill the two last bytes of each 4-byte unit. */
5061#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
5062# define OFF 2
5063#else
5064# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00005065#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00005066 ((unsigned char *) _p)[OFF + 1] = _q[0];
5067 ((unsigned char *) _p)[OFF + 0] = _q[1];
5068 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
5069 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
5070#if (SIZEOF_LONG == 8)
5071 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
5072 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
5073 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
5074 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
5075#endif
5076#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00005077 _q += SIZEOF_LONG;
5078 _p += SIZEOF_LONG / 2;
5079 }
5080 }
5081 p = _p;
5082 q = _q;
5083 if (q >= e)
5084 break;
5085 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005086 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005087
Benjamin Peterson14339b62009-01-31 16:36:08 +00005088 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00005089
5090 if (ch < 0xD800 || ch > 0xDFFF) {
5091 *p++ = ch;
5092 continue;
5093 }
5094
5095 /* UTF-16 code pair: */
5096 if (q > e) {
5097 errmsg = "unexpected end of data";
5098 startinpos = (((const char *)q) - 2) - starts;
5099 endinpos = ((const char *)e) + 1 - starts;
5100 goto utf16Error;
5101 }
5102 if (0xD800 <= ch && ch <= 0xDBFF) {
5103 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
5104 q += 2;
5105 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00005106#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005107 *p++ = ch;
5108 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005109#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005110 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005111#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005112 continue;
5113 }
5114 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005115 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00005116 startinpos = (((const char *)q)-4)-starts;
5117 endinpos = startinpos+2;
5118 goto utf16Error;
5119 }
5120
Benjamin Peterson14339b62009-01-31 16:36:08 +00005121 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005122 errmsg = "illegal encoding";
5123 startinpos = (((const char *)q)-2)-starts;
5124 endinpos = startinpos+2;
5125 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005126
Benjamin Peterson29060642009-01-31 22:14:21 +00005127 utf16Error:
5128 outpos = p - PyUnicode_AS_UNICODE(unicode);
5129 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005130 errors,
5131 &errorHandler,
5132 "utf16", errmsg,
5133 &starts,
5134 (const char **)&e,
5135 &startinpos,
5136 &endinpos,
5137 &exc,
5138 (const char **)&q,
5139 &unicode,
5140 &outpos,
5141 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00005142 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005143 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005144 /* remaining byte at the end? (size should be even) */
5145 if (e == q) {
5146 if (!consumed) {
5147 errmsg = "truncated data";
5148 startinpos = ((const char *)q) - starts;
5149 endinpos = ((const char *)e) + 1 - starts;
5150 outpos = p - PyUnicode_AS_UNICODE(unicode);
5151 if (unicode_decode_call_errorhandler(
5152 errors,
5153 &errorHandler,
5154 "utf16", errmsg,
5155 &starts,
5156 (const char **)&e,
5157 &startinpos,
5158 &endinpos,
5159 &exc,
5160 (const char **)&q,
5161 &unicode,
5162 &outpos,
5163 &p))
5164 goto onError;
5165 /* The remaining input chars are ignored if the callback
5166 chooses to skip the input */
5167 }
5168 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005169
5170 if (byteorder)
5171 *byteorder = bo;
5172
Walter Dörwald69652032004-09-07 20:24:22 +00005173 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005174 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005175
Guido van Rossumd57fd912000-03-10 22:53:23 +00005176 /* Adjust length */
Victor Stinnerfe226c02011-10-03 03:52:20 +02005177 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005178 goto onError;
5179
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005180 Py_XDECREF(errorHandler);
5181 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02005182#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005183 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005184 Py_DECREF(unicode);
5185 return NULL;
5186 }
Victor Stinner17efeed2011-10-04 20:05:46 +02005187#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00005188 return (PyObject *)unicode;
5189
Benjamin Peterson29060642009-01-31 22:14:21 +00005190 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005191 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005192 Py_XDECREF(errorHandler);
5193 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005194 return NULL;
5195}
5196
Antoine Pitrouab868312009-01-10 15:40:25 +00005197#undef FAST_CHAR_MASK
5198#undef SWAPPED_FAST_CHAR_MASK
5199
Tim Peters772747b2001-08-09 22:21:55 +00005200PyObject *
5201PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005202 Py_ssize_t size,
5203 const char *errors,
5204 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005205{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005206 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00005207 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005208 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005209#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005210 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005211#else
5212 const int pairs = 0;
5213#endif
Tim Peters772747b2001-08-09 22:21:55 +00005214 /* Offsets from p for storing byte pairs in the right order. */
5215#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5216 int ihi = 1, ilo = 0;
5217#else
5218 int ihi = 0, ilo = 1;
5219#endif
5220
Benjamin Peterson29060642009-01-31 22:14:21 +00005221#define STORECHAR(CH) \
5222 do { \
5223 p[ihi] = ((CH) >> 8) & 0xff; \
5224 p[ilo] = (CH) & 0xff; \
5225 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00005226 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005227
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005228#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005229 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00005230 if (s[i] >= 0x10000)
5231 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005232#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005233 /* 2 * (size + pairs + (byteorder == 0)) */
5234 if (size > PY_SSIZE_T_MAX ||
5235 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005236 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005237 nsize = size + pairs + (byteorder == 0);
5238 bytesize = nsize * 2;
5239 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005240 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005241 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005242 if (v == NULL)
5243 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005244
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005245 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005246 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005247 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005248 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005249 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005250
5251 if (byteorder == -1) {
5252 /* force LE */
5253 ihi = 1;
5254 ilo = 0;
5255 }
5256 else if (byteorder == 1) {
5257 /* force BE */
5258 ihi = 0;
5259 ilo = 1;
5260 }
5261
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005262 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005263 Py_UNICODE ch = *s++;
5264 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005265#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005266 if (ch >= 0x10000) {
5267 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
5268 ch = 0xD800 | ((ch-0x10000) >> 10);
5269 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005270#endif
Tim Peters772747b2001-08-09 22:21:55 +00005271 STORECHAR(ch);
5272 if (ch2)
5273 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005274 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005275
5276 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005277 return v;
Tim Peters772747b2001-08-09 22:21:55 +00005278#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005279}
5280
Alexander Belopolsky40018472011-02-26 01:02:56 +00005281PyObject *
5282PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005283{
5284 if (!PyUnicode_Check(unicode)) {
5285 PyErr_BadArgument();
5286 return NULL;
5287 }
5288 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005289 PyUnicode_GET_SIZE(unicode),
5290 NULL,
5291 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005292}
5293
5294/* --- Unicode Escape Codec ----------------------------------------------- */
5295
/* Helper function for PyUnicode_DecodeUnicodeEscape: determines whether
   the escaped string decodes to pure ASCII using only simple escapes.

   Returns the number of characters the decoded string will contain, or -1
   when that cannot be promised: size is negative, a byte above 127 occurs,
   a backslash is the last byte, or an octal, \x, \u, \U or \N escape is
   present (these may produce non-ASCII characters).
 */
Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *p;
    const unsigned char *end;
    Py_ssize_t length = 0;

    /* Check the size before forming s + size: pointer arithmetic with a
       negative offset would leave the buffer, which is undefined. */
    if (size < 0)
        return -1;

    p = (const unsigned char *)s;
    end = p + size;

    for (; p < end; ++p) {
        if (*p > 127) {
            /* Non-ASCII */
            return -1;
        }
        else if (*p != '\\') {
            /* Normal character */
            ++length;
        }
        else {
            /* Backslash-escape, check next char */
            ++p;
            /* Escape sequence reaches till end of string or
               non-ASCII follow-up. */
            if (p >= end || *p > 127)
                return -1;
            switch (*p) {
            case '\n':
                /* backslash + \n result in zero characters */
                break;
            case '\\': case '\'': case '\"':
            case 'b': case 'f': case 't':
            case 'n': case 'r': case 'v': case 'a':
                /* simple escape: exactly one output character */
                ++length;
                break;
            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7':
            case 'x': case 'u': case 'U': case 'N':
                /* these do not guarantee ASCII characters */
                return -1;
            default:
                /* unknown escape: count the backslash + the other character */
                length += 2;
                break;
            }
        }
    }
    return length;
}
5350
/* Similar to PyUnicode_WRITE but either write into wstr field
   or treat string as ASCII: when `kind` is PyUnicode_WCHAR_KIND the buffer
   is addressed as Py_UNICODE units, otherwise as unsigned char bytes.
   `index` and `value` are each evaluated exactly once. */
#define WRITE_ASCII_OR_WSTR(kind, buf, index, value)                    \
    do {                                                                \
        if ((kind) != PyUnicode_WCHAR_KIND)                             \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
        else                                                            \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value);       \
    } while (0)

/* Store `value` as a Py_UNICODE unit at buf[index].
   NOTE: this macro reads a variable named `kind` from the scope it is
   expanded in and asserts that it is PyUnicode_WCHAR_KIND.  It is a
   statement macro (do/while) so that it composes safely with if/else. */
#define WRITE_WSTR(buf, index, value)                                   \
    do {                                                                \
        assert(kind == PyUnicode_WCHAR_KIND);                           \
        ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value);           \
    } while (0)
5364
5365
Fredrik Lundh06d12682001-01-24 07:59:11 +00005366static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005367
Alexander Belopolsky40018472011-02-26 01:02:56 +00005368PyObject *
5369PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005370 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005371 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005372{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005373 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005374 Py_ssize_t startinpos;
5375 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005376 int j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005377 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005378 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005379 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005380 char* message;
5381 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005382 PyObject *errorHandler = NULL;
5383 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005384 Py_ssize_t ascii_length;
5385 Py_ssize_t i;
5386 int kind;
5387 void *data;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005388
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005389 ascii_length = length_of_escaped_ascii_string(s, size);
5390
5391 /* After length_of_escaped_ascii_string() there are two alternatives,
5392 either the string is pure ASCII with named escapes like \n, etc.
5393 and we determined it's exact size (common case)
5394 or it contains \x, \u, ... escape sequences. then we create a
5395 legacy wchar string and resize it at the end of this function. */
5396 if (ascii_length >= 0) {
5397 v = (PyUnicodeObject *)PyUnicode_New(ascii_length, 127);
5398 if (!v)
5399 goto onError;
5400 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
5401 kind = PyUnicode_1BYTE_KIND;
5402 data = PyUnicode_DATA(v);
5403 }
5404 else {
5405 /* Escaped strings will always be longer than the resulting
5406 Unicode string, so we start with size here and then reduce the
5407 length after conversion to the true value.
5408 (but if the error callback returns a long replacement string
5409 we'll have to allocate more space) */
5410 v = _PyUnicode_New(size);
5411 if (!v)
5412 goto onError;
5413 kind = PyUnicode_WCHAR_KIND;
5414 data = PyUnicode_AS_UNICODE(v);
5415 }
5416
Guido van Rossumd57fd912000-03-10 22:53:23 +00005417 if (size == 0)
5418 return (PyObject *)v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005419 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005420 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005421
Guido van Rossumd57fd912000-03-10 22:53:23 +00005422 while (s < end) {
5423 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005424 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005425 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005426
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005427 if (kind == PyUnicode_WCHAR_KIND) {
5428 assert(i < _PyUnicode_WSTR_LENGTH(v));
5429 }
5430 else {
5431 /* The only case in which i == ascii_length is a backslash
5432 followed by a newline. */
5433 assert(i <= ascii_length);
5434 }
5435
Guido van Rossumd57fd912000-03-10 22:53:23 +00005436 /* Non-escape characters are interpreted as Unicode ordinals */
5437 if (*s != '\\') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005438 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char) *s++);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005439 continue;
5440 }
5441
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005442 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005443 /* \ - Escapes */
5444 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005445 c = *s++;
5446 if (s > end)
5447 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005448
5449 if (kind == PyUnicode_WCHAR_KIND) {
5450 assert(i < _PyUnicode_WSTR_LENGTH(v));
5451 }
5452 else {
5453 /* The only case in which i == ascii_length is a backslash
5454 followed by a newline. */
5455 assert(i < ascii_length || (i == ascii_length && c == '\n'));
5456 }
5457
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005458 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005459
Benjamin Peterson29060642009-01-31 22:14:21 +00005460 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005461 case '\n': break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005462 case '\\': WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); break;
5463 case '\'': WRITE_ASCII_OR_WSTR(kind, data, i++, '\''); break;
5464 case '\"': WRITE_ASCII_OR_WSTR(kind, data, i++, '\"'); break;
5465 case 'b': WRITE_ASCII_OR_WSTR(kind, data, i++, '\b'); break;
5466 /* FF */
5467 case 'f': WRITE_ASCII_OR_WSTR(kind, data, i++, '\014'); break;
5468 case 't': WRITE_ASCII_OR_WSTR(kind, data, i++, '\t'); break;
5469 case 'n': WRITE_ASCII_OR_WSTR(kind, data, i++, '\n'); break;
5470 case 'r': WRITE_ASCII_OR_WSTR(kind, data, i++, '\r'); break;
5471 /* VT */
5472 case 'v': WRITE_ASCII_OR_WSTR(kind, data, i++, '\013'); break;
5473 /* BEL, not classic C */
5474 case 'a': WRITE_ASCII_OR_WSTR(kind, data, i++, '\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005475
Benjamin Peterson29060642009-01-31 22:14:21 +00005476 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005477 case '0': case '1': case '2': case '3':
5478 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005479 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005480 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005481 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005482 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005483 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005484 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005485 WRITE_WSTR(data, i++, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005486 break;
5487
Benjamin Peterson29060642009-01-31 22:14:21 +00005488 /* hex escapes */
5489 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005490 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005491 digits = 2;
5492 message = "truncated \\xXX escape";
5493 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005494
Benjamin Peterson29060642009-01-31 22:14:21 +00005495 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005496 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005497 digits = 4;
5498 message = "truncated \\uXXXX escape";
5499 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005500
Benjamin Peterson29060642009-01-31 22:14:21 +00005501 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005502 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005503 digits = 8;
5504 message = "truncated \\UXXXXXXXX escape";
5505 hexescape:
5506 chr = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005507 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005508 if (s+digits>end) {
5509 endinpos = size;
5510 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005511 errors, &errorHandler,
5512 "unicodeescape", "end of string in escape sequence",
5513 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005514 &v, &i, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005515 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005516 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005517 goto nextByte;
5518 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005519 for (j = 0; j < digits; ++j) {
5520 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005521 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005522 endinpos = (s+j+1)-starts;
5523 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005524 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005525 errors, &errorHandler,
5526 "unicodeescape", message,
5527 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005528 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005529 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005530 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005531 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005532 }
5533 chr = (chr<<4) & ~0xF;
5534 if (c >= '0' && c <= '9')
5535 chr += c - '0';
5536 else if (c >= 'a' && c <= 'f')
5537 chr += 10 + c - 'a';
5538 else
5539 chr += 10 + c - 'A';
5540 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005541 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005542 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005543 /* _decoding_error will have already written into the
5544 target buffer. */
5545 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005546 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005547 /* when we get here, chr is a 32-bit unicode character */
5548 if (chr <= 0xffff)
5549 /* UCS-2 character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005550 WRITE_WSTR(data, i++, chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005551 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005552 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00005553 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00005554#ifdef Py_UNICODE_WIDE
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005555 WRITE_WSTR(data, i++, chr);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005556#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00005557 chr -= 0x10000L;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005558 WRITE_WSTR(data, i++, 0xD800 + (Py_UNICODE) (chr >> 10));
5559 WRITE_WSTR(data, i++, 0xDC00 + (Py_UNICODE) (chr & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005560#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00005561 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005562 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005563 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005564 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005565 errors, &errorHandler,
5566 "unicodeescape", "illegal Unicode character",
5567 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005568 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005569 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005570 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005571 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005572 break;
5573
Benjamin Peterson29060642009-01-31 22:14:21 +00005574 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005575 case 'N':
5576 message = "malformed \\N character escape";
5577 if (ucnhash_CAPI == NULL) {
5578 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005579 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5580 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005581 if (ucnhash_CAPI == NULL)
5582 goto ucnhashError;
5583 }
5584 if (*s == '{') {
5585 const char *start = s+1;
5586 /* look for the closing brace */
5587 while (*s != '}' && s < end)
5588 s++;
5589 if (s > start && s < end && *s == '}') {
5590 /* found a name. look it up in the unicode database */
5591 message = "unknown Unicode character name";
5592 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005593 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
5594 &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005595 goto store;
5596 }
5597 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005598 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005599 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005600 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005601 errors, &errorHandler,
5602 "unicodeescape", message,
5603 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005604 &v, &i, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005605 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005606 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005607 break;
5608
5609 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005610 if (s > end) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005611 assert(kind == PyUnicode_WCHAR_KIND);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005612 message = "\\ at end of string";
5613 s--;
5614 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005615 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005616 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005617 errors, &errorHandler,
5618 "unicodeescape", message,
5619 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005620 &v, &i, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00005621 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005622 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005623 }
5624 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005625 WRITE_ASCII_OR_WSTR(kind, data, i++, '\\');
5626 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005627 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005628 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005629 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005630 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005631 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005632 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005633 /* Ensure the length prediction worked in case of ASCII strings */
5634 assert(kind == PyUnicode_WCHAR_KIND || i == ascii_length);
5635
Victor Stinnerfe226c02011-10-03 03:52:20 +02005636 if (kind == PyUnicode_WCHAR_KIND)
5637 {
5638 if (PyUnicode_Resize((PyObject**)&v, i) < 0)
5639 goto onError;
Victor Stinnerfe226c02011-10-03 03:52:20 +02005640 }
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005641 Py_XDECREF(errorHandler);
5642 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02005643#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005644 if (_PyUnicode_READY_REPLACE(&v)) {
5645 Py_DECREF(v);
5646 return NULL;
5647 }
Victor Stinner17efeed2011-10-04 20:05:46 +02005648#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00005649 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00005650
Benjamin Peterson29060642009-01-31 22:14:21 +00005651 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005652 PyErr_SetString(
5653 PyExc_UnicodeError,
5654 "\\N escapes not supported (can't load unicodedata module)"
5655 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005656 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005657 Py_XDECREF(errorHandler);
5658 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005659 return NULL;
5660
Benjamin Peterson29060642009-01-31 22:14:21 +00005661 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005662 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005663 Py_XDECREF(errorHandler);
5664 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005665 return NULL;
5666}
5667
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005668#undef WRITE_ASCII_OR_WSTR
5669#undef WRITE_WSTR
5670
Guido van Rossumd57fd912000-03-10 22:53:23 +00005671/* Return a Unicode-Escape string version of the Unicode object.
5672
5673 If quotes is true, the string is enclosed in u"" or u'' quotes as
5674 appropriate.
5675
5676*/
5677
Walter Dörwald79e913e2007-05-12 11:08:06 +00005678static const char *hexdigits = "0123456789abcdef";
5679
Alexander Belopolsky40018472011-02-26 01:02:56 +00005680PyObject *
5681PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005682 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005683{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005684 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005685 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005686
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005687#ifdef Py_UNICODE_WIDE
5688 const Py_ssize_t expandsize = 10;
5689#else
5690 const Py_ssize_t expandsize = 6;
5691#endif
5692
Thomas Wouters89f507f2006-12-13 04:49:30 +00005693 /* XXX(nnorwitz): rather than over-allocating, it would be
5694 better to choose a different scheme. Perhaps scan the
5695 first N-chars of the string and allocate based on that size.
5696 */
5697 /* Initial allocation is based on the longest-possible unichr
5698 escape.
5699
5700 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5701 unichr, so in this case it's the longest unichr escape. In
5702 narrow (UTF-16) builds this is five chars per source unichr
5703 since there are two unichrs in the surrogate pair, so in narrow
5704 (UTF-16) builds it's not the longest unichr escape.
5705
5706 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5707 so in the narrow (UTF-16) build case it's the longest unichr
5708 escape.
5709 */
5710
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005711 if (size == 0)
5712 return PyBytes_FromStringAndSize(NULL, 0);
5713
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005714 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005715 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005716
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005717 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005718 2
5719 + expandsize*size
5720 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005721 if (repr == NULL)
5722 return NULL;
5723
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005724 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005725
Guido van Rossumd57fd912000-03-10 22:53:23 +00005726 while (size-- > 0) {
5727 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005728
Walter Dörwald79e913e2007-05-12 11:08:06 +00005729 /* Escape backslashes */
5730 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005731 *p++ = '\\';
5732 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005733 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005734 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005735
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00005736#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005737 /* Map 21-bit characters to '\U00xxxxxx' */
5738 else if (ch >= 0x10000) {
5739 *p++ = '\\';
5740 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005741 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
5742 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
5743 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
5744 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
5745 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
5746 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
5747 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
5748 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005749 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005750 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005751#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005752 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5753 else if (ch >= 0xD800 && ch < 0xDC00) {
5754 Py_UNICODE ch2;
5755 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00005756
Benjamin Peterson29060642009-01-31 22:14:21 +00005757 ch2 = *s++;
5758 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005759 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005760 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5761 *p++ = '\\';
5762 *p++ = 'U';
5763 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
5764 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
5765 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
5766 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
5767 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
5768 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
5769 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
5770 *p++ = hexdigits[ucs & 0x0000000F];
5771 continue;
5772 }
5773 /* Fall through: isolated surrogates are copied as-is */
5774 s--;
5775 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005776 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005777#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005778
Guido van Rossumd57fd912000-03-10 22:53:23 +00005779 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005780 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005781 *p++ = '\\';
5782 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005783 *p++ = hexdigits[(ch >> 12) & 0x000F];
5784 *p++ = hexdigits[(ch >> 8) & 0x000F];
5785 *p++ = hexdigits[(ch >> 4) & 0x000F];
5786 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005787 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005788
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005789 /* Map special whitespace to '\t', \n', '\r' */
5790 else if (ch == '\t') {
5791 *p++ = '\\';
5792 *p++ = 't';
5793 }
5794 else if (ch == '\n') {
5795 *p++ = '\\';
5796 *p++ = 'n';
5797 }
5798 else if (ch == '\r') {
5799 *p++ = '\\';
5800 *p++ = 'r';
5801 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005802
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005803 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005804 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005805 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005806 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005807 *p++ = hexdigits[(ch >> 4) & 0x000F];
5808 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005809 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005810
Guido van Rossumd57fd912000-03-10 22:53:23 +00005811 /* Copy everything else as-is */
5812 else
5813 *p++ = (char) ch;
5814 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005815
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005816 assert(p - PyBytes_AS_STRING(repr) > 0);
5817 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5818 return NULL;
5819 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005820}
5821
Alexander Belopolsky40018472011-02-26 01:02:56 +00005822PyObject *
5823PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005824{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005825 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005826 if (!PyUnicode_Check(unicode)) {
5827 PyErr_BadArgument();
5828 return NULL;
5829 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00005830 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5831 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005832 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005833}
5834
5835/* --- Raw Unicode Escape Codec ------------------------------------------- */
5836
Alexander Belopolsky40018472011-02-26 01:02:56 +00005837PyObject *
5838PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005839 Py_ssize_t size,
5840 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005841{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005842 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005843 Py_ssize_t startinpos;
5844 Py_ssize_t endinpos;
5845 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005846 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005847 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005848 const char *end;
5849 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005850 PyObject *errorHandler = NULL;
5851 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005852
Guido van Rossumd57fd912000-03-10 22:53:23 +00005853 /* Escaped strings will always be longer than the resulting
5854 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005855 length after conversion to the true value. (But decoding error
5856 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005857 v = _PyUnicode_New(size);
5858 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005859 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005860 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005861 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005862 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005863 end = s + size;
5864 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005865 unsigned char c;
5866 Py_UCS4 x;
5867 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005868 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005869
Benjamin Peterson29060642009-01-31 22:14:21 +00005870 /* Non-escape characters are interpreted as Unicode ordinals */
5871 if (*s != '\\') {
5872 *p++ = (unsigned char)*s++;
5873 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005874 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005875 startinpos = s-starts;
5876
5877 /* \u-escapes are only interpreted iff the number of leading
5878 backslashes if odd */
5879 bs = s;
5880 for (;s < end;) {
5881 if (*s != '\\')
5882 break;
5883 *p++ = (unsigned char)*s++;
5884 }
5885 if (((s - bs) & 1) == 0 ||
5886 s >= end ||
5887 (*s != 'u' && *s != 'U')) {
5888 continue;
5889 }
5890 p--;
5891 count = *s=='u' ? 4 : 8;
5892 s++;
5893
5894 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
5895 outpos = p-PyUnicode_AS_UNICODE(v);
5896 for (x = 0, i = 0; i < count; ++i, ++s) {
5897 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00005898 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005899 endinpos = s-starts;
5900 if (unicode_decode_call_errorhandler(
5901 errors, &errorHandler,
5902 "rawunicodeescape", "truncated \\uXXXX",
5903 &starts, &end, &startinpos, &endinpos, &exc, &s,
5904 &v, &outpos, &p))
5905 goto onError;
5906 goto nextByte;
5907 }
5908 x = (x<<4) & ~0xF;
5909 if (c >= '0' && c <= '9')
5910 x += c - '0';
5911 else if (c >= 'a' && c <= 'f')
5912 x += 10 + c - 'a';
5913 else
5914 x += 10 + c - 'A';
5915 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00005916 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00005917 /* UCS-2 character */
5918 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005919 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005920 /* UCS-4 character. Either store directly, or as
5921 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00005922#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005923 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005924#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005925 x -= 0x10000L;
5926 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
5927 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00005928#endif
5929 } else {
5930 endinpos = s-starts;
5931 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005932 if (unicode_decode_call_errorhandler(
5933 errors, &errorHandler,
5934 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00005935 &starts, &end, &startinpos, &endinpos, &exc, &s,
5936 &v, &outpos, &p))
5937 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005938 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005939 nextByte:
5940 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005941 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02005942 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005943 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005944 Py_XDECREF(errorHandler);
5945 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02005946#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005947 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005948 Py_DECREF(v);
5949 return NULL;
5950 }
Victor Stinner17efeed2011-10-04 20:05:46 +02005951#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00005952 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00005953
Benjamin Peterson29060642009-01-31 22:14:21 +00005954 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005955 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005956 Py_XDECREF(errorHandler);
5957 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005958 return NULL;
5959}
5960
Alexander Belopolsky40018472011-02-26 01:02:56 +00005961PyObject *
5962PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005963 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005964{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005965 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005966 char *p;
5967 char *q;
5968
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005969#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005970 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005971#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005972 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005973#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00005974
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005975 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005976 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00005977
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005978 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005979 if (repr == NULL)
5980 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005981 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005982 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005983
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005984 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005985 while (size-- > 0) {
5986 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005987#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005988 /* Map 32-bit characters to '\Uxxxxxxxx' */
5989 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005990 *p++ = '\\';
5991 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00005992 *p++ = hexdigits[(ch >> 28) & 0xf];
5993 *p++ = hexdigits[(ch >> 24) & 0xf];
5994 *p++ = hexdigits[(ch >> 20) & 0xf];
5995 *p++ = hexdigits[(ch >> 16) & 0xf];
5996 *p++ = hexdigits[(ch >> 12) & 0xf];
5997 *p++ = hexdigits[(ch >> 8) & 0xf];
5998 *p++ = hexdigits[(ch >> 4) & 0xf];
5999 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006000 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006001 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00006002#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006003 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
6004 if (ch >= 0xD800 && ch < 0xDC00) {
6005 Py_UNICODE ch2;
6006 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006007
Benjamin Peterson29060642009-01-31 22:14:21 +00006008 ch2 = *s++;
6009 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00006010 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006011 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
6012 *p++ = '\\';
6013 *p++ = 'U';
6014 *p++ = hexdigits[(ucs >> 28) & 0xf];
6015 *p++ = hexdigits[(ucs >> 24) & 0xf];
6016 *p++ = hexdigits[(ucs >> 20) & 0xf];
6017 *p++ = hexdigits[(ucs >> 16) & 0xf];
6018 *p++ = hexdigits[(ucs >> 12) & 0xf];
6019 *p++ = hexdigits[(ucs >> 8) & 0xf];
6020 *p++ = hexdigits[(ucs >> 4) & 0xf];
6021 *p++ = hexdigits[ucs & 0xf];
6022 continue;
6023 }
6024 /* Fall through: isolated surrogates are copied as-is */
6025 s--;
6026 size++;
6027 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006028#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00006029 /* Map 16-bit characters to '\uxxxx' */
6030 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006031 *p++ = '\\';
6032 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00006033 *p++ = hexdigits[(ch >> 12) & 0xf];
6034 *p++ = hexdigits[(ch >> 8) & 0xf];
6035 *p++ = hexdigits[(ch >> 4) & 0xf];
6036 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006037 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006038 /* Copy everything else as-is */
6039 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006040 *p++ = (char) ch;
6041 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006042 size = p - q;
6043
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006044 assert(size > 0);
6045 if (_PyBytes_Resize(&repr, size) < 0)
6046 return NULL;
6047 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006048}
6049
Alexander Belopolsky40018472011-02-26 01:02:56 +00006050PyObject *
6051PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006052{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006053 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006054 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00006055 PyErr_BadArgument();
6056 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006057 }
Walter Dörwald711005d2007-05-12 12:03:26 +00006058 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
6059 PyUnicode_GET_SIZE(unicode));
6060
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006061 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006062}
6063
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006064/* --- Unicode Internal Codec ------------------------------------------- */
6065
Alexander Belopolsky40018472011-02-26 01:02:56 +00006066PyObject *
6067_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006068 Py_ssize_t size,
6069 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006070{
6071 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006072 Py_ssize_t startinpos;
6073 Py_ssize_t endinpos;
6074 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006075 PyUnicodeObject *v;
6076 Py_UNICODE *p;
6077 const char *end;
6078 const char *reason;
6079 PyObject *errorHandler = NULL;
6080 PyObject *exc = NULL;
6081
Neal Norwitzd43069c2006-01-08 01:12:10 +00006082#ifdef Py_UNICODE_WIDE
6083 Py_UNICODE unimax = PyUnicode_GetMax();
6084#endif
6085
Thomas Wouters89f507f2006-12-13 04:49:30 +00006086 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006087 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
6088 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006089 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006090 /* Intentionally PyUnicode_GET_SIZE instead of PyUnicode_GET_LENGTH
6091 as string was created with the old API. */
6092 if (PyUnicode_GET_SIZE(v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006093 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006094 p = PyUnicode_AS_UNICODE(v);
6095 end = s + size;
6096
6097 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006098 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006099 /* We have to sanity check the raw data, otherwise doom looms for
6100 some malformed UCS-4 data. */
6101 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006102#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006103 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006104#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006105 end-s < Py_UNICODE_SIZE
6106 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006107 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006108 startinpos = s - starts;
6109 if (end-s < Py_UNICODE_SIZE) {
6110 endinpos = end-starts;
6111 reason = "truncated input";
6112 }
6113 else {
6114 endinpos = s - starts + Py_UNICODE_SIZE;
6115 reason = "illegal code point (> 0x10FFFF)";
6116 }
6117 outpos = p - PyUnicode_AS_UNICODE(v);
6118 if (unicode_decode_call_errorhandler(
6119 errors, &errorHandler,
6120 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006121 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00006122 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006123 goto onError;
6124 }
6125 }
6126 else {
6127 p++;
6128 s += Py_UNICODE_SIZE;
6129 }
6130 }
6131
Victor Stinnerfe226c02011-10-03 03:52:20 +02006132 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006133 goto onError;
6134 Py_XDECREF(errorHandler);
6135 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02006136#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006137 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006138 Py_DECREF(v);
6139 return NULL;
6140 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006141#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006142 return (PyObject *)v;
6143
Benjamin Peterson29060642009-01-31 22:14:21 +00006144 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006145 Py_XDECREF(v);
6146 Py_XDECREF(errorHandler);
6147 Py_XDECREF(exc);
6148 return NULL;
6149}
6150
Guido van Rossumd57fd912000-03-10 22:53:23 +00006151/* --- Latin-1 Codec ------------------------------------------------------ */
6152
Alexander Belopolsky40018472011-02-26 01:02:56 +00006153PyObject *
6154PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006155 Py_ssize_t size,
6156 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006157{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006158 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006159 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006160}
6161
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006162/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006163static void
6164make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006165 const char *encoding,
6166 const Py_UNICODE *unicode, Py_ssize_t size,
6167 Py_ssize_t startpos, Py_ssize_t endpos,
6168 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006169{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006170 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006171 *exceptionObject = PyUnicodeEncodeError_Create(
6172 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006173 }
6174 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006175 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6176 goto onError;
6177 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6178 goto onError;
6179 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6180 goto onError;
6181 return;
6182 onError:
6183 Py_DECREF(*exceptionObject);
6184 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006185 }
6186}
6187
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006188/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006189static void
6190raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006191 const char *encoding,
6192 const Py_UNICODE *unicode, Py_ssize_t size,
6193 Py_ssize_t startpos, Py_ssize_t endpos,
6194 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006195{
6196 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006197 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006198 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006199 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006200}
6201
6202/* error handling callback helper:
6203 build arguments, call the callback and check the arguments,
6204 put the result into newpos and return the replacement string, which
6205 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006206static PyObject *
6207unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006208 PyObject **errorHandler,
6209 const char *encoding, const char *reason,
6210 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
6211 Py_ssize_t startpos, Py_ssize_t endpos,
6212 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006213{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006214 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006215
6216 PyObject *restuple;
6217 PyObject *resunicode;
6218
6219 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006220 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006221 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006222 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006223 }
6224
6225 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006226 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006227 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006228 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006229
6230 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006231 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006232 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006233 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006234 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006235 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006236 Py_DECREF(restuple);
6237 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006238 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006239 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006240 &resunicode, newpos)) {
6241 Py_DECREF(restuple);
6242 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006243 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006244 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6245 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6246 Py_DECREF(restuple);
6247 return NULL;
6248 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006249 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006250 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006251 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006252 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6253 Py_DECREF(restuple);
6254 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006255 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006256 Py_INCREF(resunicode);
6257 Py_DECREF(restuple);
6258 return resunicode;
6259}
6260
/* Shared encoder behind the Latin-1 and ASCII codecs.

   Copies each of the SIZE code units at P whose value is below LIMIT into
   a bytes object, one byte per unit.  LIMIT == 256 selects the codec name
   "latin-1" for error reporting, any other value selects "ascii".

   A run of consecutive units >= LIMIT is handled according to ERRORS:
     NULL / "strict"      raise UnicodeEncodeError
     "replace"            write one '?' per unit of the run
     "ignore"             drop the run
     "xmlcharrefreplace"  write "&#N;" for each unit of the run
     anything else        call the registered handler; a bytes result is
                          copied to the output as is, a str result is
                          copied unit by unit and must itself be below
                          LIMIT, otherwise UnicodeEncodeError is raised.

   Returns a new bytes object, or NULL with an exception set. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006261static PyObject *
6262unicode_encode_ucs1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006263 Py_ssize_t size,
6264 const char *errors,
6265 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006266{
6267 /* output object */
6268 PyObject *res;
6269 /* pointers to the beginning and end+1 of input */
6270 const Py_UNICODE *startp = p;
6271 const Py_UNICODE *endp = p + size;
6272 /* pointer to the beginning of the unencodable characters */
6273 /* const Py_UNICODE *badp = NULL; */
6274 /* pointer into the output */
6275 char *str;
6276 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006277 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006278 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6279 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006280 PyObject *errorHandler = NULL;
6281 PyObject *exc = NULL;
6282 /* the following variable is used for caching string comparisons
6283 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6284 int known_errorHandler = -1;
6285
6286 /* allocate enough for a simple encoding without
6287 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006288 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006289 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006290 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006291 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006292 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006293 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006294 ressize = size;
6295
6296 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006297 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006298
Benjamin Peterson29060642009-01-31 22:14:21 +00006299 /* can we encode this? */
6300 if (c<limit) {
6301 /* no overflow check, because we know that the space is enough */
6302 *str++ = (char)c;
6303 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006304 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006305 else {
6306 Py_ssize_t unicodepos = p-startp;
6307 Py_ssize_t requiredsize;
6308 PyObject *repunicode;
6309 Py_ssize_t repsize;
6310 Py_ssize_t newpos;
6311 Py_ssize_t respos;
6312 Py_UNICODE *uni2;
6313 /* startpos for collecting unencodable chars */
6314 const Py_UNICODE *collstart = p;
6315 const Py_UNICODE *collend = p;
6316 /* find all unencodable characters */
6317 while ((collend < endp) && ((*collend)>=limit))
6318 ++collend;
6319 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6320 if (known_errorHandler==-1) {
6321 if ((errors==NULL) || (!strcmp(errors, "strict")))
6322 known_errorHandler = 1;
6323 else if (!strcmp(errors, "replace"))
6324 known_errorHandler = 2;
6325 else if (!strcmp(errors, "ignore"))
6326 known_errorHandler = 3;
6327 else if (!strcmp(errors, "xmlcharrefreplace"))
6328 known_errorHandler = 4;
6329 else
6330 known_errorHandler = 0;
6331 }
6332 switch (known_errorHandler) {
6333 case 1: /* strict */
6334 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
6335 goto onError;
6336 case 2: /* replace */
/* one '?' per unit of the run; the buffer already has a byte reserved
   for each input unit, so no resize is needed here */
6337 while (collstart++<collend)
6338 *str++ = '?'; /* fall through */
6339 case 3: /* ignore */
6340 p = collend;
6341 break;
6342 case 4: /* xmlcharrefreplace */
6343 respos = str - PyBytes_AS_STRING(res);
6344 /* determine replacement size (temporarily (mis)uses p) */
/* each reference is "&#" + decimal digits + ";", hence 2 + ndigits + 1 */
6345 for (p = collstart, repsize = 0; p < collend; ++p) {
6346 if (*p<10)
6347 repsize += 2+1+1;
6348 else if (*p<100)
6349 repsize += 2+2+1;
6350 else if (*p<1000)
6351 repsize += 2+3+1;
6352 else if (*p<10000)
6353 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006354#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006355 else
6356 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006357#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006358 else if (*p<100000)
6359 repsize += 2+5+1;
6360 else if (*p<1000000)
6361 repsize += 2+6+1;
6362 else
6363 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00006364#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00006365 }
/* room needed: what is already written + the references + one byte for
   every input unit after the run; grow to at least double when resizing */
6366 requiredsize = respos+repsize+(endp-collend);
6367 if (requiredsize > ressize) {
6368 if (requiredsize<2*ressize)
6369 requiredsize = 2*ressize;
6370 if (_PyBytes_Resize(&res, requiredsize))
6371 goto onError;
/* the resize may have moved the buffer: re-derive the write pointer */
6372 str = PyBytes_AS_STRING(res) + respos;
6373 ressize = requiredsize;
6374 }
6375 /* generate replacement (temporarily (mis)uses p) */
6376 for (p = collstart; p < collend; ++p) {
6377 str += sprintf(str, "&#%d;", (int)*p);
6378 }
6379 p = collend;
6380 break;
6381 default:
/* any other handler name: call the registered error callback */
6382 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6383 encoding, reason, startp, size, &exc,
6384 collstart-startp, collend-startp, &newpos);
6385 if (repunicode == NULL)
6386 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006387 if (PyBytes_Check(repunicode)) {
6388 /* Directly copy bytes result to output. */
6389 repsize = PyBytes_Size(repunicode);
6390 if (repsize > 1) {
6391 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006392 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006393 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6394 Py_DECREF(repunicode);
6395 goto onError;
6396 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006397 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006398 ressize += repsize-1;
6399 }
6400 memcpy(str, PyBytes_AsString(repunicode), repsize);
6401 str += repsize;
/* continue encoding at the position chosen by the handler */
6402 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006403 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006404 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006405 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006406 /* need more space? (at least enough for what we
6407 have+the replacement+the rest of the string, so
6408 we won't have to check space for encodable characters) */
6409 respos = str - PyBytes_AS_STRING(res);
6410 repsize = PyUnicode_GET_SIZE(repunicode);
6411 requiredsize = respos+repsize+(endp-collend);
6412 if (requiredsize > ressize) {
6413 if (requiredsize<2*ressize)
6414 requiredsize = 2*ressize;
6415 if (_PyBytes_Resize(&res, requiredsize)) {
6416 Py_DECREF(repunicode);
6417 goto onError;
6418 }
6419 str = PyBytes_AS_STRING(res) + respos;
6420 ressize = requiredsize;
6421 }
6422 /* check if there is anything unencodable in the replacement
6423 and copy it to the output */
6424 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
6425 c = *uni2;
6426 if (c >= limit) {
/* the error is reported against the first unit of the original run */
6427 raise_encode_exception(&exc, encoding, startp, size,
6428 unicodepos, unicodepos+1, reason);
6429 Py_DECREF(repunicode);
6430 goto onError;
6431 }
6432 *str = (char)c;
6433 }
6434 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006435 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006436 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006437 }
6438 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006439 /* Resize if we allocated too much */
6440 size = str - PyBytes_AS_STRING(res);
6441 if (size < ressize) { /* If this fails res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006442 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006443 if (_PyBytes_Resize(&res, size) < 0)
6444 goto onError;
6445 }
6446
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006447 Py_XDECREF(errorHandler);
6448 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006449 return res;
6450
6451 onError:
6452 Py_XDECREF(res);
6453 Py_XDECREF(errorHandler);
6454 Py_XDECREF(exc);
6455 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006456}
6457
Alexander Belopolsky40018472011-02-26 01:02:56 +00006458PyObject *
6459PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006460 Py_ssize_t size,
6461 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006462{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006463 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006464}
6465
Alexander Belopolsky40018472011-02-26 01:02:56 +00006466PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006467_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006468{
6469 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006470 PyErr_BadArgument();
6471 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006472 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006473 if (PyUnicode_READY(unicode) == -1)
6474 return NULL;
6475 /* Fast path: if it is a one-byte string, construct
6476 bytes object directly. */
6477 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6478 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6479 PyUnicode_GET_LENGTH(unicode));
6480 /* Non-Latin-1 characters present. Defer to above function to
6481 raise the exception. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006482 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006483 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006484 errors);
6485}
6486
6487PyObject*
6488PyUnicode_AsLatin1String(PyObject *unicode)
6489{
6490 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006491}
6492
6493/* --- 7-bit ASCII Codec -------------------------------------------------- */
6494
Alexander Belopolsky40018472011-02-26 01:02:56 +00006495PyObject *
6496PyUnicode_DecodeASCII(const char *s,
6497 Py_ssize_t size,
6498 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006499{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006500 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006501 PyUnicodeObject *v;
Victor Stinner702c7342011-10-05 13:50:52 +02006502 Py_UNICODE *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006503 Py_ssize_t startinpos;
6504 Py_ssize_t endinpos;
6505 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006506 const char *e;
Victor Stinner702c7342011-10-05 13:50:52 +02006507 int has_error;
6508 const unsigned char *p = (const unsigned char *)s;
6509 const unsigned char *end = p + size;
6510 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006511 PyObject *errorHandler = NULL;
6512 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006513
Guido van Rossumd57fd912000-03-10 22:53:23 +00006514 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006515 if (size == 1 && (unsigned char)s[0] < 128)
6516 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006517
Victor Stinner702c7342011-10-05 13:50:52 +02006518 has_error = 0;
6519 while (p < end && !has_error) {
6520 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
6521 an explanation. */
6522 if (!((size_t) p & LONG_PTR_MASK)) {
6523 /* Help register allocation */
6524 register const unsigned char *_p = p;
6525 while (_p < aligned_end) {
6526 unsigned long value = *(unsigned long *) _p;
6527 if (value & ASCII_CHAR_MASK) {
6528 has_error = 1;
6529 break;
6530 }
6531 _p += SIZEOF_LONG;
6532 }
6533 if (_p == end)
6534 break;
6535 if (has_error)
6536 break;
6537 p = _p;
6538 }
6539 if (*p & 0x80) {
6540 has_error = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006541 break;
Victor Stinner702c7342011-10-05 13:50:52 +02006542 }
6543 else {
6544 ++p;
6545 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006546 }
Victor Stinner702c7342011-10-05 13:50:52 +02006547 if (!has_error)
6548 return unicode_fromascii((const unsigned char *)s, size);
Tim Petersced69f82003-09-16 20:30:58 +00006549
Guido van Rossumd57fd912000-03-10 22:53:23 +00006550 v = _PyUnicode_New(size);
6551 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006552 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006553 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006554 return (PyObject *)v;
Victor Stinner702c7342011-10-05 13:50:52 +02006555 u = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006556 e = s + size;
6557 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006558 register unsigned char c = (unsigned char)*s;
6559 if (c < 128) {
Victor Stinner702c7342011-10-05 13:50:52 +02006560 *u++ = c;
Benjamin Peterson29060642009-01-31 22:14:21 +00006561 ++s;
6562 }
6563 else {
6564 startinpos = s-starts;
6565 endinpos = startinpos + 1;
Victor Stinner702c7342011-10-05 13:50:52 +02006566 outpos = u - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006567 if (unicode_decode_call_errorhandler(
6568 errors, &errorHandler,
6569 "ascii", "ordinal not in range(128)",
6570 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinner702c7342011-10-05 13:50:52 +02006571 &v, &outpos, &u))
Benjamin Peterson29060642009-01-31 22:14:21 +00006572 goto onError;
6573 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006574 }
Victor Stinner702c7342011-10-05 13:50:52 +02006575 if (u - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
6576 if (PyUnicode_Resize((PyObject**)&v, u - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006577 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006578 Py_XDECREF(errorHandler);
6579 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02006580#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006581 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006582 Py_DECREF(v);
6583 return NULL;
6584 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006585#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00006586 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006587
Benjamin Peterson29060642009-01-31 22:14:21 +00006588 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006589 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006590 Py_XDECREF(errorHandler);
6591 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006592 return NULL;
6593}
6594
Alexander Belopolsky40018472011-02-26 01:02:56 +00006595PyObject *
6596PyUnicode_EncodeASCII(const Py_UNICODE *p,
6597 Py_ssize_t size,
6598 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006599{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006600 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006601}
6602
Alexander Belopolsky40018472011-02-26 01:02:56 +00006603PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006604_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006605{
6606 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006607 PyErr_BadArgument();
6608 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006609 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006610 if (PyUnicode_READY(unicode) == -1)
6611 return NULL;
6612 /* Fast path: if it is an ASCII-only string, construct bytes object
6613 directly. Else defer to above function to raise the exception. */
6614 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6615 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6616 PyUnicode_GET_LENGTH(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006617 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006618 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006619 errors);
6620}
6621
6622PyObject *
6623PyUnicode_AsASCIIString(PyObject *unicode)
6624{
6625 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006626}
6627
Victor Stinner99b95382011-07-04 14:23:54 +02006628#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006629
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006630/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006631
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006632#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006633#define NEED_RETRY
6634#endif
6635
6636/* XXX This code is limited to "true" double-byte encodings, as
6637 a) it assumes an incomplete character consists of a single byte, and
6638 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00006639 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006640
/* Return non-zero if the byte at s[offset] is a DBCS lead byte that starts a
   character (as opposed to being the trail byte of the previous character),
   0 otherwise.  `s` is the start of the buffer; it is handed to CharPrev()
   as the lower bound when stepping back one character. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *candidate = s + offset;
    const char *before;

    if (!IsDBCSLeadByte(*candidate))
        return 0;

    before = CharPrev(s, candidate);
    /* CharPrev() did not move: nothing precedes the candidate. */
    if (before == candidate)
        return 1;
    /* The preceding character does not begin with a lead byte. */
    if (!IsDBCSLeadByte(*before))
        return 1;
    /* The preceding character is a complete two-byte character. */
    return candidate - before == 2;
}
6652
6653/*
6654 * Decode MBCS string into unicode object. If 'final' is set, converts
6655 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
6656 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006657static int
6658decode_mbcs(PyUnicodeObject **v,
6659 const char *s, /* MBCS string */
6660 int size, /* sizeof MBCS string */
6661 int final,
6662 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006663{
6664 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00006665 Py_ssize_t n;
6666 DWORD usize;
6667 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006668
6669 assert(size >= 0);
6670
Victor Stinner554f3f02010-06-16 23:33:54 +00006671 /* check and handle 'errors' arg */
6672 if (errors==NULL || strcmp(errors, "strict")==0)
6673 flags = MB_ERR_INVALID_CHARS;
6674 else if (strcmp(errors, "ignore")==0)
6675 flags = 0;
6676 else {
6677 PyErr_Format(PyExc_ValueError,
6678 "mbcs encoding does not support errors='%s'",
6679 errors);
6680 return -1;
6681 }
6682
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006683 /* Skip trailing lead-byte unless 'final' is set */
6684 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006685 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006686
6687 /* First get the size of the result */
6688 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006689 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
6690 if (usize==0)
6691 goto mbcs_decode_error;
6692 } else
6693 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006694
6695 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006696 /* Create unicode object */
6697 *v = _PyUnicode_New(usize);
6698 if (*v == NULL)
6699 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006700 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006701 }
6702 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006703 /* Extend unicode object */
6704 n = PyUnicode_GET_SIZE(*v);
Victor Stinner2fd82272011-10-03 04:06:05 +02006705 if (PyUnicode_Resize((PyObject**)v, n + usize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006706 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006707 }
6708
6709 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00006710 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006711 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006712 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
6713 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00006714 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006715 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006716 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00006717
6718mbcs_decode_error:
6719 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
6720 we raise a UnicodeDecodeError - else it is a 'generic'
6721 windows error
6722 */
6723 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
6724 /* Ideally, we should get reason from FormatMessage - this
6725 is the Windows 2000 English version of the message
6726 */
6727 PyObject *exc = NULL;
6728 const char *reason = "No mapping for the Unicode character exists "
6729 "in the target multi-byte code page.";
6730 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
6731 if (exc != NULL) {
6732 PyCodec_StrictErrors(exc);
6733 Py_DECREF(exc);
6734 }
6735 } else {
6736 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6737 }
6738 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006739}
6740
Alexander Belopolsky40018472011-02-26 01:02:56 +00006741PyObject *
6742PyUnicode_DecodeMBCSStateful(const char *s,
6743 Py_ssize_t size,
6744 const char *errors,
6745 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006746{
6747 PyUnicodeObject *v = NULL;
6748 int done;
6749
6750 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006751 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006752
6753#ifdef NEED_RETRY
6754 retry:
6755 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006756 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006757 else
6758#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006759 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006760
6761 if (done < 0) {
6762 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006763 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006764 }
6765
6766 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006767 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006768
6769#ifdef NEED_RETRY
6770 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006771 s += done;
6772 size -= done;
6773 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006774 }
6775#endif
Victor Stinner17efeed2011-10-04 20:05:46 +02006776#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006777 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006778 Py_DECREF(v);
6779 return NULL;
6780 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006781#endif
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006782 return (PyObject *)v;
6783}
6784
Alexander Belopolsky40018472011-02-26 01:02:56 +00006785PyObject *
6786PyUnicode_DecodeMBCS(const char *s,
6787 Py_ssize_t size,
6788 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006789{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006790 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
6791}
6792
6793/*
6794 * Convert unicode into string object (MBCS).
6795 * Returns 0 if succeed, -1 otherwise.
6796 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006797static int
6798encode_mbcs(PyObject **repr,
6799 const Py_UNICODE *p, /* unicode */
6800 int size, /* size of unicode */
6801 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006802{
Victor Stinner554f3f02010-06-16 23:33:54 +00006803 BOOL usedDefaultChar = FALSE;
6804 BOOL *pusedDefaultChar;
6805 int mbcssize;
6806 Py_ssize_t n;
6807 PyObject *exc = NULL;
6808 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006809
6810 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006811
Victor Stinner554f3f02010-06-16 23:33:54 +00006812 /* check and handle 'errors' arg */
6813 if (errors==NULL || strcmp(errors, "strict")==0) {
6814 flags = WC_NO_BEST_FIT_CHARS;
6815 pusedDefaultChar = &usedDefaultChar;
6816 } else if (strcmp(errors, "replace")==0) {
6817 flags = 0;
6818 pusedDefaultChar = NULL;
6819 } else {
6820 PyErr_Format(PyExc_ValueError,
6821 "mbcs encoding does not support errors='%s'",
6822 errors);
6823 return -1;
6824 }
6825
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006826 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006827 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006828 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
6829 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00006830 if (mbcssize == 0) {
6831 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6832 return -1;
6833 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006834 /* If we used a default char, then we failed! */
6835 if (pusedDefaultChar && *pusedDefaultChar)
6836 goto mbcs_encode_error;
6837 } else {
6838 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006839 }
6840
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006841 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006842 /* Create string object */
6843 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
6844 if (*repr == NULL)
6845 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006846 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006847 }
6848 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006849 /* Extend string object */
6850 n = PyBytes_Size(*repr);
6851 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
6852 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006853 }
6854
6855 /* Do the conversion */
6856 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006857 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006858 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
6859 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006860 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6861 return -1;
6862 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006863 if (pusedDefaultChar && *pusedDefaultChar)
6864 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006865 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006866 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00006867
6868mbcs_encode_error:
6869 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
6870 Py_XDECREF(exc);
6871 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006872}
6873
Alexander Belopolsky40018472011-02-26 01:02:56 +00006874PyObject *
6875PyUnicode_EncodeMBCS(const Py_UNICODE *p,
6876 Py_ssize_t size,
6877 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006878{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006879 PyObject *repr = NULL;
6880 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00006881
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006882#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00006883 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006884 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006885 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006886 else
6887#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006888 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006889
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006890 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006891 Py_XDECREF(repr);
6892 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006893 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006894
6895#ifdef NEED_RETRY
6896 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006897 p += INT_MAX;
6898 size -= INT_MAX;
6899 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006900 }
6901#endif
6902
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006903 return repr;
6904}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006905
Alexander Belopolsky40018472011-02-26 01:02:56 +00006906PyObject *
6907PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006908{
6909 if (!PyUnicode_Check(unicode)) {
6910 PyErr_BadArgument();
6911 return NULL;
6912 }
6913 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006914 PyUnicode_GET_SIZE(unicode),
6915 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006916}
6917
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006918#undef NEED_RETRY
6919
Victor Stinner99b95382011-07-04 14:23:54 +02006920#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006921
Guido van Rossumd57fd912000-03-10 22:53:23 +00006922/* --- Character Mapping Codec -------------------------------------------- */
6923
Alexander Belopolsky40018472011-02-26 01:02:56 +00006924PyObject *
6925PyUnicode_DecodeCharmap(const char *s,
6926 Py_ssize_t size,
6927 PyObject *mapping,
6928 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006929{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006930 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006931 Py_ssize_t startinpos;
6932 Py_ssize_t endinpos;
6933 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006934 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006935 PyUnicodeObject *v;
6936 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006937 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006938 PyObject *errorHandler = NULL;
6939 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006940 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006941 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006942
Guido van Rossumd57fd912000-03-10 22:53:23 +00006943 /* Default to Latin-1 */
6944 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006945 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006946
6947 v = _PyUnicode_New(size);
6948 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006949 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006950 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006951 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006952 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006953 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006954 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006955 mapstring = PyUnicode_AS_UNICODE(mapping);
6956 maplen = PyUnicode_GET_SIZE(mapping);
6957 while (s < e) {
6958 unsigned char ch = *s;
6959 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006960
Benjamin Peterson29060642009-01-31 22:14:21 +00006961 if (ch < maplen)
6962 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006963
Benjamin Peterson29060642009-01-31 22:14:21 +00006964 if (x == 0xfffe) {
6965 /* undefined mapping */
6966 outpos = p-PyUnicode_AS_UNICODE(v);
6967 startinpos = s-starts;
6968 endinpos = startinpos+1;
6969 if (unicode_decode_call_errorhandler(
6970 errors, &errorHandler,
6971 "charmap", "character maps to <undefined>",
6972 &starts, &e, &startinpos, &endinpos, &exc, &s,
6973 &v, &outpos, &p)) {
6974 goto onError;
6975 }
6976 continue;
6977 }
6978 *p++ = x;
6979 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006980 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006981 }
6982 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006983 while (s < e) {
6984 unsigned char ch = *s;
6985 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006986
Benjamin Peterson29060642009-01-31 22:14:21 +00006987 /* Get mapping (char ordinal -> integer, Unicode char or None) */
6988 w = PyLong_FromLong((long)ch);
6989 if (w == NULL)
6990 goto onError;
6991 x = PyObject_GetItem(mapping, w);
6992 Py_DECREF(w);
6993 if (x == NULL) {
6994 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
6995 /* No mapping found means: mapping is undefined. */
6996 PyErr_Clear();
6997 x = Py_None;
6998 Py_INCREF(x);
6999 } else
7000 goto onError;
7001 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007002
Benjamin Peterson29060642009-01-31 22:14:21 +00007003 /* Apply mapping */
7004 if (PyLong_Check(x)) {
7005 long value = PyLong_AS_LONG(x);
7006 if (value < 0 || value > 65535) {
7007 PyErr_SetString(PyExc_TypeError,
7008 "character mapping must be in range(65536)");
7009 Py_DECREF(x);
7010 goto onError;
7011 }
7012 *p++ = (Py_UNICODE)value;
7013 }
7014 else if (x == Py_None) {
7015 /* undefined mapping */
7016 outpos = p-PyUnicode_AS_UNICODE(v);
7017 startinpos = s-starts;
7018 endinpos = startinpos+1;
7019 if (unicode_decode_call_errorhandler(
7020 errors, &errorHandler,
7021 "charmap", "character maps to <undefined>",
7022 &starts, &e, &startinpos, &endinpos, &exc, &s,
7023 &v, &outpos, &p)) {
7024 Py_DECREF(x);
7025 goto onError;
7026 }
7027 Py_DECREF(x);
7028 continue;
7029 }
7030 else if (PyUnicode_Check(x)) {
7031 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007032
Benjamin Peterson29060642009-01-31 22:14:21 +00007033 if (targetsize == 1)
7034 /* 1-1 mapping */
7035 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007036
Benjamin Peterson29060642009-01-31 22:14:21 +00007037 else if (targetsize > 1) {
7038 /* 1-n mapping */
7039 if (targetsize > extrachars) {
7040 /* resize first */
7041 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
7042 Py_ssize_t needed = (targetsize - extrachars) + \
7043 (targetsize << 2);
7044 extrachars += needed;
7045 /* XXX overflow detection missing */
Victor Stinnerfe226c02011-10-03 03:52:20 +02007046 if (PyUnicode_Resize((PyObject**)&v,
Benjamin Peterson29060642009-01-31 22:14:21 +00007047 PyUnicode_GET_SIZE(v) + needed) < 0) {
7048 Py_DECREF(x);
7049 goto onError;
7050 }
7051 p = PyUnicode_AS_UNICODE(v) + oldpos;
7052 }
7053 Py_UNICODE_COPY(p,
7054 PyUnicode_AS_UNICODE(x),
7055 targetsize);
7056 p += targetsize;
7057 extrachars -= targetsize;
7058 }
7059 /* 1-0 mapping: skip the character */
7060 }
7061 else {
7062 /* wrong return value */
7063 PyErr_SetString(PyExc_TypeError,
7064 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007065 Py_DECREF(x);
7066 goto onError;
7067 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007068 Py_DECREF(x);
7069 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007070 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007071 }
7072 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Victor Stinnerfe226c02011-10-03 03:52:20 +02007073 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007074 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007075 Py_XDECREF(errorHandler);
7076 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02007077#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02007078 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007079 Py_DECREF(v);
7080 return NULL;
7081 }
Victor Stinner17efeed2011-10-04 20:05:46 +02007082#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00007083 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00007084
Benjamin Peterson29060642009-01-31 22:14:21 +00007085 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007086 Py_XDECREF(errorHandler);
7087 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007088 Py_XDECREF(v);
7089 return NULL;
7090}
7091
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007092/* Charmap encoding: the lookup table */
7093
Alexander Belopolsky40018472011-02-26 01:02:56 +00007094struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007095 PyObject_HEAD
7096 unsigned char level1[32];
7097 int count2, count3;
7098 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007099};
7100
7101static PyObject*
7102encoding_map_size(PyObject *obj, PyObject* args)
7103{
7104 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007105 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007106 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007107}
7108
7109static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007110 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007111 PyDoc_STR("Return the size (in bytes) of this object") },
7112 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007113};
7114
7115static void
7116encoding_map_dealloc(PyObject* o)
7117{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007118 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007119}
7120
7121static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007122 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007123 "EncodingMap", /*tp_name*/
7124 sizeof(struct encoding_map), /*tp_basicsize*/
7125 0, /*tp_itemsize*/
7126 /* methods */
7127 encoding_map_dealloc, /*tp_dealloc*/
7128 0, /*tp_print*/
7129 0, /*tp_getattr*/
7130 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007131 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007132 0, /*tp_repr*/
7133 0, /*tp_as_number*/
7134 0, /*tp_as_sequence*/
7135 0, /*tp_as_mapping*/
7136 0, /*tp_hash*/
7137 0, /*tp_call*/
7138 0, /*tp_str*/
7139 0, /*tp_getattro*/
7140 0, /*tp_setattro*/
7141 0, /*tp_as_buffer*/
7142 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7143 0, /*tp_doc*/
7144 0, /*tp_traverse*/
7145 0, /*tp_clear*/
7146 0, /*tp_richcompare*/
7147 0, /*tp_weaklistoffset*/
7148 0, /*tp_iter*/
7149 0, /*tp_iternext*/
7150 encoding_map_methods, /*tp_methods*/
7151 0, /*tp_members*/
7152 0, /*tp_getset*/
7153 0, /*tp_base*/
7154 0, /*tp_dict*/
7155 0, /*tp_descr_get*/
7156 0, /*tp_descr_set*/
7157 0, /*tp_dictoffset*/
7158 0, /*tp_init*/
7159 0, /*tp_alloc*/
7160 0, /*tp_new*/
7161 0, /*tp_free*/
7162 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007163};
7164
7165PyObject*
7166PyUnicode_BuildEncodingMap(PyObject* string)
7167{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007168 PyObject *result;
7169 struct encoding_map *mresult;
7170 int i;
7171 int need_dict = 0;
7172 unsigned char level1[32];
7173 unsigned char level2[512];
7174 unsigned char *mlevel1, *mlevel2, *mlevel3;
7175 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007176 int kind;
7177 void *data;
7178 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007179
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007180 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007181 PyErr_BadArgument();
7182 return NULL;
7183 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007184 kind = PyUnicode_KIND(string);
7185 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007186 memset(level1, 0xFF, sizeof level1);
7187 memset(level2, 0xFF, sizeof level2);
7188
7189 /* If there isn't a one-to-one mapping of NULL to \0,
7190 or if there are non-BMP characters, we need to use
7191 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007192 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007193 need_dict = 1;
7194 for (i = 1; i < 256; i++) {
7195 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007196 ch = PyUnicode_READ(kind, data, i);
7197 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007198 need_dict = 1;
7199 break;
7200 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007201 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007202 /* unmapped character */
7203 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007204 l1 = ch >> 11;
7205 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007206 if (level1[l1] == 0xFF)
7207 level1[l1] = count2++;
7208 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007209 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007210 }
7211
7212 if (count2 >= 0xFF || count3 >= 0xFF)
7213 need_dict = 1;
7214
7215 if (need_dict) {
7216 PyObject *result = PyDict_New();
7217 PyObject *key, *value;
7218 if (!result)
7219 return NULL;
7220 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007221 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007222 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007223 if (!key || !value)
7224 goto failed1;
7225 if (PyDict_SetItem(result, key, value) == -1)
7226 goto failed1;
7227 Py_DECREF(key);
7228 Py_DECREF(value);
7229 }
7230 return result;
7231 failed1:
7232 Py_XDECREF(key);
7233 Py_XDECREF(value);
7234 Py_DECREF(result);
7235 return NULL;
7236 }
7237
7238 /* Create a three-level trie */
7239 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7240 16*count2 + 128*count3 - 1);
7241 if (!result)
7242 return PyErr_NoMemory();
7243 PyObject_Init(result, &EncodingMapType);
7244 mresult = (struct encoding_map*)result;
7245 mresult->count2 = count2;
7246 mresult->count3 = count3;
7247 mlevel1 = mresult->level1;
7248 mlevel2 = mresult->level23;
7249 mlevel3 = mresult->level23 + 16*count2;
7250 memcpy(mlevel1, level1, 32);
7251 memset(mlevel2, 0xFF, 16*count2);
7252 memset(mlevel3, 0, 128*count3);
7253 count3 = 0;
7254 for (i = 1; i < 256; i++) {
7255 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007256 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007257 /* unmapped character */
7258 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007259 o1 = PyUnicode_READ(kind, data, i)>>11;
7260 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007261 i2 = 16*mlevel1[o1] + o2;
7262 if (mlevel2[i2] == 0xFF)
7263 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007264 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007265 i3 = 128*mlevel2[i2] + o3;
7266 mlevel3[i3] = i;
7267 }
7268 return result;
7269}
7270
7271static int
7272encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
7273{
7274 struct encoding_map *map = (struct encoding_map*)mapping;
7275 int l1 = c>>11;
7276 int l2 = (c>>7) & 0xF;
7277 int l3 = c & 0x7F;
7278 int i;
7279
7280#ifdef Py_UNICODE_WIDE
7281 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007282 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007283 }
7284#endif
7285 if (c == 0)
7286 return 0;
7287 /* level 1*/
7288 i = map->level1[l1];
7289 if (i == 0xFF) {
7290 return -1;
7291 }
7292 /* level 2*/
7293 i = map->level23[16*i+l2];
7294 if (i == 0xFF) {
7295 return -1;
7296 }
7297 /* level 3 */
7298 i = map->level23[16*map->count2 + 128*i + l3];
7299 if (i == 0) {
7300 return -1;
7301 }
7302 return i;
7303}
7304
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007305/* Lookup the character ch in the mapping. If the character
7306 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007307 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007308static PyObject *
7309charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007310{
Christian Heimes217cfd12007-12-02 14:31:20 +00007311 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007312 PyObject *x;
7313
7314 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007315 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007316 x = PyObject_GetItem(mapping, w);
7317 Py_DECREF(w);
7318 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007319 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7320 /* No mapping found means: mapping is undefined. */
7321 PyErr_Clear();
7322 x = Py_None;
7323 Py_INCREF(x);
7324 return x;
7325 } else
7326 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007327 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007328 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007329 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007330 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007331 long value = PyLong_AS_LONG(x);
7332 if (value < 0 || value > 255) {
7333 PyErr_SetString(PyExc_TypeError,
7334 "character mapping must be in range(256)");
7335 Py_DECREF(x);
7336 return NULL;
7337 }
7338 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007339 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007340 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007341 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007342 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007343 /* wrong return value */
7344 PyErr_Format(PyExc_TypeError,
7345 "character mapping must return integer, bytes or None, not %.400s",
7346 x->ob_type->tp_name);
7347 Py_DECREF(x);
7348 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007349 }
7350}
7351
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007352static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00007353charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007354{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007355 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7356 /* exponentially overallocate to minimize reallocations */
7357 if (requiredsize < 2*outsize)
7358 requiredsize = 2*outsize;
7359 if (_PyBytes_Resize(outobj, requiredsize))
7360 return -1;
7361 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007362}
7363
/* Outcome of a single charmapencode_output() call. */
typedef enum charmapencode_result {
    enc_SUCCESS = 0,    /* the character was encoded */
    enc_FAILED = 1,     /* the character maps to <undefined> */
    enc_EXCEPTION = 2   /* the lookup or a resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007367/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00007368 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007369 space is available. Return a new reference to the object that
7370 was put in the output buffer, or Py_None, if the mapping was undefined
7371 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00007372 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007373static charmapencode_result
7374charmapencode_output(Py_UNICODE c, PyObject *mapping,
7375 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007376{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007377 PyObject *rep;
7378 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00007379 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007380
Christian Heimes90aa7642007-12-19 02:45:37 +00007381 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007382 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007383 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007384 if (res == -1)
7385 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00007386 if (outsize<requiredsize)
7387 if (charmapencode_resize(outobj, outpos, requiredsize))
7388 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00007389 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007390 outstart[(*outpos)++] = (char)res;
7391 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007392 }
7393
7394 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007395 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007396 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007397 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007398 Py_DECREF(rep);
7399 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007400 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007401 if (PyLong_Check(rep)) {
7402 Py_ssize_t requiredsize = *outpos+1;
7403 if (outsize<requiredsize)
7404 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7405 Py_DECREF(rep);
7406 return enc_EXCEPTION;
7407 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007408 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007409 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007410 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007411 else {
7412 const char *repchars = PyBytes_AS_STRING(rep);
7413 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
7414 Py_ssize_t requiredsize = *outpos+repsize;
7415 if (outsize<requiredsize)
7416 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7417 Py_DECREF(rep);
7418 return enc_EXCEPTION;
7419 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007420 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007421 memcpy(outstart + *outpos, repchars, repsize);
7422 *outpos += repsize;
7423 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007424 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007425 Py_DECREF(rep);
7426 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007427}
7428
7429/* handle an error in PyUnicode_EncodeCharmap
7430 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007431static int
7432charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00007433 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007434 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00007435 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00007436 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007437{
7438 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007439 Py_ssize_t repsize;
7440 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007441 Py_UNICODE *uni2;
7442 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007443 Py_ssize_t collstartpos = *inpos;
7444 Py_ssize_t collendpos = *inpos+1;
7445 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007446 char *encoding = "charmap";
7447 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007448 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007449
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007450 /* find all unencodable characters */
7451 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007452 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00007453 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007454 int res = encoding_map_lookup(p[collendpos], mapping);
7455 if (res != -1)
7456 break;
7457 ++collendpos;
7458 continue;
7459 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007460
Benjamin Peterson29060642009-01-31 22:14:21 +00007461 rep = charmapencode_lookup(p[collendpos], mapping);
7462 if (rep==NULL)
7463 return -1;
7464 else if (rep!=Py_None) {
7465 Py_DECREF(rep);
7466 break;
7467 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007468 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00007469 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007470 }
7471 /* cache callback name lookup
7472 * (if not done yet, i.e. it's the first error) */
7473 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007474 if ((errors==NULL) || (!strcmp(errors, "strict")))
7475 *known_errorHandler = 1;
7476 else if (!strcmp(errors, "replace"))
7477 *known_errorHandler = 2;
7478 else if (!strcmp(errors, "ignore"))
7479 *known_errorHandler = 3;
7480 else if (!strcmp(errors, "xmlcharrefreplace"))
7481 *known_errorHandler = 4;
7482 else
7483 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007484 }
7485 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007486 case 1: /* strict */
7487 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7488 return -1;
7489 case 2: /* replace */
7490 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007491 x = charmapencode_output('?', mapping, res, respos);
7492 if (x==enc_EXCEPTION) {
7493 return -1;
7494 }
7495 else if (x==enc_FAILED) {
7496 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7497 return -1;
7498 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007499 }
7500 /* fall through */
7501 case 3: /* ignore */
7502 *inpos = collendpos;
7503 break;
7504 case 4: /* xmlcharrefreplace */
7505 /* generate replacement (temporarily (mis)uses p) */
7506 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007507 char buffer[2+29+1+1];
7508 char *cp;
7509 sprintf(buffer, "&#%d;", (int)p[collpos]);
7510 for (cp = buffer; *cp; ++cp) {
7511 x = charmapencode_output(*cp, mapping, res, respos);
7512 if (x==enc_EXCEPTION)
7513 return -1;
7514 else if (x==enc_FAILED) {
7515 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7516 return -1;
7517 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007518 }
7519 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007520 *inpos = collendpos;
7521 break;
7522 default:
7523 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00007524 encoding, reason, p, size, exceptionObject,
7525 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007526 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007527 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00007528 if (PyBytes_Check(repunicode)) {
7529 /* Directly copy bytes result to output. */
7530 Py_ssize_t outsize = PyBytes_Size(*res);
7531 Py_ssize_t requiredsize;
7532 repsize = PyBytes_Size(repunicode);
7533 requiredsize = *respos + repsize;
7534 if (requiredsize > outsize)
7535 /* Make room for all additional bytes. */
7536 if (charmapencode_resize(res, respos, requiredsize)) {
7537 Py_DECREF(repunicode);
7538 return -1;
7539 }
7540 memcpy(PyBytes_AsString(*res) + *respos,
7541 PyBytes_AsString(repunicode), repsize);
7542 *respos += repsize;
7543 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007544 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00007545 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007546 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007547 /* generate replacement */
7548 repsize = PyUnicode_GET_SIZE(repunicode);
7549 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007550 x = charmapencode_output(*uni2, mapping, res, respos);
7551 if (x==enc_EXCEPTION) {
7552 return -1;
7553 }
7554 else if (x==enc_FAILED) {
7555 Py_DECREF(repunicode);
7556 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7557 return -1;
7558 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007559 }
7560 *inpos = newpos;
7561 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007562 }
7563 return 0;
7564}
7565
Alexander Belopolsky40018472011-02-26 01:02:56 +00007566PyObject *
7567PyUnicode_EncodeCharmap(const Py_UNICODE *p,
7568 Py_ssize_t size,
7569 PyObject *mapping,
7570 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007571{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007572 /* output object */
7573 PyObject *res = NULL;
7574 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007575 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007576 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007577 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007578 PyObject *errorHandler = NULL;
7579 PyObject *exc = NULL;
7580 /* the following variable is used for caching string comparisons
7581 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7582 * 3=ignore, 4=xmlcharrefreplace */
7583 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007584
7585 /* Default to Latin-1 */
7586 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007587 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007588
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007589 /* allocate enough for a simple encoding without
7590 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00007591 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007592 if (res == NULL)
7593 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00007594 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007595 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007596
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007597 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007598 /* try to encode it */
7599 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
7600 if (x==enc_EXCEPTION) /* error */
7601 goto onError;
7602 if (x==enc_FAILED) { /* unencodable character */
7603 if (charmap_encoding_error(p, size, &inpos, mapping,
7604 &exc,
7605 &known_errorHandler, &errorHandler, errors,
7606 &res, &respos)) {
7607 goto onError;
7608 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007609 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007610 else
7611 /* done with this character => adjust input position */
7612 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007613 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007614
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007615 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00007616 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007617 if (_PyBytes_Resize(&res, respos) < 0)
7618 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00007619
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007620 Py_XDECREF(exc);
7621 Py_XDECREF(errorHandler);
7622 return res;
7623
Benjamin Peterson29060642009-01-31 22:14:21 +00007624 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007625 Py_XDECREF(res);
7626 Py_XDECREF(exc);
7627 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007628 return NULL;
7629}
7630
Alexander Belopolsky40018472011-02-26 01:02:56 +00007631PyObject *
7632PyUnicode_AsCharmapString(PyObject *unicode,
7633 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007634{
7635 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007636 PyErr_BadArgument();
7637 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007638 }
7639 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00007640 PyUnicode_GET_SIZE(unicode),
7641 mapping,
7642 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007643}
7644
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007645/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007646static void
7647make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007648 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007649 Py_ssize_t startpos, Py_ssize_t endpos,
7650 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007651{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007652 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007653 *exceptionObject = _PyUnicodeTranslateError_Create(
7654 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007655 }
7656 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007657 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
7658 goto onError;
7659 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
7660 goto onError;
7661 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
7662 goto onError;
7663 return;
7664 onError:
7665 Py_DECREF(*exceptionObject);
7666 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007667 }
7668}
7669
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007670/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007671static void
7672raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007673 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007674 Py_ssize_t startpos, Py_ssize_t endpos,
7675 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007676{
7677 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007678 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007679 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007680 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007681}
7682
7683/* error handling callback helper:
7684 build arguments, call the callback and check the arguments,
7685 put the result into newpos and return the replacement string, which
7686 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007687static PyObject *
7688unicode_translate_call_errorhandler(const char *errors,
7689 PyObject **errorHandler,
7690 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007691 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007692 Py_ssize_t startpos, Py_ssize_t endpos,
7693 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007694{
Benjamin Peterson142957c2008-07-04 19:55:29 +00007695 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007696
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007697 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007698 PyObject *restuple;
7699 PyObject *resunicode;
7700
7701 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007702 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007703 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007704 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007705 }
7706
7707 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007708 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007709 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007710 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007711
7712 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00007713 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007714 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007715 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007716 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00007717 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00007718 Py_DECREF(restuple);
7719 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007720 }
7721 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00007722 &resunicode, &i_newpos)) {
7723 Py_DECREF(restuple);
7724 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007725 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007726 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007727 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007728 else
7729 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007730 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007731 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7732 Py_DECREF(restuple);
7733 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00007734 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007735 Py_INCREF(resunicode);
7736 Py_DECREF(restuple);
7737 return resunicode;
7738}
7739
7740/* Lookup the character ch in the mapping and put the result in result,
7741 which must be decrefed by the caller.
7742 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007743static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007744charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007745{
Christian Heimes217cfd12007-12-02 14:31:20 +00007746 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007747 PyObject *x;
7748
7749 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007750 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007751 x = PyObject_GetItem(mapping, w);
7752 Py_DECREF(w);
7753 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007754 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7755 /* No mapping found means: use 1:1 mapping. */
7756 PyErr_Clear();
7757 *result = NULL;
7758 return 0;
7759 } else
7760 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007761 }
7762 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007763 *result = x;
7764 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007765 }
Christian Heimes217cfd12007-12-02 14:31:20 +00007766 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007767 long value = PyLong_AS_LONG(x);
7768 long max = PyUnicode_GetMax();
7769 if (value < 0 || value > max) {
7770 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00007771 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00007772 Py_DECREF(x);
7773 return -1;
7774 }
7775 *result = x;
7776 return 0;
7777 }
7778 else if (PyUnicode_Check(x)) {
7779 *result = x;
7780 return 0;
7781 }
7782 else {
7783 /* wrong return value */
7784 PyErr_SetString(PyExc_TypeError,
7785 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007786 Py_DECREF(x);
7787 return -1;
7788 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007789}
7790/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00007791 if not reallocate and adjust various state variables.
7792 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007793static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007794charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00007795 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007796{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007797 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00007798 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007799 /* exponentially overallocate to minimize reallocations */
7800 if (requiredsize < 2 * oldsize)
7801 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007802 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
7803 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007804 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007805 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007806 }
7807 return 0;
7808}
7809/* lookup the character, put the result in the output string and adjust
7810 various state variables. Return a new reference to the object that
7811 was put in the output buffer in *result, or Py_None, if the mapping was
7812 undefined (in which case no character was written).
7813 The called must decref result.
7814 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007815static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007816charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
7817 PyObject *mapping, Py_UCS4 **output,
7818 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007819 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007820{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007821 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
7822 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00007823 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007824 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007825 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007826 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007827 }
7828 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007829 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00007830 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007831 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007832 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007833 }
7834 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007835 Py_ssize_t repsize;
7836 if (PyUnicode_READY(*res) == -1)
7837 return -1;
7838 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00007839 if (repsize==1) {
7840 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007841 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00007842 }
7843 else if (repsize!=0) {
7844 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007845 Py_ssize_t requiredsize = *opos +
7846 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00007847 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007848 Py_ssize_t i;
7849 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007850 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007851 for(i = 0; i < repsize; i++)
7852 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00007853 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007854 }
7855 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007856 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007857 return 0;
7858}
7859
Alexander Belopolsky40018472011-02-26 01:02:56 +00007860PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007861_PyUnicode_TranslateCharmap(PyObject *input,
7862 PyObject *mapping,
7863 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007864{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007865 /* input object */
7866 char *idata;
7867 Py_ssize_t size, i;
7868 int kind;
7869 /* output buffer */
7870 Py_UCS4 *output = NULL;
7871 Py_ssize_t osize;
7872 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007873 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007874 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007875 char *reason = "character maps to <undefined>";
7876 PyObject *errorHandler = NULL;
7877 PyObject *exc = NULL;
7878 /* the following variable is used for caching string comparisons
7879 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7880 * 3=ignore, 4=xmlcharrefreplace */
7881 int known_errorHandler = -1;
7882
Guido van Rossumd57fd912000-03-10 22:53:23 +00007883 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007884 PyErr_BadArgument();
7885 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007886 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007887
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007888 if (PyUnicode_READY(input) == -1)
7889 return NULL;
7890 idata = (char*)PyUnicode_DATA(input);
7891 kind = PyUnicode_KIND(input);
7892 size = PyUnicode_GET_LENGTH(input);
7893 i = 0;
7894
7895 if (size == 0) {
7896 Py_INCREF(input);
7897 return input;
7898 }
7899
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007900 /* allocate enough for a simple 1:1 translation without
7901 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007902 osize = size;
7903 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
7904 opos = 0;
7905 if (output == NULL) {
7906 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00007907 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007908 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007909
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007910 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007911 /* try to encode it */
7912 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007913 if (charmaptranslate_output(input, i, mapping,
7914 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007915 Py_XDECREF(x);
7916 goto onError;
7917 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007918 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00007919 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007920 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00007921 else { /* untranslatable character */
7922 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
7923 Py_ssize_t repsize;
7924 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007925 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00007926 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007927 Py_ssize_t collstart = i;
7928 Py_ssize_t collend = i+1;
7929 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007930
Benjamin Peterson29060642009-01-31 22:14:21 +00007931 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007932 while (collend < size) {
7933 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007934 goto onError;
7935 Py_XDECREF(x);
7936 if (x!=Py_None)
7937 break;
7938 ++collend;
7939 }
7940 /* cache callback name lookup
7941 * (if not done yet, i.e. it's the first error) */
7942 if (known_errorHandler==-1) {
7943 if ((errors==NULL) || (!strcmp(errors, "strict")))
7944 known_errorHandler = 1;
7945 else if (!strcmp(errors, "replace"))
7946 known_errorHandler = 2;
7947 else if (!strcmp(errors, "ignore"))
7948 known_errorHandler = 3;
7949 else if (!strcmp(errors, "xmlcharrefreplace"))
7950 known_errorHandler = 4;
7951 else
7952 known_errorHandler = 0;
7953 }
7954 switch (known_errorHandler) {
7955 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007956 raise_translate_exception(&exc, input, collstart,
7957 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007958 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007959 case 2: /* replace */
7960 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007961 for (coll = collstart; coll<collend; coll++)
7962 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00007963 /* fall through */
7964 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007965 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007966 break;
7967 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007968 /* generate replacement (temporarily (mis)uses i) */
7969 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007970 char buffer[2+29+1+1];
7971 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007972 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
7973 if (charmaptranslate_makespace(&output, &osize,
7974 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00007975 goto onError;
7976 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007977 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00007978 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007979 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007980 break;
7981 default:
7982 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007983 reason, input, &exc,
7984 collstart, collend, &newpos);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02007985 if (repunicode == NULL || _PyUnicode_READY_REPLACE(&repunicode))
Benjamin Peterson29060642009-01-31 22:14:21 +00007986 goto onError;
7987 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007988 repsize = PyUnicode_GET_LENGTH(repunicode);
7989 if (charmaptranslate_makespace(&output, &osize,
7990 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007991 Py_DECREF(repunicode);
7992 goto onError;
7993 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007994 for (uni2 = 0; repsize-->0; ++uni2)
7995 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
7996 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00007997 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007998 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007999 }
8000 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008001 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8002 if (!res)
8003 goto onError;
8004 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008005 Py_XDECREF(exc);
8006 Py_XDECREF(errorHandler);
8007 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008008
Benjamin Peterson29060642009-01-31 22:14:21 +00008009 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008010 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008011 Py_XDECREF(exc);
8012 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008013 return NULL;
8014}
8015
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008016/* Deprecated. Use PyUnicode_Translate instead. */
8017PyObject *
8018PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8019 Py_ssize_t size,
8020 PyObject *mapping,
8021 const char *errors)
8022{
8023 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8024 if (!unicode)
8025 return NULL;
8026 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8027}
8028
Alexander Belopolsky40018472011-02-26 01:02:56 +00008029PyObject *
8030PyUnicode_Translate(PyObject *str,
8031 PyObject *mapping,
8032 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008033{
8034 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008035
Guido van Rossumd57fd912000-03-10 22:53:23 +00008036 str = PyUnicode_FromObject(str);
8037 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008038 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008039 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008040 Py_DECREF(str);
8041 return result;
Tim Petersced69f82003-09-16 20:30:58 +00008042
Benjamin Peterson29060642009-01-31 22:14:21 +00008043 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008044 Py_XDECREF(str);
8045 return NULL;
8046}
Tim Petersced69f82003-09-16 20:30:58 +00008047
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008048static Py_UCS4
8049fix_decimal_and_space_to_ascii(PyUnicodeObject *self)
8050{
8051 /* No need to call PyUnicode_READY(self) because this function is only
8052 called as a callback from fixup() which does it already. */
8053 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8054 const int kind = PyUnicode_KIND(self);
8055 void *data = PyUnicode_DATA(self);
8056 Py_UCS4 maxchar = 0, ch, fixed;
8057 Py_ssize_t i;
8058
8059 for (i = 0; i < len; ++i) {
8060 ch = PyUnicode_READ(kind, data, i);
8061 fixed = 0;
8062 if (ch > 127) {
8063 if (Py_UNICODE_ISSPACE(ch))
8064 fixed = ' ';
8065 else {
8066 const int decimal = Py_UNICODE_TODECIMAL(ch);
8067 if (decimal >= 0)
8068 fixed = '0' + decimal;
8069 }
8070 if (fixed != 0) {
8071 if (fixed > maxchar)
8072 maxchar = fixed;
8073 PyUnicode_WRITE(kind, data, i, fixed);
8074 }
8075 else if (ch > maxchar)
8076 maxchar = ch;
8077 }
8078 else if (ch > maxchar)
8079 maxchar = ch;
8080 }
8081
8082 return maxchar;
8083}
8084
8085PyObject *
8086_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8087{
8088 if (!PyUnicode_Check(unicode)) {
8089 PyErr_BadInternalCall();
8090 return NULL;
8091 }
8092 if (PyUnicode_READY(unicode) == -1)
8093 return NULL;
8094 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8095 /* If the string is already ASCII, just return the same string */
8096 Py_INCREF(unicode);
8097 return unicode;
8098 }
8099 return fixup((PyUnicodeObject *)unicode, fix_decimal_and_space_to_ascii);
8100}
8101
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008102PyObject *
8103PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8104 Py_ssize_t length)
8105{
8106 PyObject *result;
8107 Py_UNICODE *p; /* write pointer into result */
8108 Py_ssize_t i;
8109 /* Copy to a new string */
8110 result = (PyObject *)_PyUnicode_New(length);
8111 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
8112 if (result == NULL)
8113 return result;
8114 p = PyUnicode_AS_UNICODE(result);
8115 /* Iterate over code points */
8116 for (i = 0; i < length; i++) {
8117 Py_UNICODE ch =s[i];
8118 if (ch > 127) {
8119 int decimal = Py_UNICODE_TODECIMAL(ch);
8120 if (decimal >= 0)
8121 p[i] = '0' + decimal;
8122 }
8123 }
Victor Stinner17efeed2011-10-04 20:05:46 +02008124#ifndef DONT_MAKE_RESULT_READY
8125 if (_PyUnicode_READY_REPLACE(&result)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008126 Py_DECREF(result);
8127 return NULL;
8128 }
Victor Stinner17efeed2011-10-04 20:05:46 +02008129#endif
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008130 return result;
8131}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008132/* --- Decimal Encoder ---------------------------------------------------- */
8133
Alexander Belopolsky40018472011-02-26 01:02:56 +00008134int
8135PyUnicode_EncodeDecimal(Py_UNICODE *s,
8136 Py_ssize_t length,
8137 char *output,
8138 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008139{
8140 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008141 PyObject *errorHandler = NULL;
8142 PyObject *exc = NULL;
8143 const char *encoding = "decimal";
8144 const char *reason = "invalid decimal Unicode string";
8145 /* the following variable is used for caching string comparisons
8146 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
8147 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008148
8149 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008150 PyErr_BadArgument();
8151 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008152 }
8153
8154 p = s;
8155 end = s + length;
8156 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008157 register Py_UNICODE ch = *p;
8158 int decimal;
8159 PyObject *repunicode;
8160 Py_ssize_t repsize;
8161 Py_ssize_t newpos;
8162 Py_UNICODE *uni2;
8163 Py_UNICODE *collstart;
8164 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00008165
Benjamin Peterson29060642009-01-31 22:14:21 +00008166 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008167 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00008168 ++p;
8169 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008170 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008171 decimal = Py_UNICODE_TODECIMAL(ch);
8172 if (decimal >= 0) {
8173 *output++ = '0' + decimal;
8174 ++p;
8175 continue;
8176 }
8177 if (0 < ch && ch < 256) {
8178 *output++ = (char)ch;
8179 ++p;
8180 continue;
8181 }
8182 /* All other characters are considered unencodable */
8183 collstart = p;
8184 collend = p+1;
8185 while (collend < end) {
8186 if ((0 < *collend && *collend < 256) ||
8187 !Py_UNICODE_ISSPACE(*collend) ||
8188 Py_UNICODE_TODECIMAL(*collend))
8189 break;
8190 }
8191 /* cache callback name lookup
8192 * (if not done yet, i.e. it's the first error) */
8193 if (known_errorHandler==-1) {
8194 if ((errors==NULL) || (!strcmp(errors, "strict")))
8195 known_errorHandler = 1;
8196 else if (!strcmp(errors, "replace"))
8197 known_errorHandler = 2;
8198 else if (!strcmp(errors, "ignore"))
8199 known_errorHandler = 3;
8200 else if (!strcmp(errors, "xmlcharrefreplace"))
8201 known_errorHandler = 4;
8202 else
8203 known_errorHandler = 0;
8204 }
8205 switch (known_errorHandler) {
8206 case 1: /* strict */
8207 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
8208 goto onError;
8209 case 2: /* replace */
8210 for (p = collstart; p < collend; ++p)
8211 *output++ = '?';
8212 /* fall through */
8213 case 3: /* ignore */
8214 p = collend;
8215 break;
8216 case 4: /* xmlcharrefreplace */
8217 /* generate replacement (temporarily (mis)uses p) */
8218 for (p = collstart; p < collend; ++p)
8219 output += sprintf(output, "&#%d;", (int)*p);
8220 p = collend;
8221 break;
8222 default:
8223 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
8224 encoding, reason, s, length, &exc,
8225 collstart-s, collend-s, &newpos);
8226 if (repunicode == NULL)
8227 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008228 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00008229 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008230 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
8231 Py_DECREF(repunicode);
8232 goto onError;
8233 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008234 /* generate replacement */
8235 repsize = PyUnicode_GET_SIZE(repunicode);
8236 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
8237 Py_UNICODE ch = *uni2;
8238 if (Py_UNICODE_ISSPACE(ch))
8239 *output++ = ' ';
8240 else {
8241 decimal = Py_UNICODE_TODECIMAL(ch);
8242 if (decimal >= 0)
8243 *output++ = '0' + decimal;
8244 else if (0 < ch && ch < 256)
8245 *output++ = (char)ch;
8246 else {
8247 Py_DECREF(repunicode);
8248 raise_encode_exception(&exc, encoding,
8249 s, length, collstart-s, collend-s, reason);
8250 goto onError;
8251 }
8252 }
8253 }
8254 p = s + newpos;
8255 Py_DECREF(repunicode);
8256 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00008257 }
8258 /* 0-terminate the output string */
8259 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008260 Py_XDECREF(exc);
8261 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008262 return 0;
8263
Benjamin Peterson29060642009-01-31 22:14:21 +00008264 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008265 Py_XDECREF(exc);
8266 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008267 return -1;
8268}
8269
Guido van Rossumd57fd912000-03-10 22:53:23 +00008270/* --- Helpers ------------------------------------------------------------ */
8271
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008272#include "stringlib/ucs1lib.h"
8273#include "stringlib/fastsearch.h"
8274#include "stringlib/partition.h"
8275#include "stringlib/split.h"
8276#include "stringlib/count.h"
8277#include "stringlib/find.h"
8278#include "stringlib/localeutil.h"
8279#include "stringlib/undef.h"
8280
8281#include "stringlib/ucs2lib.h"
8282#include "stringlib/fastsearch.h"
8283#include "stringlib/partition.h"
8284#include "stringlib/split.h"
8285#include "stringlib/count.h"
8286#include "stringlib/find.h"
8287#include "stringlib/localeutil.h"
8288#include "stringlib/undef.h"
8289
8290#include "stringlib/ucs4lib.h"
8291#include "stringlib/fastsearch.h"
8292#include "stringlib/partition.h"
8293#include "stringlib/split.h"
8294#include "stringlib/count.h"
8295#include "stringlib/find.h"
8296#include "stringlib/localeutil.h"
8297#include "stringlib/undef.h"
8298
8299static Py_ssize_t
8300any_find_slice(Py_ssize_t Py_LOCAL_CALLBACK(ucs1)(const Py_UCS1*, Py_ssize_t,
8301 const Py_UCS1*, Py_ssize_t,
8302 Py_ssize_t, Py_ssize_t),
8303 Py_ssize_t Py_LOCAL_CALLBACK(ucs2)(const Py_UCS2*, Py_ssize_t,
8304 const Py_UCS2*, Py_ssize_t,
8305 Py_ssize_t, Py_ssize_t),
8306 Py_ssize_t Py_LOCAL_CALLBACK(ucs4)(const Py_UCS4*, Py_ssize_t,
8307 const Py_UCS4*, Py_ssize_t,
8308 Py_ssize_t, Py_ssize_t),
8309 PyObject* s1, PyObject* s2,
8310 Py_ssize_t start,
8311 Py_ssize_t end)
8312{
8313 int kind1, kind2, kind;
8314 void *buf1, *buf2;
8315 Py_ssize_t len1, len2, result;
8316
8317 kind1 = PyUnicode_KIND(s1);
8318 kind2 = PyUnicode_KIND(s2);
8319 kind = kind1 > kind2 ? kind1 : kind2;
8320 buf1 = PyUnicode_DATA(s1);
8321 buf2 = PyUnicode_DATA(s2);
8322 if (kind1 != kind)
8323 buf1 = _PyUnicode_AsKind(s1, kind);
8324 if (!buf1)
8325 return -2;
8326 if (kind2 != kind)
8327 buf2 = _PyUnicode_AsKind(s2, kind);
8328 if (!buf2) {
8329 if (kind1 != kind) PyMem_Free(buf1);
8330 return -2;
8331 }
8332 len1 = PyUnicode_GET_LENGTH(s1);
8333 len2 = PyUnicode_GET_LENGTH(s2);
8334
8335 switch(kind) {
8336 case PyUnicode_1BYTE_KIND:
8337 result = ucs1(buf1, len1, buf2, len2, start, end);
8338 break;
8339 case PyUnicode_2BYTE_KIND:
8340 result = ucs2(buf1, len1, buf2, len2, start, end);
8341 break;
8342 case PyUnicode_4BYTE_KIND:
8343 result = ucs4(buf1, len1, buf2, len2, start, end);
8344 break;
8345 default:
8346 assert(0); result = -2;
8347 }
8348
8349 if (kind1 != kind)
8350 PyMem_Free(buf1);
8351 if (kind2 != kind)
8352 PyMem_Free(buf2);
8353
8354 return result;
8355}
8356
8357Py_ssize_t
8358_PyUnicode_InsertThousandsGrouping(int kind, void *data,
8359 Py_ssize_t n_buffer,
8360 void *digits, Py_ssize_t n_digits,
8361 Py_ssize_t min_width,
8362 const char *grouping,
8363 const char *thousands_sep)
8364{
8365 switch(kind) {
8366 case PyUnicode_1BYTE_KIND:
8367 return _PyUnicode_ucs1_InsertThousandsGrouping(
8368 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8369 min_width, grouping, thousands_sep);
8370 case PyUnicode_2BYTE_KIND:
8371 return _PyUnicode_ucs2_InsertThousandsGrouping(
8372 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
8373 min_width, grouping, thousands_sep);
8374 case PyUnicode_4BYTE_KIND:
8375 return _PyUnicode_ucs4_InsertThousandsGrouping(
8376 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
8377 min_width, grouping, thousands_sep);
8378 }
8379 assert(0);
8380 return -1;
8381}
8382
8383
Eric Smith8c663262007-08-25 02:26:07 +00008384#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00008385#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008386
Thomas Wouters477c8d52006-05-27 19:21:47 +00008387#include "stringlib/count.h"
8388#include "stringlib/find.h"
Eric Smith5807c412008-05-11 21:00:57 +00008389
/* Normalize a start/end slice pair against a sequence of length `len`:
   `end` is capped at `len`; a negative `end` or `start` is interpreted
   relative to the end of the sequence and then floored at 0.
   `start` and `end` must be assignable; `len` is evaluated more than once.
   The expansion is two plain statements, not a single one. */
#define ADJUST_INDICES(start, end, len)         \
    if (end > len) {                            \
        end = len;                              \
    }                                           \
    else if (end < 0) {                         \
        end += len;                             \
        if (end < 0) {                          \
            end = 0;                            \
        }                                       \
    }                                           \
    if (start < 0) {                            \
        start += len;                           \
        if (start < 0) {                        \
            start = 0;                          \
        }                                       \
    }
Thomas Wouters477c8d52006-05-27 19:21:47 +00008404
Alexander Belopolsky40018472011-02-26 01:02:56 +00008405Py_ssize_t
8406PyUnicode_Count(PyObject *str,
8407 PyObject *substr,
8408 Py_ssize_t start,
8409 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008410{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008411 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008412 PyUnicodeObject* str_obj;
8413 PyUnicodeObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008414 int kind1, kind2, kind;
8415 void *buf1 = NULL, *buf2 = NULL;
8416 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00008417
Thomas Wouters477c8d52006-05-27 19:21:47 +00008418 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008419 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008420 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008421 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
Victor Stinnere9a29352011-10-01 02:14:59 +02008422 if (!sub_obj || PyUnicode_READY(sub_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008423 Py_DECREF(str_obj);
8424 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008425 }
Tim Petersced69f82003-09-16 20:30:58 +00008426
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008427 kind1 = PyUnicode_KIND(str_obj);
8428 kind2 = PyUnicode_KIND(sub_obj);
8429 kind = kind1 > kind2 ? kind1 : kind2;
8430 buf1 = PyUnicode_DATA(str_obj);
8431 if (kind1 != kind)
8432 buf1 = _PyUnicode_AsKind((PyObject*)str_obj, kind);
8433 if (!buf1)
8434 goto onError;
8435 buf2 = PyUnicode_DATA(sub_obj);
8436 if (kind2 != kind)
8437 buf2 = _PyUnicode_AsKind((PyObject*)sub_obj, kind);
8438 if (!buf2)
8439 goto onError;
8440 len1 = PyUnicode_GET_LENGTH(str_obj);
8441 len2 = PyUnicode_GET_LENGTH(sub_obj);
8442
8443 ADJUST_INDICES(start, end, len1);
8444 switch(kind) {
8445 case PyUnicode_1BYTE_KIND:
8446 result = ucs1lib_count(
8447 ((Py_UCS1*)buf1) + start, end - start,
8448 buf2, len2, PY_SSIZE_T_MAX
8449 );
8450 break;
8451 case PyUnicode_2BYTE_KIND:
8452 result = ucs2lib_count(
8453 ((Py_UCS2*)buf1) + start, end - start,
8454 buf2, len2, PY_SSIZE_T_MAX
8455 );
8456 break;
8457 case PyUnicode_4BYTE_KIND:
8458 result = ucs4lib_count(
8459 ((Py_UCS4*)buf1) + start, end - start,
8460 buf2, len2, PY_SSIZE_T_MAX
8461 );
8462 break;
8463 default:
8464 assert(0); result = 0;
8465 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00008466
8467 Py_DECREF(sub_obj);
8468 Py_DECREF(str_obj);
8469
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008470 if (kind1 != kind)
8471 PyMem_Free(buf1);
8472 if (kind2 != kind)
8473 PyMem_Free(buf2);
8474
Guido van Rossumd57fd912000-03-10 22:53:23 +00008475 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008476 onError:
8477 Py_DECREF(sub_obj);
8478 Py_DECREF(str_obj);
8479 if (kind1 != kind && buf1)
8480 PyMem_Free(buf1);
8481 if (kind2 != kind && buf2)
8482 PyMem_Free(buf2);
8483 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008484}
8485
Alexander Belopolsky40018472011-02-26 01:02:56 +00008486Py_ssize_t
8487PyUnicode_Find(PyObject *str,
8488 PyObject *sub,
8489 Py_ssize_t start,
8490 Py_ssize_t end,
8491 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008492{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008493 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008494
Guido van Rossumd57fd912000-03-10 22:53:23 +00008495 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008496 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008497 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008498 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008499 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008500 Py_DECREF(str);
8501 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008502 }
Tim Petersced69f82003-09-16 20:30:58 +00008503
Thomas Wouters477c8d52006-05-27 19:21:47 +00008504 if (direction > 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008505 result = any_find_slice(
8506 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
8507 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008508 );
8509 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008510 result = any_find_slice(
8511 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
8512 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008513 );
8514
Guido van Rossumd57fd912000-03-10 22:53:23 +00008515 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008516 Py_DECREF(sub);
8517
Guido van Rossumd57fd912000-03-10 22:53:23 +00008518 return result;
8519}
8520
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008521Py_ssize_t
8522PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
8523 Py_ssize_t start, Py_ssize_t end,
8524 int direction)
8525{
8526 char *result;
8527 int kind;
8528 if (PyUnicode_READY(str) == -1)
8529 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02008530 if (start < 0 || end < 0) {
8531 PyErr_SetString(PyExc_IndexError, "string index out of range");
8532 return -2;
8533 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008534 if (end > PyUnicode_GET_LENGTH(str))
8535 end = PyUnicode_GET_LENGTH(str);
8536 kind = PyUnicode_KIND(str);
8537 result = findchar(PyUnicode_1BYTE_DATA(str)
8538 + PyUnicode_KIND_SIZE(kind, start),
8539 kind,
8540 end-start, ch, direction);
8541 if (!result)
8542 return -1;
8543 return (result-(char*)PyUnicode_DATA(str)) >> (kind-1);
8544}
8545
Alexander Belopolsky40018472011-02-26 01:02:56 +00008546static int
8547tailmatch(PyUnicodeObject *self,
8548 PyUnicodeObject *substring,
8549 Py_ssize_t start,
8550 Py_ssize_t end,
8551 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008552{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008553 int kind_self;
8554 int kind_sub;
8555 void *data_self;
8556 void *data_sub;
8557 Py_ssize_t offset;
8558 Py_ssize_t i;
8559 Py_ssize_t end_sub;
8560
8561 if (PyUnicode_READY(self) == -1 ||
8562 PyUnicode_READY(substring) == -1)
8563 return 0;
8564
8565 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008566 return 1;
8567
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008568 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
8569 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008570 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00008571 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008572
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008573 kind_self = PyUnicode_KIND(self);
8574 data_self = PyUnicode_DATA(self);
8575 kind_sub = PyUnicode_KIND(substring);
8576 data_sub = PyUnicode_DATA(substring);
8577 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
8578
8579 if (direction > 0)
8580 offset = end;
8581 else
8582 offset = start;
8583
8584 if (PyUnicode_READ(kind_self, data_self, offset) ==
8585 PyUnicode_READ(kind_sub, data_sub, 0) &&
8586 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
8587 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
8588 /* If both are of the same kind, memcmp is sufficient */
8589 if (kind_self == kind_sub) {
8590 return ! memcmp((char *)data_self +
8591 (offset * PyUnicode_CHARACTER_SIZE(substring)),
8592 data_sub,
8593 PyUnicode_GET_LENGTH(substring) *
8594 PyUnicode_CHARACTER_SIZE(substring));
8595 }
8596 /* otherwise we have to compare each character by first accesing it */
8597 else {
8598 /* We do not need to compare 0 and len(substring)-1 because
8599 the if statement above ensured already that they are equal
8600 when we end up here. */
8601 // TODO: honor direction and do a forward or backwards search
8602 for (i = 1; i < end_sub; ++i) {
8603 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
8604 PyUnicode_READ(kind_sub, data_sub, i))
8605 return 0;
8606 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008607 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008608 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008609 }
8610
8611 return 0;
8612}
8613
Alexander Belopolsky40018472011-02-26 01:02:56 +00008614Py_ssize_t
8615PyUnicode_Tailmatch(PyObject *str,
8616 PyObject *substr,
8617 Py_ssize_t start,
8618 Py_ssize_t end,
8619 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008620{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008621 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008622
Guido van Rossumd57fd912000-03-10 22:53:23 +00008623 str = PyUnicode_FromObject(str);
8624 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008625 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008626 substr = PyUnicode_FromObject(substr);
8627 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008628 Py_DECREF(str);
8629 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008630 }
Tim Petersced69f82003-09-16 20:30:58 +00008631
Guido van Rossumd57fd912000-03-10 22:53:23 +00008632 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00008633 (PyUnicodeObject *)substr,
8634 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008635 Py_DECREF(str);
8636 Py_DECREF(substr);
8637 return result;
8638}
8639
Guido van Rossumd57fd912000-03-10 22:53:23 +00008640/* Apply fixfct filter to the Unicode object self and return a
8641 reference to the modified object */
8642
Alexander Belopolsky40018472011-02-26 01:02:56 +00008643static PyObject *
8644fixup(PyUnicodeObject *self,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008645 Py_UCS4 (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008646{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008647 PyObject *u;
8648 Py_UCS4 maxchar_old, maxchar_new = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008649
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008650 if (PyUnicode_READY(self) == -1)
8651 return NULL;
8652 maxchar_old = PyUnicode_MAX_CHAR_VALUE(self);
8653 u = PyUnicode_New(PyUnicode_GET_LENGTH(self),
8654 maxchar_old);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008655 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008656 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008657
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008658 Py_MEMCPY(PyUnicode_1BYTE_DATA(u), PyUnicode_1BYTE_DATA(self),
8659 PyUnicode_GET_LENGTH(u) * PyUnicode_CHARACTER_SIZE(u));
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008660
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008661 /* fix functions return the new maximum character in a string,
8662 if the kind of the resulting unicode object does not change,
8663 everything is fine. Otherwise we need to change the string kind
8664 and re-run the fix function. */
8665 maxchar_new = fixfct((PyUnicodeObject*)u);
8666 if (maxchar_new == 0)
8667 /* do nothing, keep maxchar_new at 0 which means no changes. */;
8668 else if (maxchar_new <= 127)
8669 maxchar_new = 127;
8670 else if (maxchar_new <= 255)
8671 maxchar_new = 255;
8672 else if (maxchar_new <= 65535)
8673 maxchar_new = 65535;
8674 else
8675 maxchar_new = 1114111; /* 0x10ffff */
8676
8677 if (!maxchar_new && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008678 /* fixfct should return TRUE if it modified the buffer. If
8679 FALSE, return a reference to the original buffer instead
8680 (to save space, not time) */
8681 Py_INCREF(self);
8682 Py_DECREF(u);
8683 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008684 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008685 else if (maxchar_new == maxchar_old) {
8686 return u;
8687 }
8688 else {
8689 /* In case the maximum character changed, we need to
8690 convert the string to the new category. */
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008691 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008692 if (v == NULL) {
8693 Py_DECREF(u);
8694 return NULL;
8695 }
8696 if (maxchar_new > maxchar_old) {
8697 /* If the maxchar increased so that the kind changed, not all
8698 characters are representable anymore and we need to fix the
8699 string again. This only happens in very few cases. */
Victor Stinner157f83f2011-09-28 21:41:31 +02008700 if (PyUnicode_CopyCharacters(v, 0,
8701 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008702 PyUnicode_GET_LENGTH(self)) < 0)
8703 {
8704 Py_DECREF(u);
8705 return NULL;
8706 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008707 maxchar_old = fixfct((PyUnicodeObject*)v);
8708 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
8709 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008710 else {
Victor Stinner157f83f2011-09-28 21:41:31 +02008711 if (PyUnicode_CopyCharacters(v, 0,
8712 u, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008713 PyUnicode_GET_LENGTH(self)) < 0)
8714 {
8715 Py_DECREF(u);
8716 return NULL;
8717 }
8718 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008719
8720 Py_DECREF(u);
8721 return v;
8722 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008723}
8724
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008725static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008726fixupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008727{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008728 /* No need to call PyUnicode_READY(self) because this function is only
8729 called as a callback from fixup() which does it already. */
8730 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8731 const int kind = PyUnicode_KIND(self);
8732 void *data = PyUnicode_DATA(self);
8733 int touched = 0;
8734 Py_UCS4 maxchar = 0;
8735 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008736
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008737 for (i = 0; i < len; ++i) {
8738 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8739 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
8740 if (up != ch) {
8741 if (up > maxchar)
8742 maxchar = up;
8743 PyUnicode_WRITE(kind, data, i, up);
8744 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008745 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008746 else if (ch > maxchar)
8747 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008748 }
8749
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008750 if (touched)
8751 return maxchar;
8752 else
8753 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008754}
8755
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008756static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008757fixlower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008758{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008759 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8760 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8761 const int kind = PyUnicode_KIND(self);
8762 void *data = PyUnicode_DATA(self);
8763 int touched = 0;
8764 Py_UCS4 maxchar = 0;
8765 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008766
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008767 for(i = 0; i < len; ++i) {
8768 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8769 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8770 if (lo != ch) {
8771 if (lo > maxchar)
8772 maxchar = lo;
8773 PyUnicode_WRITE(kind, data, i, lo);
8774 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008775 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008776 else if (ch > maxchar)
8777 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008778 }
8779
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008780 if (touched)
8781 return maxchar;
8782 else
8783 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008784}
8785
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008786static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008787fixswapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008788{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008789 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8790 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8791 const int kind = PyUnicode_KIND(self);
8792 void *data = PyUnicode_DATA(self);
8793 int touched = 0;
8794 Py_UCS4 maxchar = 0;
8795 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008796
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008797 for(i = 0; i < len; ++i) {
8798 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8799 Py_UCS4 nu = 0;
8800
8801 if (Py_UNICODE_ISUPPER(ch))
8802 nu = Py_UNICODE_TOLOWER(ch);
8803 else if (Py_UNICODE_ISLOWER(ch))
8804 nu = Py_UNICODE_TOUPPER(ch);
8805
8806 if (nu != 0) {
8807 if (nu > maxchar)
8808 maxchar = nu;
8809 PyUnicode_WRITE(kind, data, i, nu);
8810 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008811 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008812 else if (ch > maxchar)
8813 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008814 }
8815
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008816 if (touched)
8817 return maxchar;
8818 else
8819 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008820}
8821
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008822static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008823fixcapitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008824{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008825 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8826 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8827 const int kind = PyUnicode_KIND(self);
8828 void *data = PyUnicode_DATA(self);
8829 int touched = 0;
8830 Py_UCS4 maxchar = 0;
8831 Py_ssize_t i = 0;
8832 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00008833
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008834 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008835 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008836
8837 ch = PyUnicode_READ(kind, data, i);
8838 if (!Py_UNICODE_ISUPPER(ch)) {
8839 maxchar = Py_UNICODE_TOUPPER(ch);
8840 PyUnicode_WRITE(kind, data, i, maxchar);
8841 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008842 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008843 ++i;
8844 for(; i < len; ++i) {
8845 ch = PyUnicode_READ(kind, data, i);
8846 if (!Py_UNICODE_ISLOWER(ch)) {
8847 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8848 if (lo > maxchar)
8849 maxchar = lo;
8850 PyUnicode_WRITE(kind, data, i, lo);
8851 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008852 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008853 else if (ch > maxchar)
8854 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008855 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008856
8857 if (touched)
8858 return maxchar;
8859 else
8860 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008861}
8862
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008863static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008864fixtitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008865{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008866 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8867 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8868 const int kind = PyUnicode_KIND(self);
8869 void *data = PyUnicode_DATA(self);
8870 Py_UCS4 maxchar = 0;
8871 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008872 int previous_is_cased;
8873
8874 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008875 if (len == 1) {
8876 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8877 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
8878 if (ti != ch) {
8879 PyUnicode_WRITE(kind, data, i, ti);
8880 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00008881 }
8882 else
8883 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008884 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008885 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008886 for(; i < len; ++i) {
8887 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8888 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00008889
Benjamin Peterson29060642009-01-31 22:14:21 +00008890 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008891 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00008892 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008893 nu = Py_UNICODE_TOTITLE(ch);
8894
8895 if (nu > maxchar)
8896 maxchar = nu;
8897 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00008898
Benjamin Peterson29060642009-01-31 22:14:21 +00008899 if (Py_UNICODE_ISLOWER(ch) ||
8900 Py_UNICODE_ISUPPER(ch) ||
8901 Py_UNICODE_ISTITLE(ch))
8902 previous_is_cased = 1;
8903 else
8904 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008905 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008906 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008907}
8908
Tim Peters8ce9f162004-08-27 01:49:32 +00008909PyObject *
8910PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008911{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008912 PyObject *sep = NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008913 Py_ssize_t seplen = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008914 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00008915 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008916 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
8917 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00008918 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008919 Py_ssize_t sz, i, res_offset;
8920 Py_UCS4 maxchar = 0;
8921 Py_UCS4 item_maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008922
Tim Peters05eba1f2004-08-27 21:32:02 +00008923 fseq = PySequence_Fast(seq, "");
8924 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008925 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00008926 }
8927
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008928 /* NOTE: the following code can't call back into Python code,
8929 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00008930 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008931
Tim Peters05eba1f2004-08-27 21:32:02 +00008932 seqlen = PySequence_Fast_GET_SIZE(fseq);
8933 /* If empty sequence, return u"". */
8934 if (seqlen == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008935 res = PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008936 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00008937 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008938 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00008939 /* If singleton sequence with an exact Unicode, return that. */
8940 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008941 item = items[0];
8942 if (PyUnicode_CheckExact(item)) {
8943 Py_INCREF(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008944 res = item;
Benjamin Peterson29060642009-01-31 22:14:21 +00008945 goto Done;
8946 }
Tim Peters8ce9f162004-08-27 01:49:32 +00008947 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008948 else {
8949 /* Set up sep and seplen */
8950 if (separator == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008951 /* fall back to a blank space separator */
8952 sep = PyUnicode_FromOrdinal(' ');
Victor Stinnere9a29352011-10-01 02:14:59 +02008953 if (!sep)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008954 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00008955 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008956 else {
8957 if (!PyUnicode_Check(separator)) {
8958 PyErr_Format(PyExc_TypeError,
8959 "separator: expected str instance,"
8960 " %.80s found",
8961 Py_TYPE(separator)->tp_name);
8962 goto onError;
8963 }
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02008964 if (PyUnicode_READY(separator))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008965 goto onError;
8966 sep = separator;
8967 seplen = PyUnicode_GET_LENGTH(separator);
8968 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
8969 /* inc refcount to keep this code path symetric with the
8970 above case of a blank separator */
8971 Py_INCREF(sep);
Tim Peters05eba1f2004-08-27 21:32:02 +00008972 }
8973 }
8974
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008975 /* There are at least two things to join, or else we have a subclass
8976 * of str in the sequence.
8977 * Do a pre-pass to figure out the total amount of space we'll
8978 * need (sz), and see whether all argument are strings.
8979 */
8980 sz = 0;
8981 for (i = 0; i < seqlen; i++) {
8982 const Py_ssize_t old_sz = sz;
8983 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00008984 if (!PyUnicode_Check(item)) {
8985 PyErr_Format(PyExc_TypeError,
8986 "sequence item %zd: expected str instance,"
8987 " %.80s found",
8988 i, Py_TYPE(item)->tp_name);
8989 goto onError;
8990 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008991 if (PyUnicode_READY(item) == -1)
8992 goto onError;
8993 sz += PyUnicode_GET_LENGTH(item);
8994 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
8995 if (item_maxchar > maxchar)
8996 maxchar = item_maxchar;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008997 if (i != 0)
8998 sz += seplen;
8999 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9000 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009001 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009002 goto onError;
9003 }
9004 }
Tim Petersced69f82003-09-16 20:30:58 +00009005
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009006 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009007 if (res == NULL)
9008 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009009
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009010 /* Catenate everything. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009011 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinner9ce5a832011-10-03 23:36:02 +02009012 Py_ssize_t itemlen, copied;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009013 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009014 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02009015 if (i && seplen != 0) {
9016 copied = PyUnicode_CopyCharacters(res, res_offset,
9017 sep, 0, seplen);
9018 if (copied < 0)
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009019 goto onError;
Victor Stinner9ce5a832011-10-03 23:36:02 +02009020#ifdef Py_DEBUG
9021 res_offset += copied;
9022#else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009023 res_offset += seplen;
Victor Stinner9ce5a832011-10-03 23:36:02 +02009024#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00009025 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009026 itemlen = PyUnicode_GET_LENGTH(item);
9027 if (itemlen != 0) {
9028 copied = PyUnicode_CopyCharacters(res, res_offset,
9029 item, 0, itemlen);
9030 if (copied < 0)
9031 goto onError;
9032#ifdef Py_DEBUG
9033 res_offset += copied;
9034#else
9035 res_offset += itemlen;
9036#endif
9037 }
Tim Peters05eba1f2004-08-27 21:32:02 +00009038 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009039 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00009040
Benjamin Peterson29060642009-01-31 22:14:21 +00009041 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00009042 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009043 Py_XDECREF(sep);
9044 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009045
Benjamin Peterson29060642009-01-31 22:14:21 +00009046 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009047 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009048 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009049 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009050 return NULL;
9051}
9052
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009053#define FILL(kind, data, value, start, length) \
9054 do { \
9055 Py_ssize_t i_ = 0; \
9056 assert(kind != PyUnicode_WCHAR_KIND); \
9057 switch ((kind)) { \
9058 case PyUnicode_1BYTE_KIND: { \
9059 unsigned char * to_ = (unsigned char *)((data)) + (start); \
9060 memset(to_, (unsigned char)value, length); \
9061 break; \
9062 } \
9063 case PyUnicode_2BYTE_KIND: { \
9064 Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
9065 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
9066 break; \
9067 } \
9068 default: { \
9069 Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
9070 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
9071 break; \
9072 } \
9073 } \
9074 } while (0)
9075
Alexander Belopolsky40018472011-02-26 01:02:56 +00009076static PyUnicodeObject *
9077pad(PyUnicodeObject *self,
9078 Py_ssize_t left,
9079 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009080 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009081{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009082 PyObject *u;
9083 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009084 int kind;
9085 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009086
9087 if (left < 0)
9088 left = 0;
9089 if (right < 0)
9090 right = 0;
9091
Tim Peters7a29bd52001-09-12 03:03:31 +00009092 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009093 Py_INCREF(self);
9094 return self;
9095 }
9096
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009097 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9098 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009099 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9100 return NULL;
9101 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009102 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9103 if (fill > maxchar)
9104 maxchar = fill;
9105 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009106 if (!u)
9107 return NULL;
9108
9109 kind = PyUnicode_KIND(u);
9110 data = PyUnicode_DATA(u);
9111 if (left)
9112 FILL(kind, data, fill, 0, left);
9113 if (right)
9114 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinner157f83f2011-09-28 21:41:31 +02009115 if (PyUnicode_CopyCharacters(u, left,
9116 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009117 _PyUnicode_LENGTH(self)) < 0)
9118 {
9119 Py_DECREF(u);
9120 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009121 }
9122
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009123 return (PyUnicodeObject*)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009124}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009125#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00009126
Alexander Belopolsky40018472011-02-26 01:02:56 +00009127PyObject *
9128PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009129{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009130 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009131
9132 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009133 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009134 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009135
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009136 switch(PyUnicode_KIND(string)) {
9137 case PyUnicode_1BYTE_KIND:
9138 list = ucs1lib_splitlines(
9139 (PyObject*) string, PyUnicode_1BYTE_DATA(string),
9140 PyUnicode_GET_LENGTH(string), keepends);
9141 break;
9142 case PyUnicode_2BYTE_KIND:
9143 list = ucs2lib_splitlines(
9144 (PyObject*) string, PyUnicode_2BYTE_DATA(string),
9145 PyUnicode_GET_LENGTH(string), keepends);
9146 break;
9147 case PyUnicode_4BYTE_KIND:
9148 list = ucs4lib_splitlines(
9149 (PyObject*) string, PyUnicode_4BYTE_DATA(string),
9150 PyUnicode_GET_LENGTH(string), keepends);
9151 break;
9152 default:
9153 assert(0);
9154 list = 0;
9155 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009156 Py_DECREF(string);
9157 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009158}
9159
Alexander Belopolsky40018472011-02-26 01:02:56 +00009160static PyObject *
9161split(PyUnicodeObject *self,
9162 PyUnicodeObject *substring,
9163 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009164{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009165 int kind1, kind2, kind;
9166 void *buf1, *buf2;
9167 Py_ssize_t len1, len2;
9168 PyObject* out;
9169
Guido van Rossumd57fd912000-03-10 22:53:23 +00009170 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009171 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009172
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009173 if (PyUnicode_READY(self) == -1)
9174 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009175
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009176 if (substring == NULL)
9177 switch(PyUnicode_KIND(self)) {
9178 case PyUnicode_1BYTE_KIND:
9179 return ucs1lib_split_whitespace(
9180 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9181 PyUnicode_GET_LENGTH(self), maxcount
9182 );
9183 case PyUnicode_2BYTE_KIND:
9184 return ucs2lib_split_whitespace(
9185 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
9186 PyUnicode_GET_LENGTH(self), maxcount
9187 );
9188 case PyUnicode_4BYTE_KIND:
9189 return ucs4lib_split_whitespace(
9190 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
9191 PyUnicode_GET_LENGTH(self), maxcount
9192 );
9193 default:
9194 assert(0);
9195 return NULL;
9196 }
9197
9198 if (PyUnicode_READY(substring) == -1)
9199 return NULL;
9200
9201 kind1 = PyUnicode_KIND(self);
9202 kind2 = PyUnicode_KIND(substring);
9203 kind = kind1 > kind2 ? kind1 : kind2;
9204 buf1 = PyUnicode_DATA(self);
9205 buf2 = PyUnicode_DATA(substring);
9206 if (kind1 != kind)
9207 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9208 if (!buf1)
9209 return NULL;
9210 if (kind2 != kind)
9211 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9212 if (!buf2) {
9213 if (kind1 != kind) PyMem_Free(buf1);
9214 return NULL;
9215 }
9216 len1 = PyUnicode_GET_LENGTH(self);
9217 len2 = PyUnicode_GET_LENGTH(substring);
9218
9219 switch(kind) {
9220 case PyUnicode_1BYTE_KIND:
9221 out = ucs1lib_split(
9222 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9223 break;
9224 case PyUnicode_2BYTE_KIND:
9225 out = ucs2lib_split(
9226 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9227 break;
9228 case PyUnicode_4BYTE_KIND:
9229 out = ucs4lib_split(
9230 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9231 break;
9232 default:
9233 out = NULL;
9234 }
9235 if (kind1 != kind)
9236 PyMem_Free(buf1);
9237 if (kind2 != kind)
9238 PyMem_Free(buf2);
9239 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009240}
9241
Alexander Belopolsky40018472011-02-26 01:02:56 +00009242static PyObject *
9243rsplit(PyUnicodeObject *self,
9244 PyUnicodeObject *substring,
9245 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009246{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009247 int kind1, kind2, kind;
9248 void *buf1, *buf2;
9249 Py_ssize_t len1, len2;
9250 PyObject* out;
9251
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009252 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009253 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009254
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009255 if (PyUnicode_READY(self) == -1)
9256 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009257
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009258 if (substring == NULL)
9259 switch(PyUnicode_KIND(self)) {
9260 case PyUnicode_1BYTE_KIND:
9261 return ucs1lib_rsplit_whitespace(
9262 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9263 PyUnicode_GET_LENGTH(self), maxcount
9264 );
9265 case PyUnicode_2BYTE_KIND:
9266 return ucs2lib_rsplit_whitespace(
9267 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
9268 PyUnicode_GET_LENGTH(self), maxcount
9269 );
9270 case PyUnicode_4BYTE_KIND:
9271 return ucs4lib_rsplit_whitespace(
9272 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
9273 PyUnicode_GET_LENGTH(self), maxcount
9274 );
9275 default:
9276 assert(0);
9277 return NULL;
9278 }
9279
9280 if (PyUnicode_READY(substring) == -1)
9281 return NULL;
9282
9283 kind1 = PyUnicode_KIND(self);
9284 kind2 = PyUnicode_KIND(substring);
9285 kind = kind1 > kind2 ? kind1 : kind2;
9286 buf1 = PyUnicode_DATA(self);
9287 buf2 = PyUnicode_DATA(substring);
9288 if (kind1 != kind)
9289 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9290 if (!buf1)
9291 return NULL;
9292 if (kind2 != kind)
9293 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9294 if (!buf2) {
9295 if (kind1 != kind) PyMem_Free(buf1);
9296 return NULL;
9297 }
9298 len1 = PyUnicode_GET_LENGTH(self);
9299 len2 = PyUnicode_GET_LENGTH(substring);
9300
9301 switch(kind) {
9302 case PyUnicode_1BYTE_KIND:
9303 out = ucs1lib_rsplit(
9304 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9305 break;
9306 case PyUnicode_2BYTE_KIND:
9307 out = ucs2lib_rsplit(
9308 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9309 break;
9310 case PyUnicode_4BYTE_KIND:
9311 out = ucs4lib_rsplit(
9312 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9313 break;
9314 default:
9315 out = NULL;
9316 }
9317 if (kind1 != kind)
9318 PyMem_Free(buf1);
9319 if (kind2 != kind)
9320 PyMem_Free(buf2);
9321 return out;
9322}
9323
9324static Py_ssize_t
9325anylib_find(int kind, void *buf1, Py_ssize_t len1,
9326 void *buf2, Py_ssize_t len2, Py_ssize_t offset)
9327{
9328 switch(kind) {
9329 case PyUnicode_1BYTE_KIND:
9330 return ucs1lib_find(buf1, len1, buf2, len2, offset);
9331 case PyUnicode_2BYTE_KIND:
9332 return ucs2lib_find(buf1, len1, buf2, len2, offset);
9333 case PyUnicode_4BYTE_KIND:
9334 return ucs4lib_find(buf1, len1, buf2, len2, offset);
9335 }
9336 assert(0);
9337 return -1;
9338}
9339
9340static Py_ssize_t
9341anylib_count(int kind, void* sbuf, Py_ssize_t slen,
9342 void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
9343{
9344 switch(kind) {
9345 case PyUnicode_1BYTE_KIND:
9346 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
9347 case PyUnicode_2BYTE_KIND:
9348 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
9349 case PyUnicode_4BYTE_KIND:
9350 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
9351 }
9352 assert(0);
9353 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009354}
9355
Alexander Belopolsky40018472011-02-26 01:02:56 +00009356static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009357replace(PyObject *self, PyObject *str1,
9358 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009359{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009360 PyObject *u;
9361 char *sbuf = PyUnicode_DATA(self);
9362 char *buf1 = PyUnicode_DATA(str1);
9363 char *buf2 = PyUnicode_DATA(str2);
9364 int srelease = 0, release1 = 0, release2 = 0;
9365 int skind = PyUnicode_KIND(self);
9366 int kind1 = PyUnicode_KIND(str1);
9367 int kind2 = PyUnicode_KIND(str2);
9368 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
9369 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
9370 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009371
9372 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009373 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009374 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009375 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009376
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009377 if (skind < kind1)
9378 /* substring too wide to be present */
9379 goto nothing;
9380
9381 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00009382 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009383 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009384 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009385 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009386 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009387 /* replace characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009388 Py_UCS4 u1, u2, maxchar;
9389 int mayshrink, rkind;
9390 u1 = PyUnicode_READ_CHAR(str1, 0);
9391 if (!findchar(sbuf, PyUnicode_KIND(self),
9392 slen, u1, 1))
Thomas Wouters477c8d52006-05-27 19:21:47 +00009393 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009394 u2 = PyUnicode_READ_CHAR(str2, 0);
9395 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9396 /* Replacing u1 with u2 may cause a maxchar reduction in the
9397 result string. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009398 if (u2 > maxchar) {
9399 maxchar = u2;
9400 mayshrink = 0;
9401 }
Victor Stinnerb9275c12011-10-05 14:01:42 +02009402 else
9403 mayshrink = maxchar > 127;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009404 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009405 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009406 goto error;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009407 if (PyUnicode_CopyCharacters(u, 0,
9408 (PyObject*)self, 0, slen) < 0)
9409 {
9410 Py_DECREF(u);
9411 return NULL;
9412 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009413 rkind = PyUnicode_KIND(u);
9414 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
9415 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009416 if (--maxcount < 0)
9417 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009418 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009419 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009420 if (mayshrink) {
9421 PyObject *tmp = u;
9422 u = PyUnicode_FromKindAndData(rkind, PyUnicode_DATA(tmp),
9423 PyUnicode_GET_LENGTH(tmp));
9424 Py_DECREF(tmp);
9425 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009426 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009427 int rkind = skind;
9428 char *res;
9429 if (kind1 < rkind) {
9430 /* widen substring */
9431 buf1 = _PyUnicode_AsKind(str1, rkind);
9432 if (!buf1) goto error;
9433 release1 = 1;
9434 }
9435 i = anylib_find(rkind, sbuf, slen, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009436 if (i < 0)
9437 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009438 if (rkind > kind2) {
9439 /* widen replacement */
9440 buf2 = _PyUnicode_AsKind(str2, rkind);
9441 if (!buf2) goto error;
9442 release2 = 1;
9443 }
9444 else if (rkind < kind2) {
9445 /* widen self and buf1 */
9446 rkind = kind2;
9447 if (release1) PyMem_Free(buf1);
9448 sbuf = _PyUnicode_AsKind(self, rkind);
9449 if (!sbuf) goto error;
9450 srelease = 1;
9451 buf1 = _PyUnicode_AsKind(str1, rkind);
9452 if (!buf1) goto error;
9453 release1 = 1;
9454 }
9455 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, slen));
9456 if (!res) {
9457 PyErr_NoMemory();
9458 goto error;
9459 }
9460 memcpy(res, sbuf, PyUnicode_KIND_SIZE(rkind, slen));
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009461 /* change everything in-place, starting with this one */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009462 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
9463 buf2,
9464 PyUnicode_KIND_SIZE(rkind, len2));
9465 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009466
9467 while ( --maxcount > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009468 i = anylib_find(rkind, sbuf+PyUnicode_KIND_SIZE(rkind, i),
9469 slen-i,
9470 buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009471 if (i == -1)
9472 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009473 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
9474 buf2,
9475 PyUnicode_KIND_SIZE(rkind, len2));
9476 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009477 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009478
9479 u = PyUnicode_FromKindAndData(rkind, res, slen);
9480 PyMem_Free(res);
9481 if (!u) goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009482 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009483 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009484
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009485 Py_ssize_t n, i, j, ires;
9486 Py_ssize_t product, new_size;
9487 int rkind = skind;
9488 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009489
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009490 if (kind1 < rkind) {
9491 buf1 = _PyUnicode_AsKind(str1, rkind);
9492 if (!buf1) goto error;
9493 release1 = 1;
9494 }
9495 n = anylib_count(rkind, sbuf, slen, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009496 if (n == 0)
9497 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009498 if (kind2 < rkind) {
9499 buf2 = _PyUnicode_AsKind(str2, rkind);
9500 if (!buf2) goto error;
9501 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009502 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009503 else if (kind2 > rkind) {
9504 rkind = kind2;
9505 sbuf = _PyUnicode_AsKind(self, rkind);
9506 if (!sbuf) goto error;
9507 srelease = 1;
9508 if (release1) PyMem_Free(buf1);
9509 buf1 = _PyUnicode_AsKind(str1, rkind);
9510 if (!buf1) goto error;
9511 release1 = 1;
9512 }
9513 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
9514 PyUnicode_GET_LENGTH(str1))); */
9515 product = n * (len2-len1);
9516 if ((product / (len2-len1)) != n) {
9517 PyErr_SetString(PyExc_OverflowError,
9518 "replace string is too long");
9519 goto error;
9520 }
9521 new_size = slen + product;
9522 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
9523 PyErr_SetString(PyExc_OverflowError,
9524 "replace string is too long");
9525 goto error;
9526 }
9527 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, new_size));
9528 if (!res)
9529 goto error;
9530 ires = i = 0;
9531 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009532 while (n-- > 0) {
9533 /* look for next match */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009534 j = anylib_find(rkind,
9535 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9536 slen-i, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009537 if (j == -1)
9538 break;
9539 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009540 /* copy unchanged part [i:j] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009541 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9542 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9543 PyUnicode_KIND_SIZE(rkind, j-i));
9544 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009545 }
9546 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009547 if (len2 > 0) {
9548 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9549 buf2,
9550 PyUnicode_KIND_SIZE(rkind, len2));
9551 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009552 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009553 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009554 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009555 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009556 /* copy tail [i:] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009557 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9558 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9559 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009560 } else {
9561 /* interleave */
9562 while (n > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009563 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9564 buf2,
9565 PyUnicode_KIND_SIZE(rkind, len2));
9566 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009567 if (--n <= 0)
9568 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009569 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9570 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9571 PyUnicode_KIND_SIZE(rkind, 1));
9572 ires++;
9573 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009574 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009575 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9576 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9577 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009578 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009579 u = PyUnicode_FromKindAndData(rkind, res, new_size);
Martin v. Löwis0b1d3482011-10-01 16:35:40 +02009580 PyMem_Free(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009581 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009582 if (srelease)
9583 PyMem_FREE(sbuf);
9584 if (release1)
9585 PyMem_FREE(buf1);
9586 if (release2)
9587 PyMem_FREE(buf2);
9588 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009589
Benjamin Peterson29060642009-01-31 22:14:21 +00009590 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00009591 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009592 if (srelease)
9593 PyMem_FREE(sbuf);
9594 if (release1)
9595 PyMem_FREE(buf1);
9596 if (release2)
9597 PyMem_FREE(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009598 if (PyUnicode_CheckExact(self)) {
9599 Py_INCREF(self);
9600 return (PyObject *) self;
9601 }
Victor Stinner034f6cf2011-09-30 02:26:44 +02009602 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009603 error:
9604 if (srelease && sbuf)
9605 PyMem_FREE(sbuf);
9606 if (release1 && buf1)
9607 PyMem_FREE(buf1);
9608 if (release2 && buf2)
9609 PyMem_FREE(buf2);
9610 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009611}
9612
9613/* --- Unicode Object Methods --------------------------------------------- */
9614
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009615PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009616 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009617\n\
9618Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009619characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009620
9621static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009622unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009623{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009624 return fixup(self, fixtitle);
9625}
9626
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009627PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009628 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009629\n\
9630Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00009631have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009632
9633static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009634unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009635{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009636 return fixup(self, fixcapitalize);
9637}
9638
9639#if 0
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009640PyDoc_STRVAR(capwords__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009641 "S.capwords() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009642\n\
9643Apply .capitalize() to all words in S and return the result with\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009644normalized whitespace (all whitespace strings are replaced by ' ').");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009645
9646static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009647unicode_capwords(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009648{
9649 PyObject *list;
9650 PyObject *item;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009651 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009652
Guido van Rossumd57fd912000-03-10 22:53:23 +00009653 /* Split into words */
9654 list = split(self, NULL, -1);
9655 if (!list)
9656 return NULL;
9657
9658 /* Capitalize each word */
9659 for (i = 0; i < PyList_GET_SIZE(list); i++) {
9660 item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
Benjamin Peterson29060642009-01-31 22:14:21 +00009661 fixcapitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009662 if (item == NULL)
9663 goto onError;
9664 Py_DECREF(PyList_GET_ITEM(list, i));
9665 PyList_SET_ITEM(list, i, item);
9666 }
9667
9668 /* Join the words to form a new string */
9669 item = PyUnicode_Join(NULL, list);
9670
Benjamin Peterson29060642009-01-31 22:14:21 +00009671 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009672 Py_DECREF(list);
9673 return (PyObject *)item;
9674}
9675#endif
9676
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009677/* Argument converter. Coerces to a single unicode character */
9678
9679static int
9680convert_uc(PyObject *obj, void *addr)
9681{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009682 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009683 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009684
Benjamin Peterson14339b62009-01-31 16:36:08 +00009685 uniobj = PyUnicode_FromObject(obj);
9686 if (uniobj == NULL) {
9687 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009688 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009689 return 0;
9690 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009691 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009692 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009693 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009694 Py_DECREF(uniobj);
9695 return 0;
9696 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009697 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009698 Py_DECREF(uniobj);
9699 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009700}
9701
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009702PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009703 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009704\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009705Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009706done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009707
9708static PyObject *
9709unicode_center(PyUnicodeObject *self, PyObject *args)
9710{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009711 Py_ssize_t marg, left;
9712 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009713 Py_UCS4 fillchar = ' ';
9714
Victor Stinnere9a29352011-10-01 02:14:59 +02009715 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009716 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009717
Victor Stinnere9a29352011-10-01 02:14:59 +02009718 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009719 return NULL;
9720
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009721 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009722 Py_INCREF(self);
9723 return (PyObject*) self;
9724 }
9725
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009726 marg = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009727 left = marg / 2 + (marg & width & 1);
9728
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009729 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009730}
9731
Marc-André Lemburge5034372000-08-08 08:04:29 +00009732#if 0
9733
9734/* This code should go into some future Unicode collation support
9735 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00009736 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00009737
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009738/* speedy UTF-16 code point order comparison */
9739/* gleaned from: */
9740/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
9741
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009742static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009743{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009744 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00009745 0, 0, 0, 0, 0, 0, 0, 0,
9746 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009747 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009748};
9749
Guido van Rossumd57fd912000-03-10 22:53:23 +00009750static int
9751unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9752{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009753 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009754
Guido van Rossumd57fd912000-03-10 22:53:23 +00009755 Py_UNICODE *s1 = str1->str;
9756 Py_UNICODE *s2 = str2->str;
9757
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009758 len1 = str1->_base._base.length;
9759 len2 = str2->_base._base.length;
Tim Petersced69f82003-09-16 20:30:58 +00009760
Guido van Rossumd57fd912000-03-10 22:53:23 +00009761 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00009762 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009763
9764 c1 = *s1++;
9765 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00009766
Benjamin Peterson29060642009-01-31 22:14:21 +00009767 if (c1 > (1<<11) * 26)
9768 c1 += utf16Fixup[c1>>11];
9769 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009770 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009771 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00009772
9773 if (c1 != c2)
9774 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00009775
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009776 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009777 }
9778
9779 return (len1 < len2) ? -1 : (len1 != len2);
9780}
9781
Marc-André Lemburge5034372000-08-08 08:04:29 +00009782#else
9783
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009784/* This function assumes that str1 and str2 are readied by the caller. */
9785
Marc-André Lemburge5034372000-08-08 08:04:29 +00009786static int
9787unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9788{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009789 int kind1, kind2;
9790 void *data1, *data2;
9791 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009792
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009793 kind1 = PyUnicode_KIND(str1);
9794 kind2 = PyUnicode_KIND(str2);
9795 data1 = PyUnicode_DATA(str1);
9796 data2 = PyUnicode_DATA(str2);
9797 len1 = PyUnicode_GET_LENGTH(str1);
9798 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +00009799
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009800 for (i = 0; i < len1 && i < len2; ++i) {
9801 Py_UCS4 c1, c2;
9802 c1 = PyUnicode_READ(kind1, data1, i);
9803 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +00009804
9805 if (c1 != c2)
9806 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009807 }
9808
9809 return (len1 < len2) ? -1 : (len1 != len2);
9810}
9811
9812#endif
9813
Alexander Belopolsky40018472011-02-26 01:02:56 +00009814int
9815PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009816{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009817 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9818 if (PyUnicode_READY(left) == -1 ||
9819 PyUnicode_READY(right) == -1)
9820 return -1;
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009821 return unicode_compare((PyUnicodeObject *)left,
9822 (PyUnicodeObject *)right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009823 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009824 PyErr_Format(PyExc_TypeError,
9825 "Can't compare %.100s and %.100s",
9826 left->ob_type->tp_name,
9827 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009828 return -1;
9829}
9830
Martin v. Löwis5b222132007-06-10 09:51:05 +00009831int
9832PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
9833{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009834 Py_ssize_t i;
9835 int kind;
9836 void *data;
9837 Py_UCS4 chr;
9838
Victor Stinner910337b2011-10-03 03:20:16 +02009839 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009840 if (PyUnicode_READY(uni) == -1)
9841 return -1;
9842 kind = PyUnicode_KIND(uni);
9843 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +00009844 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009845 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
9846 if (chr != str[i])
9847 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00009848 /* This check keeps Python strings that end in '\0' from comparing equal
9849 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009850 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +00009851 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009852 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00009853 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009854 return 0;
9855}
9856
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009857
Benjamin Peterson29060642009-01-31 22:14:21 +00009858#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00009859 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009860
Alexander Belopolsky40018472011-02-26 01:02:56 +00009861PyObject *
9862PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009863{
9864 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009865
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009866 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9867 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009868 if (PyUnicode_READY(left) == -1 ||
9869 PyUnicode_READY(right) == -1)
9870 return NULL;
9871 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
9872 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009873 if (op == Py_EQ) {
9874 Py_INCREF(Py_False);
9875 return Py_False;
9876 }
9877 if (op == Py_NE) {
9878 Py_INCREF(Py_True);
9879 return Py_True;
9880 }
9881 }
9882 if (left == right)
9883 result = 0;
9884 else
9885 result = unicode_compare((PyUnicodeObject *)left,
9886 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009887
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009888 /* Convert the return value to a Boolean */
9889 switch (op) {
9890 case Py_EQ:
9891 v = TEST_COND(result == 0);
9892 break;
9893 case Py_NE:
9894 v = TEST_COND(result != 0);
9895 break;
9896 case Py_LE:
9897 v = TEST_COND(result <= 0);
9898 break;
9899 case Py_GE:
9900 v = TEST_COND(result >= 0);
9901 break;
9902 case Py_LT:
9903 v = TEST_COND(result == -1);
9904 break;
9905 case Py_GT:
9906 v = TEST_COND(result == 1);
9907 break;
9908 default:
9909 PyErr_BadArgument();
9910 return NULL;
9911 }
9912 Py_INCREF(v);
9913 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009914 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00009915
Brian Curtindfc80e32011-08-10 20:28:54 -05009916 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009917}
9918
Alexander Belopolsky40018472011-02-26 01:02:56 +00009919int
9920PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00009921{
Thomas Wouters477c8d52006-05-27 19:21:47 +00009922 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009923 int kind1, kind2, kind;
9924 void *buf1, *buf2;
9925 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009926 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009927
9928 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00009929 sub = PyUnicode_FromObject(element);
9930 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009931 PyErr_Format(PyExc_TypeError,
9932 "'in <string>' requires string as left operand, not %s",
9933 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009934 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009935 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009936 if (PyUnicode_READY(sub) == -1)
9937 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009938
Thomas Wouters477c8d52006-05-27 19:21:47 +00009939 str = PyUnicode_FromObject(container);
Victor Stinnere9a29352011-10-01 02:14:59 +02009940 if (!str || PyUnicode_READY(str) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009941 Py_DECREF(sub);
9942 return -1;
9943 }
9944
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009945 kind1 = PyUnicode_KIND(str);
9946 kind2 = PyUnicode_KIND(sub);
9947 kind = kind1 > kind2 ? kind1 : kind2;
9948 buf1 = PyUnicode_DATA(str);
9949 buf2 = PyUnicode_DATA(sub);
9950 if (kind1 != kind)
9951 buf1 = _PyUnicode_AsKind((PyObject*)str, kind);
9952 if (!buf1) {
9953 Py_DECREF(sub);
9954 return -1;
9955 }
9956 if (kind2 != kind)
9957 buf2 = _PyUnicode_AsKind((PyObject*)sub, kind);
9958 if (!buf2) {
9959 Py_DECREF(sub);
9960 if (kind1 != kind) PyMem_Free(buf1);
9961 return -1;
9962 }
9963 len1 = PyUnicode_GET_LENGTH(str);
9964 len2 = PyUnicode_GET_LENGTH(sub);
9965
9966 switch(kind) {
9967 case PyUnicode_1BYTE_KIND:
9968 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
9969 break;
9970 case PyUnicode_2BYTE_KIND:
9971 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
9972 break;
9973 case PyUnicode_4BYTE_KIND:
9974 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
9975 break;
9976 default:
9977 result = -1;
9978 assert(0);
9979 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009980
9981 Py_DECREF(str);
9982 Py_DECREF(sub);
9983
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009984 if (kind1 != kind)
9985 PyMem_Free(buf1);
9986 if (kind2 != kind)
9987 PyMem_Free(buf2);
9988
Guido van Rossum403d68b2000-03-13 15:55:09 +00009989 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009990}
9991
Guido van Rossumd57fd912000-03-10 22:53:23 +00009992/* Concat to string or Unicode object giving a new Unicode object. */
9993
Alexander Belopolsky40018472011-02-26 01:02:56 +00009994PyObject *
9995PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009996{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009997 PyObject *u = NULL, *v = NULL, *w;
9998 Py_UCS4 maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009999
10000 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010001 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010002 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010003 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010004 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010005 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010006 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010007
10008 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010009 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010010 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010011 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010012 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010013 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010014 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010015 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010016 }
10017
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010018 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinnerff9e50f2011-09-28 22:17:19 +020010019 maxchar = Py_MAX(maxchar, PyUnicode_MAX_CHAR_VALUE(v));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010020
Guido van Rossumd57fd912000-03-10 22:53:23 +000010021 /* Concat the two Unicode strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010022 w = PyUnicode_New(
10023 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
10024 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010025 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010026 goto onError;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010027 if (PyUnicode_CopyCharacters(w, 0, u, 0, PyUnicode_GET_LENGTH(u)) < 0)
10028 goto onError;
Victor Stinner157f83f2011-09-28 21:41:31 +020010029 if (PyUnicode_CopyCharacters(w, PyUnicode_GET_LENGTH(u),
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010030 v, 0,
10031 PyUnicode_GET_LENGTH(v)) < 0)
10032 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010033 Py_DECREF(u);
10034 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010035 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010036
Benjamin Peterson29060642009-01-31 22:14:21 +000010037 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010038 Py_XDECREF(u);
10039 Py_XDECREF(v);
10040 return NULL;
10041}
10042
Victor Stinnerb0923652011-10-04 01:17:31 +020010043static void
10044unicode_append_inplace(PyObject **p_left, PyObject *right)
10045{
10046 Py_ssize_t left_len, right_len, new_len;
10047#ifdef Py_DEBUG
10048 Py_ssize_t copied;
10049#endif
10050
10051 assert(PyUnicode_IS_READY(*p_left));
10052 assert(PyUnicode_IS_READY(right));
10053
10054 left_len = PyUnicode_GET_LENGTH(*p_left);
10055 right_len = PyUnicode_GET_LENGTH(right);
10056 if (left_len > PY_SSIZE_T_MAX - right_len) {
10057 PyErr_SetString(PyExc_OverflowError,
10058 "strings are too large to concat");
10059 goto error;
10060 }
10061 new_len = left_len + right_len;
10062
10063 /* Now we own the last reference to 'left', so we can resize it
10064 * in-place.
10065 */
10066 if (unicode_resize(p_left, new_len) != 0) {
10067 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10068 * deallocated so it cannot be put back into
10069 * 'variable'. The MemoryError is raised when there
10070 * is no value in 'variable', which might (very
10071 * remotely) be a cause of incompatibilities.
10072 */
10073 goto error;
10074 }
10075 /* copy 'right' into the newly allocated area of 'left' */
10076#ifdef Py_DEBUG
10077 copied = PyUnicode_CopyCharacters(*p_left, left_len,
10078 right, 0,
10079 right_len);
10080 assert(0 <= copied);
10081#else
10082 PyUnicode_CopyCharacters(*p_left, left_len, right, 0, right_len);
10083#endif
10084 return;
10085
10086error:
10087 Py_DECREF(*p_left);
10088 *p_left = NULL;
10089}
10090
Walter Dörwald1ab83302007-05-18 17:15:44 +000010091void
Victor Stinner23e56682011-10-03 03:54:37 +020010092PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010093{
Victor Stinner23e56682011-10-03 03:54:37 +020010094 PyObject *left, *res;
10095
10096 if (p_left == NULL) {
10097 if (!PyErr_Occurred())
10098 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010099 return;
10100 }
Victor Stinner23e56682011-10-03 03:54:37 +020010101 left = *p_left;
10102 if (right == NULL || !PyUnicode_Check(left)) {
10103 if (!PyErr_Occurred())
10104 PyErr_BadInternalCall();
10105 goto error;
10106 }
10107
Victor Stinnere1335c72011-10-04 20:53:03 +020010108 if (PyUnicode_READY(left))
10109 goto error;
10110 if (PyUnicode_READY(right))
10111 goto error;
10112
Victor Stinner23e56682011-10-03 03:54:37 +020010113 if (PyUnicode_CheckExact(left) && left != unicode_empty
10114 && PyUnicode_CheckExact(right) && right != unicode_empty
10115 && unicode_resizable(left)
10116 && (_PyUnicode_KIND(right) <= _PyUnicode_KIND(left)
10117 || _PyUnicode_WSTR(left) != NULL))
10118 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010119 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10120 to change the structure size, but characters are stored just after
10121 the structure, and so it requires to move all charactres which is
10122 not so different than duplicating the string. */
10123 if (!(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
Victor Stinner23e56682011-10-03 03:54:37 +020010124 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010125 unicode_append_inplace(p_left, right);
Victor Stinner23e56682011-10-03 03:54:37 +020010126 return;
10127 }
10128 }
10129
10130 res = PyUnicode_Concat(left, right);
10131 if (res == NULL)
10132 goto error;
10133 Py_DECREF(left);
10134 *p_left = res;
10135 return;
10136
10137error:
10138 Py_DECREF(*p_left);
10139 *p_left = NULL;
Walter Dörwald1ab83302007-05-18 17:15:44 +000010140}
10141
10142void
10143PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10144{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010145 PyUnicode_Append(pleft, right);
10146 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010147}
10148
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010149PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010150 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010151\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010152Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010153string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010154interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010155
10156static PyObject *
10157unicode_count(PyUnicodeObject *self, PyObject *args)
10158{
10159 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010160 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010161 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010162 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010163 int kind1, kind2, kind;
10164 void *buf1, *buf2;
10165 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010166
Jesus Ceaac451502011-04-20 17:09:23 +020010167 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10168 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010169 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010170
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010171 kind1 = PyUnicode_KIND(self);
10172 kind2 = PyUnicode_KIND(substring);
10173 kind = kind1 > kind2 ? kind1 : kind2;
10174 buf1 = PyUnicode_DATA(self);
10175 buf2 = PyUnicode_DATA(substring);
10176 if (kind1 != kind)
10177 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
10178 if (!buf1) {
10179 Py_DECREF(substring);
10180 return NULL;
10181 }
10182 if (kind2 != kind)
10183 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
10184 if (!buf2) {
10185 Py_DECREF(substring);
10186 if (kind1 != kind) PyMem_Free(buf1);
10187 return NULL;
10188 }
10189 len1 = PyUnicode_GET_LENGTH(self);
10190 len2 = PyUnicode_GET_LENGTH(substring);
10191
10192 ADJUST_INDICES(start, end, len1);
10193 switch(kind) {
10194 case PyUnicode_1BYTE_KIND:
10195 iresult = ucs1lib_count(
10196 ((Py_UCS1*)buf1) + start, end - start,
10197 buf2, len2, PY_SSIZE_T_MAX
10198 );
10199 break;
10200 case PyUnicode_2BYTE_KIND:
10201 iresult = ucs2lib_count(
10202 ((Py_UCS2*)buf1) + start, end - start,
10203 buf2, len2, PY_SSIZE_T_MAX
10204 );
10205 break;
10206 case PyUnicode_4BYTE_KIND:
10207 iresult = ucs4lib_count(
10208 ((Py_UCS4*)buf1) + start, end - start,
10209 buf2, len2, PY_SSIZE_T_MAX
10210 );
10211 break;
10212 default:
10213 assert(0); iresult = 0;
10214 }
10215
10216 result = PyLong_FromSsize_t(iresult);
10217
10218 if (kind1 != kind)
10219 PyMem_Free(buf1);
10220 if (kind2 != kind)
10221 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010222
10223 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010224
Guido van Rossumd57fd912000-03-10 22:53:23 +000010225 return result;
10226}
10227
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010228PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010229 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010230\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010231Encode S using the codec registered for encoding. Default encoding\n\
10232is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010233handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010234a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10235'xmlcharrefreplace' as well as any other name registered with\n\
10236codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010237
10238static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +000010239unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010240{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010241 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010242 char *encoding = NULL;
10243 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010244
Benjamin Peterson308d6372009-09-18 21:42:35 +000010245 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10246 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010247 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +000010248 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010249}
10250
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010251PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010252 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010253\n\
10254Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010255If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010256
10257static PyObject*
10258unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
10259{
Antoine Pitroue71d5742011-10-04 15:55:09 +020010260 Py_ssize_t i, j, line_pos, src_len, incr;
10261 Py_UCS4 ch;
10262 PyObject *u;
10263 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010264 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010265 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010266 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010267
10268 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010269 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010270
Antoine Pitrou22425222011-10-04 19:10:51 +020010271 if (PyUnicode_READY(self) == -1)
10272 return NULL;
10273
Thomas Wouters7e474022000-07-16 12:04:32 +000010274 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010275 src_len = PyUnicode_GET_LENGTH(self);
10276 i = j = line_pos = 0;
10277 kind = PyUnicode_KIND(self);
10278 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020010279 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010280 for (; i < src_len; i++) {
10281 ch = PyUnicode_READ(kind, src_data, i);
10282 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020010283 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000010284 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010285 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000010286 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010287 goto overflow;
10288 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010289 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010290 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010291 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010292 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000010293 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010294 goto overflow;
10295 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010296 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010297 if (ch == '\n' || ch == '\r')
10298 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010299 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010300 }
Antoine Pitroue19aa382011-10-04 16:04:01 +020010301 if (!found && PyUnicode_CheckExact(self)) {
10302 Py_INCREF((PyObject *) self);
10303 return (PyObject *) self;
10304 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +000010305
Guido van Rossumd57fd912000-03-10 22:53:23 +000010306 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010307 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010308 if (!u)
10309 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010310 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010311
Antoine Pitroue71d5742011-10-04 15:55:09 +020010312 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010313
Antoine Pitroue71d5742011-10-04 15:55:09 +020010314 for (; i < src_len; i++) {
10315 ch = PyUnicode_READ(kind, src_data, i);
10316 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010317 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010318 incr = tabsize - (line_pos % tabsize);
10319 line_pos += incr;
10320 while (incr--) {
10321 PyUnicode_WRITE(kind, dest_data, j, ' ');
10322 j++;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010323 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010324 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010325 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010326 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010327 line_pos++;
10328 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010329 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010330 if (ch == '\n' || ch == '\r')
10331 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010332 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010333 }
10334 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinner17efeed2011-10-04 20:05:46 +020010335#ifndef DONT_MAKE_RESULT_READY
10336 if (_PyUnicode_READY_REPLACE(&u)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010337 Py_DECREF(u);
10338 return NULL;
10339 }
Victor Stinner17efeed2011-10-04 20:05:46 +020010340#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +000010341 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010342
Antoine Pitroue71d5742011-10-04 15:55:09 +020010343 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010344 PyErr_SetString(PyExc_OverflowError, "new string is too long");
10345 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010346}
10347
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010348PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010349 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010350\n\
10351Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010352such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010353arguments start and end are interpreted as in slice notation.\n\
10354\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010355Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010356
10357static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010358unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010359{
Jesus Ceaac451502011-04-20 17:09:23 +020010360 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010361 Py_ssize_t start;
10362 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010363 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010364
Jesus Ceaac451502011-04-20 17:09:23 +020010365 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
10366 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010367 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010368
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010369 if (PyUnicode_READY(self) == -1)
10370 return NULL;
10371 if (PyUnicode_READY(substring) == -1)
10372 return NULL;
10373
10374 result = any_find_slice(
10375 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
10376 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010377 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010378
10379 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010380
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010381 if (result == -2)
10382 return NULL;
10383
Christian Heimes217cfd12007-12-02 14:31:20 +000010384 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010385}
10386
10387static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010388unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010389{
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010390 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
10391 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010392 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010393 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010394}
10395
Guido van Rossumc2504932007-09-18 19:42:40 +000010396/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010010397 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000010398static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +000010399unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010400{
Guido van Rossumc2504932007-09-18 19:42:40 +000010401 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010010402 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010403
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010404 if (_PyUnicode_HASH(self) != -1)
10405 return _PyUnicode_HASH(self);
10406 if (PyUnicode_READY(self) == -1)
10407 return -1;
10408 len = PyUnicode_GET_LENGTH(self);
10409
10410 /* The hash function as a macro, gets expanded three times below. */
10411#define HASH(P) \
10412 x = (Py_uhash_t)*P << 7; \
10413 while (--len >= 0) \
10414 x = (1000003*x) ^ (Py_uhash_t)*P++;
10415
10416 switch (PyUnicode_KIND(self)) {
10417 case PyUnicode_1BYTE_KIND: {
10418 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
10419 HASH(c);
10420 break;
10421 }
10422 case PyUnicode_2BYTE_KIND: {
10423 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
10424 HASH(s);
10425 break;
10426 }
10427 default: {
10428 Py_UCS4 *l;
10429 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
10430 "Impossible switch case in unicode_hash");
10431 l = PyUnicode_4BYTE_DATA(self);
10432 HASH(l);
10433 break;
10434 }
10435 }
10436 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
10437
Guido van Rossumc2504932007-09-18 19:42:40 +000010438 if (x == -1)
10439 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010440 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010441 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010442}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010443#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000010444
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010445PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010446 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010447\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010448Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010449
10450static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010451unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010452{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010453 Py_ssize_t result;
Jesus Ceaac451502011-04-20 17:09:23 +020010454 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010455 Py_ssize_t start;
10456 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010457
Jesus Ceaac451502011-04-20 17:09:23 +020010458 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
10459 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010460 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010461
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010462 if (PyUnicode_READY(self) == -1)
10463 return NULL;
10464 if (PyUnicode_READY(substring) == -1)
10465 return NULL;
10466
10467 result = any_find_slice(
10468 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
10469 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010470 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010471
10472 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010473
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010474 if (result == -2)
10475 return NULL;
10476
Guido van Rossumd57fd912000-03-10 22:53:23 +000010477 if (result < 0) {
10478 PyErr_SetString(PyExc_ValueError, "substring not found");
10479 return NULL;
10480 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010481
Christian Heimes217cfd12007-12-02 14:31:20 +000010482 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010483}
10484
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010485PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010486 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010487\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010488Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010489at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010490
10491static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010492unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010493{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010494 Py_ssize_t i, length;
10495 int kind;
10496 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010497 int cased;
10498
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010499 if (PyUnicode_READY(self) == -1)
10500 return NULL;
10501 length = PyUnicode_GET_LENGTH(self);
10502 kind = PyUnicode_KIND(self);
10503 data = PyUnicode_DATA(self);
10504
Guido van Rossumd57fd912000-03-10 22:53:23 +000010505 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010506 if (length == 1)
10507 return PyBool_FromLong(
10508 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010509
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010510 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010511 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010512 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010513
Guido van Rossumd57fd912000-03-10 22:53:23 +000010514 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010515 for (i = 0; i < length; i++) {
10516 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010517
Benjamin Peterson29060642009-01-31 22:14:21 +000010518 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
10519 return PyBool_FromLong(0);
10520 else if (!cased && Py_UNICODE_ISLOWER(ch))
10521 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010522 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010523 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010524}
10525
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010526PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010527 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010528\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010529Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010530at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010531
10532static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010533unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010534{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010535 Py_ssize_t i, length;
10536 int kind;
10537 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010538 int cased;
10539
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010540 if (PyUnicode_READY(self) == -1)
10541 return NULL;
10542 length = PyUnicode_GET_LENGTH(self);
10543 kind = PyUnicode_KIND(self);
10544 data = PyUnicode_DATA(self);
10545
Guido van Rossumd57fd912000-03-10 22:53:23 +000010546 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010547 if (length == 1)
10548 return PyBool_FromLong(
10549 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010550
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010551 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010552 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010553 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010554
Guido van Rossumd57fd912000-03-10 22:53:23 +000010555 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010556 for (i = 0; i < length; i++) {
10557 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010558
Benjamin Peterson29060642009-01-31 22:14:21 +000010559 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
10560 return PyBool_FromLong(0);
10561 else if (!cased && Py_UNICODE_ISUPPER(ch))
10562 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010563 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010564 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010565}
10566
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010567PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010568 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010569\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010570Return True if S is a titlecased string and there is at least one\n\
10571character in S, i.e. upper- and titlecase characters may only\n\
10572follow uncased characters and lowercase characters only cased ones.\n\
10573Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010574
10575static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010576unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010577{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010578 Py_ssize_t i, length;
10579 int kind;
10580 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010581 int cased, previous_is_cased;
10582
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010583 if (PyUnicode_READY(self) == -1)
10584 return NULL;
10585 length = PyUnicode_GET_LENGTH(self);
10586 kind = PyUnicode_KIND(self);
10587 data = PyUnicode_DATA(self);
10588
Guido van Rossumd57fd912000-03-10 22:53:23 +000010589 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010590 if (length == 1) {
10591 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10592 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
10593 (Py_UNICODE_ISUPPER(ch) != 0));
10594 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010595
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010596 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010597 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010598 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010599
Guido van Rossumd57fd912000-03-10 22:53:23 +000010600 cased = 0;
10601 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010602 for (i = 0; i < length; i++) {
10603 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010604
Benjamin Peterson29060642009-01-31 22:14:21 +000010605 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
10606 if (previous_is_cased)
10607 return PyBool_FromLong(0);
10608 previous_is_cased = 1;
10609 cased = 1;
10610 }
10611 else if (Py_UNICODE_ISLOWER(ch)) {
10612 if (!previous_is_cased)
10613 return PyBool_FromLong(0);
10614 previous_is_cased = 1;
10615 cased = 1;
10616 }
10617 else
10618 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010619 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010620 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010621}
10622
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010623PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010624 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010625\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010626Return True if all characters in S are whitespace\n\
10627and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010628
10629static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010630unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010631{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010632 Py_ssize_t i, length;
10633 int kind;
10634 void *data;
10635
10636 if (PyUnicode_READY(self) == -1)
10637 return NULL;
10638 length = PyUnicode_GET_LENGTH(self);
10639 kind = PyUnicode_KIND(self);
10640 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010641
Guido van Rossumd57fd912000-03-10 22:53:23 +000010642 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010643 if (length == 1)
10644 return PyBool_FromLong(
10645 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010646
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010647 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010648 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010649 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010650
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010651 for (i = 0; i < length; i++) {
10652 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010653 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010654 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010655 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010656 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010657}
10658
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010659PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010660 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010661\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010662Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010663and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010664
10665static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010666unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010667{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010668 Py_ssize_t i, length;
10669 int kind;
10670 void *data;
10671
10672 if (PyUnicode_READY(self) == -1)
10673 return NULL;
10674 length = PyUnicode_GET_LENGTH(self);
10675 kind = PyUnicode_KIND(self);
10676 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010677
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010678 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010679 if (length == 1)
10680 return PyBool_FromLong(
10681 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010682
10683 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010684 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010685 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010686
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010687 for (i = 0; i < length; i++) {
10688 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010689 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010690 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010691 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010692}
10693
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010694PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010695 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010696\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010697Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010698and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010699
10700static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010701unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010702{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010703 int kind;
10704 void *data;
10705 Py_ssize_t len, i;
10706
10707 if (PyUnicode_READY(self) == -1)
10708 return NULL;
10709
10710 kind = PyUnicode_KIND(self);
10711 data = PyUnicode_DATA(self);
10712 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010713
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010714 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010715 if (len == 1) {
10716 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10717 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
10718 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010719
10720 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010721 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010722 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010723
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010724 for (i = 0; i < len; i++) {
10725 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010726 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010727 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010728 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010729 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010730}
10731
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010732PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010733 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010734\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010735Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010736False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010737
10738static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010739unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010740{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010741 Py_ssize_t i, length;
10742 int kind;
10743 void *data;
10744
10745 if (PyUnicode_READY(self) == -1)
10746 return NULL;
10747 length = PyUnicode_GET_LENGTH(self);
10748 kind = PyUnicode_KIND(self);
10749 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010750
Guido van Rossumd57fd912000-03-10 22:53:23 +000010751 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010752 if (length == 1)
10753 return PyBool_FromLong(
10754 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010755
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010756 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010757 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010758 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010759
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010760 for (i = 0; i < length; i++) {
10761 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010762 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010763 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010764 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010765}
10766
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010767PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010768 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010769\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010770Return True if all characters in S are digits\n\
10771and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010772
10773static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010774unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010775{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010776 Py_ssize_t i, length;
10777 int kind;
10778 void *data;
10779
10780 if (PyUnicode_READY(self) == -1)
10781 return NULL;
10782 length = PyUnicode_GET_LENGTH(self);
10783 kind = PyUnicode_KIND(self);
10784 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010785
Guido van Rossumd57fd912000-03-10 22:53:23 +000010786 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010787 if (length == 1) {
10788 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10789 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
10790 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010791
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010792 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010793 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010794 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010795
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010796 for (i = 0; i < length; i++) {
10797 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010798 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010799 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010800 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010801}
10802
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010803PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010804 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010805\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010806Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010807False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010808
10809static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010810unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010811{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010812 Py_ssize_t i, length;
10813 int kind;
10814 void *data;
10815
10816 if (PyUnicode_READY(self) == -1)
10817 return NULL;
10818 length = PyUnicode_GET_LENGTH(self);
10819 kind = PyUnicode_KIND(self);
10820 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010821
Guido van Rossumd57fd912000-03-10 22:53:23 +000010822 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010823 if (length == 1)
10824 return PyBool_FromLong(
10825 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010826
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010827 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010828 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010829 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010830
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010831 for (i = 0; i < length; i++) {
10832 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010833 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010834 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010835 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010836}
10837
Martin v. Löwis47383402007-08-15 07:32:56 +000010838int
10839PyUnicode_IsIdentifier(PyObject *self)
10840{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010841 int kind;
10842 void *data;
10843 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010844 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000010845
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010846 if (PyUnicode_READY(self) == -1) {
10847 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000010848 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010849 }
10850
10851 /* Special case for empty strings */
10852 if (PyUnicode_GET_LENGTH(self) == 0)
10853 return 0;
10854 kind = PyUnicode_KIND(self);
10855 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000010856
10857 /* PEP 3131 says that the first character must be in
10858 XID_Start and subsequent characters in XID_Continue,
10859 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000010860 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000010861 letters, digits, underscore). However, given the current
10862 definition of XID_Start and XID_Continue, it is sufficient
10863 to check just for these, except that _ must be allowed
10864 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010865 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050010866 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000010867 return 0;
10868
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040010869 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010870 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010871 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000010872 return 1;
10873}
10874
10875PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010876 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000010877\n\
10878Return True if S is a valid identifier according\n\
10879to the language definition.");
10880
10881static PyObject*
10882unicode_isidentifier(PyObject *self)
10883{
10884 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
10885}
10886
Georg Brandl559e5d72008-06-11 18:37:52 +000010887PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010888 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000010889\n\
10890Return True if all characters in S are considered\n\
10891printable in repr() or S is empty, False otherwise.");
10892
10893static PyObject*
10894unicode_isprintable(PyObject *self)
10895{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010896 Py_ssize_t i, length;
10897 int kind;
10898 void *data;
10899
10900 if (PyUnicode_READY(self) == -1)
10901 return NULL;
10902 length = PyUnicode_GET_LENGTH(self);
10903 kind = PyUnicode_KIND(self);
10904 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000010905
10906 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010907 if (length == 1)
10908 return PyBool_FromLong(
10909 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000010910
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010911 for (i = 0; i < length; i++) {
10912 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000010913 Py_RETURN_FALSE;
10914 }
10915 }
10916 Py_RETURN_TRUE;
10917}
10918
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010919PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000010920 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010921\n\
10922Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000010923iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010924
10925static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010926unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010927{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010928 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010929}
10930
Martin v. Löwis18e16552006-02-15 17:27:45 +000010931static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +000010932unicode_length(PyUnicodeObject *self)
10933{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010934 if (PyUnicode_READY(self) == -1)
10935 return -1;
10936 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010937}
10938
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010939PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010940 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010941\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000010942Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010943done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010944
10945static PyObject *
10946unicode_ljust(PyUnicodeObject *self, PyObject *args)
10947{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010948 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010949 Py_UCS4 fillchar = ' ';
10950
10951 if (PyUnicode_READY(self) == -1)
10952 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010953
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010954 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010955 return NULL;
10956
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010957 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010958 Py_INCREF(self);
10959 return (PyObject*) self;
10960 }
10961
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010962 return (PyObject*) pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010963}
10964
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010965PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010966 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010967\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010968Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010969
10970static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010971unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010972{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010973 return fixup(self, fixlower);
10974}
10975
/* Strip modes.  The values double as indexes into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple() format strings, indexed by the strip mode above */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Skip the three leading characters ("|O:") to get the bare method name */
#define STRIPNAME(i) (stripformat[i]+3)
10984
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010985/* externally visible for str.strip(unicode) */
10986PyObject *
10987_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
10988{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010989 void *data;
10990 int kind;
10991 Py_ssize_t i, j, len;
10992 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010993
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010994 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
10995 return NULL;
10996
10997 kind = PyUnicode_KIND(self);
10998 data = PyUnicode_DATA(self);
10999 len = PyUnicode_GET_LENGTH(self);
11000 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11001 PyUnicode_DATA(sepobj),
11002 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011003
Benjamin Peterson14339b62009-01-31 16:36:08 +000011004 i = 0;
11005 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011006 while (i < len &&
11007 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011008 i++;
11009 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011010 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011011
Benjamin Peterson14339b62009-01-31 16:36:08 +000011012 j = len;
11013 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011014 do {
11015 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011016 } while (j >= i &&
11017 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000011018 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011019 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011020
Victor Stinner12bab6d2011-10-01 01:53:49 +020011021 return PyUnicode_Substring((PyObject*)self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011022}
11023
11024PyObject*
11025PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11026{
11027 unsigned char *data;
11028 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011029 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011030
Victor Stinnerde636f32011-10-01 03:55:54 +020011031 if (PyUnicode_READY(self) == -1)
11032 return NULL;
11033
11034 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
11035
Victor Stinner12bab6d2011-10-01 01:53:49 +020011036 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011037 {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011038 if (PyUnicode_CheckExact(self)) {
11039 Py_INCREF(self);
11040 return self;
11041 }
11042 else
11043 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011044 }
11045
Victor Stinner12bab6d2011-10-01 01:53:49 +020011046 length = end - start;
11047 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011048 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011049
Victor Stinnerde636f32011-10-01 03:55:54 +020011050 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011051 PyErr_SetString(PyExc_IndexError, "string index out of range");
11052 return NULL;
11053 }
11054
Victor Stinnerb9275c12011-10-05 14:01:42 +020011055 if (PyUnicode_IS_ASCII(self)) {
11056 kind = PyUnicode_KIND(self);
11057 data = PyUnicode_1BYTE_DATA(self);
11058 return unicode_fromascii(data + start, length);
11059 }
11060 else {
11061 kind = PyUnicode_KIND(self);
11062 data = PyUnicode_1BYTE_DATA(self);
11063 return PyUnicode_FromKindAndData(kind,
11064 data + PyUnicode_KIND_SIZE(kind, start),
11065 length);
11066 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011067}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011068
11069static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011070do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011071{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011072 int kind;
11073 void *data;
11074 Py_ssize_t len, i, j;
11075
11076 if (PyUnicode_READY(self) == -1)
11077 return NULL;
11078
11079 kind = PyUnicode_KIND(self);
11080 data = PyUnicode_DATA(self);
11081 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011082
Benjamin Peterson14339b62009-01-31 16:36:08 +000011083 i = 0;
11084 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011085 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011086 i++;
11087 }
11088 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011089
Benjamin Peterson14339b62009-01-31 16:36:08 +000011090 j = len;
11091 if (striptype != LEFTSTRIP) {
11092 do {
11093 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011094 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011095 j++;
11096 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011097
Victor Stinner12bab6d2011-10-01 01:53:49 +020011098 return PyUnicode_Substring((PyObject*)self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011099}
11100
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011101
11102static PyObject *
11103do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
11104{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011105 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011106
Benjamin Peterson14339b62009-01-31 16:36:08 +000011107 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11108 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011109
Benjamin Peterson14339b62009-01-31 16:36:08 +000011110 if (sep != NULL && sep != Py_None) {
11111 if (PyUnicode_Check(sep))
11112 return _PyUnicode_XStrip(self, striptype, sep);
11113 else {
11114 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011115 "%s arg must be None or str",
11116 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011117 return NULL;
11118 }
11119 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011120
Benjamin Peterson14339b62009-01-31 16:36:08 +000011121 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011122}
11123
11124
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011125PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011126 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011127\n\
11128Return a copy of the string S with leading and trailing\n\
11129whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011130If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011131
11132static PyObject *
11133unicode_strip(PyUnicodeObject *self, PyObject *args)
11134{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011135 if (PyTuple_GET_SIZE(args) == 0)
11136 return do_strip(self, BOTHSTRIP); /* Common case */
11137 else
11138 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011139}
11140
11141
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011142PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011143 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011144\n\
11145Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011146If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011147
11148static PyObject *
11149unicode_lstrip(PyUnicodeObject *self, PyObject *args)
11150{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011151 if (PyTuple_GET_SIZE(args) == 0)
11152 return do_strip(self, LEFTSTRIP); /* Common case */
11153 else
11154 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011155}
11156
11157
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011158PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011159 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011160\n\
11161Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011162If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011163
11164static PyObject *
11165unicode_rstrip(PyUnicodeObject *self, PyObject *args)
11166{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011167 if (PyTuple_GET_SIZE(args) == 0)
11168 return do_strip(self, RIGHTSTRIP); /* Common case */
11169 else
11170 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011171}
11172
11173
Guido van Rossumd57fd912000-03-10 22:53:23 +000011174static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +000011175unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011176{
11177 PyUnicodeObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011178 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011179
Georg Brandl222de0f2009-04-12 12:01:50 +000011180 if (len < 1) {
11181 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020011182 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000011183 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011184
Tim Peters7a29bd52001-09-12 03:03:31 +000011185 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011186 /* no repeat, return original string */
11187 Py_INCREF(str);
11188 return (PyObject*) str;
11189 }
Tim Peters8f422462000-09-09 06:13:41 +000011190
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011191 if (PyUnicode_READY(str) == -1)
11192 return NULL;
11193
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011194 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011195 PyErr_SetString(PyExc_OverflowError,
11196 "repeated string is too long");
11197 return NULL;
11198 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011199 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011200
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011201 u = (PyUnicodeObject *)PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011202 if (!u)
11203 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011204 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011205
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011206 if (PyUnicode_GET_LENGTH(str) == 1) {
11207 const int kind = PyUnicode_KIND(str);
11208 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
11209 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011210 if (kind == PyUnicode_1BYTE_KIND)
11211 memset(to, (unsigned char)fill_char, len);
11212 else {
11213 for (n = 0; n < len; ++n)
11214 PyUnicode_WRITE(kind, to, n, fill_char);
11215 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011216 }
11217 else {
11218 /* number of characters copied this far */
11219 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
11220 const Py_ssize_t char_size = PyUnicode_CHARACTER_SIZE(str);
11221 char *to = (char *) PyUnicode_DATA(u);
11222 Py_MEMCPY(to, PyUnicode_DATA(str),
11223 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011224 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011225 n = (done <= nchars-done) ? done : nchars-done;
11226 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011227 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011228 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011229 }
11230
11231 return (PyObject*) u;
11232}
11233
Alexander Belopolsky40018472011-02-26 01:02:56 +000011234PyObject *
11235PyUnicode_Replace(PyObject *obj,
11236 PyObject *subobj,
11237 PyObject *replobj,
11238 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011239{
11240 PyObject *self;
11241 PyObject *str1;
11242 PyObject *str2;
11243 PyObject *result;
11244
11245 self = PyUnicode_FromObject(obj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011246 if (self == NULL || PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011247 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011248 str1 = PyUnicode_FromObject(subobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011249 if (str1 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011250 Py_DECREF(self);
11251 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011252 }
11253 str2 = PyUnicode_FromObject(replobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011254 if (str2 == NULL || PyUnicode_READY(str2)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011255 Py_DECREF(self);
11256 Py_DECREF(str1);
11257 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011258 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011259 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011260 Py_DECREF(self);
11261 Py_DECREF(str1);
11262 Py_DECREF(str2);
11263 return result;
11264}
11265
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011266PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011267 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011268\n\
11269Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011270old replaced by new. If the optional argument count is\n\
11271given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011272
11273static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011274unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011275{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011276 PyObject *str1;
11277 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011278 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011279 PyObject *result;
11280
Martin v. Löwis18e16552006-02-15 17:27:45 +000011281 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011282 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011283 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011284 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011285 str1 = PyUnicode_FromObject(str1);
11286 if (str1 == NULL || PyUnicode_READY(str1) == -1)
11287 return NULL;
11288 str2 = PyUnicode_FromObject(str2);
Victor Stinnere9a29352011-10-01 02:14:59 +020011289 if (str2 == NULL || PyUnicode_READY(str2) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011290 Py_DECREF(str1);
11291 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011292 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011293
11294 result = replace(self, str1, str2, maxcount);
11295
11296 Py_DECREF(str1);
11297 Py_DECREF(str2);
11298 return result;
11299}
11300
Alexander Belopolsky40018472011-02-26 01:02:56 +000011301static PyObject *
11302unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011303{
Walter Dörwald79e913e2007-05-12 11:08:06 +000011304 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011305 Py_ssize_t isize;
11306 Py_ssize_t osize, squote, dquote, i, o;
11307 Py_UCS4 max, quote;
11308 int ikind, okind;
11309 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000011310
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011311 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000011312 return NULL;
11313
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011314 isize = PyUnicode_GET_LENGTH(unicode);
11315 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011316
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011317 /* Compute length of output, quote characters, and
11318 maximum character */
11319 osize = 2; /* quotes */
11320 max = 127;
11321 squote = dquote = 0;
11322 ikind = PyUnicode_KIND(unicode);
11323 for (i = 0; i < isize; i++) {
11324 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11325 switch (ch) {
11326 case '\'': squote++; osize++; break;
11327 case '"': dquote++; osize++; break;
11328 case '\\': case '\t': case '\r': case '\n':
11329 osize += 2; break;
11330 default:
11331 /* Fast-path ASCII */
11332 if (ch < ' ' || ch == 0x7f)
11333 osize += 4; /* \xHH */
11334 else if (ch < 0x7f)
11335 osize++;
11336 else if (Py_UNICODE_ISPRINTABLE(ch)) {
11337 osize++;
11338 max = ch > max ? ch : max;
11339 }
11340 else if (ch < 0x100)
11341 osize += 4; /* \xHH */
11342 else if (ch < 0x10000)
11343 osize += 6; /* \uHHHH */
11344 else
11345 osize += 10; /* \uHHHHHHHH */
11346 }
11347 }
11348
11349 quote = '\'';
11350 if (squote) {
11351 if (dquote)
11352 /* Both squote and dquote present. Use squote,
11353 and escape them */
11354 osize += squote;
11355 else
11356 quote = '"';
11357 }
11358
11359 repr = PyUnicode_New(osize, max);
11360 if (repr == NULL)
11361 return NULL;
11362 okind = PyUnicode_KIND(repr);
11363 odata = PyUnicode_DATA(repr);
11364
11365 PyUnicode_WRITE(okind, odata, 0, quote);
11366 PyUnicode_WRITE(okind, odata, osize-1, quote);
11367
11368 for (i = 0, o = 1; i < isize; i++) {
11369 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011370
11371 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011372 if ((ch == quote) || (ch == '\\')) {
11373 PyUnicode_WRITE(okind, odata, o++, '\\');
11374 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011375 continue;
11376 }
11377
Benjamin Peterson29060642009-01-31 22:14:21 +000011378 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011379 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011380 PyUnicode_WRITE(okind, odata, o++, '\\');
11381 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011382 }
11383 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011384 PyUnicode_WRITE(okind, odata, o++, '\\');
11385 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011386 }
11387 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011388 PyUnicode_WRITE(okind, odata, o++, '\\');
11389 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011390 }
11391
11392 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011393 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011394 PyUnicode_WRITE(okind, odata, o++, '\\');
11395 PyUnicode_WRITE(okind, odata, o++, 'x');
11396 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
11397 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011398 }
11399
Georg Brandl559e5d72008-06-11 18:37:52 +000011400 /* Copy ASCII characters as-is */
11401 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011402 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011403 }
11404
Benjamin Peterson29060642009-01-31 22:14:21 +000011405 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000011406 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011407 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000011408 (categories Z* and C* except ASCII space)
11409 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011410 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011411 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011412 if (ch <= 0xff) {
11413 PyUnicode_WRITE(okind, odata, o++, '\\');
11414 PyUnicode_WRITE(okind, odata, o++, 'x');
11415 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
11416 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011417 }
11418 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011419 else if (ch >= 0x10000) {
11420 PyUnicode_WRITE(okind, odata, o++, '\\');
11421 PyUnicode_WRITE(okind, odata, o++, 'U');
11422 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 28) & 0xF]);
11423 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 24) & 0xF]);
11424 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 20) & 0xF]);
11425 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 16) & 0xF]);
11426 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
11427 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
11428 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
11429 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011430 }
11431 /* Map 16-bit characters to '\uxxxx' */
11432 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011433 PyUnicode_WRITE(okind, odata, o++, '\\');
11434 PyUnicode_WRITE(okind, odata, o++, 'u');
11435 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
11436 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
11437 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
11438 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011439 }
11440 }
11441 /* Copy characters as-is */
11442 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011443 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011444 }
11445 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000011446 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011447 /* Closing quote already added at the beginning */
Walter Dörwald79e913e2007-05-12 11:08:06 +000011448 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011449}
11450
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011451PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011452 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011453\n\
11454Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011455such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011456arguments start and end are interpreted as in slice notation.\n\
11457\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011458Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011459
11460static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011461unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011462{
Jesus Ceaac451502011-04-20 17:09:23 +020011463 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011464 Py_ssize_t start;
11465 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011466 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011467
Jesus Ceaac451502011-04-20 17:09:23 +020011468 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
11469 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011470 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011471
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011472 if (PyUnicode_READY(self) == -1)
11473 return NULL;
11474 if (PyUnicode_READY(substring) == -1)
11475 return NULL;
11476
11477 result = any_find_slice(
11478 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
11479 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011480 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011481
11482 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011483
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011484 if (result == -2)
11485 return NULL;
11486
Christian Heimes217cfd12007-12-02 14:31:20 +000011487 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011488}
11489
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011490PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011491 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011492\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011493Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011494
11495static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011496unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011497{
Jesus Ceaac451502011-04-20 17:09:23 +020011498 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011499 Py_ssize_t start;
11500 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011501 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011502
Jesus Ceaac451502011-04-20 17:09:23 +020011503 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
11504 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011505 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011506
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011507 if (PyUnicode_READY(self) == -1)
11508 return NULL;
11509 if (PyUnicode_READY(substring) == -1)
11510 return NULL;
11511
11512 result = any_find_slice(
11513 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
11514 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011515 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011516
11517 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011518
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011519 if (result == -2)
11520 return NULL;
11521
Guido van Rossumd57fd912000-03-10 22:53:23 +000011522 if (result < 0) {
11523 PyErr_SetString(PyExc_ValueError, "substring not found");
11524 return NULL;
11525 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011526
Christian Heimes217cfd12007-12-02 14:31:20 +000011527 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011528}
11529
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011530PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011531 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011532\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011533Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011534done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011535
11536static PyObject *
11537unicode_rjust(PyUnicodeObject *self, PyObject *args)
11538{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011539 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011540 Py_UCS4 fillchar = ' ';
11541
Victor Stinnere9a29352011-10-01 02:14:59 +020011542 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011543 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011544
Victor Stinnere9a29352011-10-01 02:14:59 +020011545 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011546 return NULL;
11547
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011548 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011549 Py_INCREF(self);
11550 return (PyObject*) self;
11551 }
11552
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011553 return (PyObject*) pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011554}
11555
Alexander Belopolsky40018472011-02-26 01:02:56 +000011556PyObject *
11557PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011558{
11559 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000011560
Guido van Rossumd57fd912000-03-10 22:53:23 +000011561 s = PyUnicode_FromObject(s);
11562 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011563 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011564 if (sep != NULL) {
11565 sep = PyUnicode_FromObject(sep);
11566 if (sep == NULL) {
11567 Py_DECREF(s);
11568 return NULL;
11569 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011570 }
11571
11572 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
11573
11574 Py_DECREF(s);
11575 Py_XDECREF(sep);
11576 return result;
11577}
11578
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011579PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011580 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011581\n\
11582Return a list of the words in S, using sep as the\n\
11583delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000011584splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000011585whitespace string is a separator and empty strings are\n\
11586removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011587
11588static PyObject*
11589unicode_split(PyUnicodeObject *self, PyObject *args)
11590{
11591 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011592 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011593
Martin v. Löwis18e16552006-02-15 17:27:45 +000011594 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011595 return NULL;
11596
11597 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011598 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011599 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000011600 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011601 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011602 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011603}
11604
Thomas Wouters477c8d52006-05-27 19:21:47 +000011605PyObject *
11606PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
11607{
11608 PyObject* str_obj;
11609 PyObject* sep_obj;
11610 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011611 int kind1, kind2, kind;
11612 void *buf1 = NULL, *buf2 = NULL;
11613 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011614
11615 str_obj = PyUnicode_FromObject(str_in);
Victor Stinnere9a29352011-10-01 02:14:59 +020011616 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011617 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011618 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011619 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000011620 Py_DECREF(str_obj);
11621 return NULL;
11622 }
11623
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011624 kind1 = PyUnicode_KIND(str_in);
11625 kind2 = PyUnicode_KIND(sep_obj);
11626 kind = kind1 > kind2 ? kind1 : kind2;
11627 buf1 = PyUnicode_DATA(str_in);
11628 if (kind1 != kind)
11629 buf1 = _PyUnicode_AsKind(str_in, kind);
11630 if (!buf1)
11631 goto onError;
11632 buf2 = PyUnicode_DATA(sep_obj);
11633 if (kind2 != kind)
11634 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11635 if (!buf2)
11636 goto onError;
11637 len1 = PyUnicode_GET_LENGTH(str_obj);
11638 len2 = PyUnicode_GET_LENGTH(sep_obj);
11639
11640 switch(PyUnicode_KIND(str_in)) {
11641 case PyUnicode_1BYTE_KIND:
11642 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11643 break;
11644 case PyUnicode_2BYTE_KIND:
11645 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11646 break;
11647 case PyUnicode_4BYTE_KIND:
11648 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11649 break;
11650 default:
11651 assert(0);
11652 out = 0;
11653 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011654
11655 Py_DECREF(sep_obj);
11656 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011657 if (kind1 != kind)
11658 PyMem_Free(buf1);
11659 if (kind2 != kind)
11660 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011661
11662 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011663 onError:
11664 Py_DECREF(sep_obj);
11665 Py_DECREF(str_obj);
11666 if (kind1 != kind && buf1)
11667 PyMem_Free(buf1);
11668 if (kind2 != kind && buf2)
11669 PyMem_Free(buf2);
11670 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011671}
11672
11673
11674PyObject *
11675PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
11676{
11677 PyObject* str_obj;
11678 PyObject* sep_obj;
11679 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011680 int kind1, kind2, kind;
11681 void *buf1 = NULL, *buf2 = NULL;
11682 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011683
11684 str_obj = PyUnicode_FromObject(str_in);
11685 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000011686 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011687 sep_obj = PyUnicode_FromObject(sep_in);
11688 if (!sep_obj) {
11689 Py_DECREF(str_obj);
11690 return NULL;
11691 }
11692
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011693 kind1 = PyUnicode_KIND(str_in);
11694 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020011695 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011696 buf1 = PyUnicode_DATA(str_in);
11697 if (kind1 != kind)
11698 buf1 = _PyUnicode_AsKind(str_in, kind);
11699 if (!buf1)
11700 goto onError;
11701 buf2 = PyUnicode_DATA(sep_obj);
11702 if (kind2 != kind)
11703 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11704 if (!buf2)
11705 goto onError;
11706 len1 = PyUnicode_GET_LENGTH(str_obj);
11707 len2 = PyUnicode_GET_LENGTH(sep_obj);
11708
11709 switch(PyUnicode_KIND(str_in)) {
11710 case PyUnicode_1BYTE_KIND:
11711 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11712 break;
11713 case PyUnicode_2BYTE_KIND:
11714 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11715 break;
11716 case PyUnicode_4BYTE_KIND:
11717 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11718 break;
11719 default:
11720 assert(0);
11721 out = 0;
11722 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011723
11724 Py_DECREF(sep_obj);
11725 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011726 if (kind1 != kind)
11727 PyMem_Free(buf1);
11728 if (kind2 != kind)
11729 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011730
11731 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011732 onError:
11733 Py_DECREF(sep_obj);
11734 Py_DECREF(str_obj);
11735 if (kind1 != kind && buf1)
11736 PyMem_Free(buf1);
11737 if (kind2 != kind && buf2)
11738 PyMem_Free(buf2);
11739 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011740}
11741
11742PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011743 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011744\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011745Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011746the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011747found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011748
11749static PyObject*
11750unicode_partition(PyUnicodeObject *self, PyObject *separator)
11751{
11752 return PyUnicode_Partition((PyObject *)self, separator);
11753}
11754
11755PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000011756 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011757\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011758Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011759the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011760separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011761
11762static PyObject*
11763unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
11764{
11765 return PyUnicode_RPartition((PyObject *)self, separator);
11766}
11767
Alexander Belopolsky40018472011-02-26 01:02:56 +000011768PyObject *
11769PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011770{
11771 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011772
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011773 s = PyUnicode_FromObject(s);
11774 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011775 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011776 if (sep != NULL) {
11777 sep = PyUnicode_FromObject(sep);
11778 if (sep == NULL) {
11779 Py_DECREF(s);
11780 return NULL;
11781 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011782 }
11783
11784 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
11785
11786 Py_DECREF(s);
11787 Py_XDECREF(sep);
11788 return result;
11789}
11790
11791PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011792 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011793\n\
11794Return a list of the words in S, using sep as the\n\
11795delimiter string, starting at the end of the string and\n\
11796working to the front. If maxsplit is given, at most maxsplit\n\
11797splits are done. If sep is not specified, any whitespace string\n\
11798is a separator.");
11799
11800static PyObject*
11801unicode_rsplit(PyUnicodeObject *self, PyObject *args)
11802{
11803 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011804 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011805
Martin v. Löwis18e16552006-02-15 17:27:45 +000011806 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011807 return NULL;
11808
11809 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011810 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011811 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000011812 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011813 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011814 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011815}
11816
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011817PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011818 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011819\n\
11820Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000011821Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011822is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011823
11824static PyObject*
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011825unicode_splitlines(PyUnicodeObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011826{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011827 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000011828 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011829
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011830 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
11831 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011832 return NULL;
11833
Guido van Rossum86662912000-04-11 15:38:46 +000011834 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011835}
11836
11837static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000011838PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011839{
Walter Dörwald346737f2007-05-31 10:44:43 +000011840 if (PyUnicode_CheckExact(self)) {
11841 Py_INCREF(self);
11842 return self;
11843 } else
11844 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinner034f6cf2011-09-30 02:26:44 +020011845 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011846}
11847
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011848PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011849 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011850\n\
11851Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011852and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011853
11854static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011855unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011856{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011857 return fixup(self, fixswapcase);
11858}
11859
Georg Brandlceee0772007-11-27 23:48:05 +000011860PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011861 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011862\n\
11863Return a translation table usable for str.translate().\n\
11864If there is only one argument, it must be a dictionary mapping Unicode\n\
11865ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011866Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011867If there are two arguments, they must be strings of equal length, and\n\
11868in the resulting dictionary, each character in x will be mapped to the\n\
11869character at the same position in y. If there is a third argument, it\n\
11870must be a string, whose characters will be mapped to None in the result.");
11871
11872static PyObject*
11873unicode_maketrans(PyUnicodeObject *null, PyObject *args)
11874{
11875 PyObject *x, *y = NULL, *z = NULL;
11876 PyObject *new = NULL, *key, *value;
11877 Py_ssize_t i = 0;
11878 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011879
Georg Brandlceee0772007-11-27 23:48:05 +000011880 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
11881 return NULL;
11882 new = PyDict_New();
11883 if (!new)
11884 return NULL;
11885 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011886 int x_kind, y_kind, z_kind;
11887 void *x_data, *y_data, *z_data;
11888
Georg Brandlceee0772007-11-27 23:48:05 +000011889 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000011890 if (!PyUnicode_Check(x)) {
11891 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
11892 "be a string if there is a second argument");
11893 goto err;
11894 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011895 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011896 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
11897 "arguments must have equal length");
11898 goto err;
11899 }
11900 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011901 x_kind = PyUnicode_KIND(x);
11902 y_kind = PyUnicode_KIND(y);
11903 x_data = PyUnicode_DATA(x);
11904 y_data = PyUnicode_DATA(y);
11905 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
11906 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
11907 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011908 if (!key || !value)
11909 goto err;
11910 res = PyDict_SetItem(new, key, value);
11911 Py_DECREF(key);
11912 Py_DECREF(value);
11913 if (res < 0)
11914 goto err;
11915 }
11916 /* create entries for deleting chars in z */
11917 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011918 z_kind = PyUnicode_KIND(z);
11919 z_data = PyUnicode_DATA(z);
Georg Brandlceee0772007-11-27 23:48:05 +000011920 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011921 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011922 if (!key)
11923 goto err;
11924 res = PyDict_SetItem(new, key, Py_None);
11925 Py_DECREF(key);
11926 if (res < 0)
11927 goto err;
11928 }
11929 }
11930 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011931 int kind;
11932 void *data;
11933
Georg Brandlceee0772007-11-27 23:48:05 +000011934 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000011935 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011936 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
11937 "to maketrans it must be a dict");
11938 goto err;
11939 }
11940 /* copy entries into the new dict, converting string keys to int keys */
11941 while (PyDict_Next(x, &i, &key, &value)) {
11942 if (PyUnicode_Check(key)) {
11943 /* convert string keys to integer keys */
11944 PyObject *newkey;
11945 if (PyUnicode_GET_SIZE(key) != 1) {
11946 PyErr_SetString(PyExc_ValueError, "string keys in translate "
11947 "table must be of length 1");
11948 goto err;
11949 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011950 kind = PyUnicode_KIND(key);
11951 data = PyUnicode_DATA(key);
11952 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000011953 if (!newkey)
11954 goto err;
11955 res = PyDict_SetItem(new, newkey, value);
11956 Py_DECREF(newkey);
11957 if (res < 0)
11958 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000011959 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011960 /* just keep integer keys */
11961 if (PyDict_SetItem(new, key, value) < 0)
11962 goto err;
11963 } else {
11964 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
11965 "be strings or integers");
11966 goto err;
11967 }
11968 }
11969 }
11970 return new;
11971 err:
11972 Py_DECREF(new);
11973 return NULL;
11974}
11975
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011976PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011977 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011978\n\
11979Return a copy of the string S, where all characters have been mapped\n\
11980through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011981Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000011982Unmapped characters are left untouched. Characters mapped to None\n\
11983are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011984
11985static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011986unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011987{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011988 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011989}
11990
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011991PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011992 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011993\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011994Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011995
11996static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011997unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011998{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011999 return fixup(self, fixupper);
12000}
12001
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012002PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012003 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012004\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012005Pad a numeric string S with zeros on the left, to fill a field\n\
12006of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012007
12008static PyObject *
12009unicode_zfill(PyUnicodeObject *self, PyObject *args)
12010{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012011 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012012 PyUnicodeObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012013 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012014 int kind;
12015 void *data;
12016 Py_UCS4 chr;
12017
12018 if (PyUnicode_READY(self) == -1)
12019 return NULL;
12020
Martin v. Löwis18e16552006-02-15 17:27:45 +000012021 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012022 return NULL;
12023
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012024 if (PyUnicode_GET_LENGTH(self) >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +000012025 if (PyUnicode_CheckExact(self)) {
12026 Py_INCREF(self);
12027 return (PyObject*) self;
12028 }
12029 else
Victor Stinner2219e0a2011-10-01 01:16:59 +020012030 return PyUnicode_Copy((PyObject*)self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012031 }
12032
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012033 fill = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012034
12035 u = pad(self, fill, 0, '0');
12036
Walter Dörwald068325e2002-04-15 13:36:47 +000012037 if (u == NULL)
12038 return NULL;
12039
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012040 kind = PyUnicode_KIND(u);
12041 data = PyUnicode_DATA(u);
12042 chr = PyUnicode_READ(kind, data, fill);
12043
12044 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012045 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012046 PyUnicode_WRITE(kind, data, 0, chr);
12047 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012048 }
12049
12050 return (PyObject*) u;
12051}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012052
#if 0
/* Compiled out: wrapper around
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *ascii = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return ascii;
}
#endif
12060
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012061PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012062 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012063\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012064Return True if S starts with the specified prefix, False otherwise.\n\
12065With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012066With optional end, stop comparing S at that position.\n\
12067prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012068
12069static PyObject *
12070unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012071 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012072{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012073 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012074 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012075 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012076 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012077 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012078
Jesus Ceaac451502011-04-20 17:09:23 +020012079 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012080 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012081 if (PyTuple_Check(subobj)) {
12082 Py_ssize_t i;
12083 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12084 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012085 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012086 if (substring == NULL)
12087 return NULL;
12088 result = tailmatch(self, substring, start, end, -1);
12089 Py_DECREF(substring);
12090 if (result) {
12091 Py_RETURN_TRUE;
12092 }
12093 }
12094 /* nothing matched */
12095 Py_RETURN_FALSE;
12096 }
12097 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012098 if (substring == NULL) {
12099 if (PyErr_ExceptionMatches(PyExc_TypeError))
12100 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12101 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012102 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012103 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012104 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012105 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012106 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012107}
12108
12109
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012110PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012111 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012112\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012113Return True if S ends with the specified suffix, False otherwise.\n\
12114With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012115With optional end, stop comparing S at that position.\n\
12116suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012117
12118static PyObject *
12119unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012120 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012121{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012122 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012123 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012124 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012125 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012126 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012127
Jesus Ceaac451502011-04-20 17:09:23 +020012128 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012129 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012130 if (PyTuple_Check(subobj)) {
12131 Py_ssize_t i;
12132 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12133 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012134 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012135 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012136 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012137 result = tailmatch(self, substring, start, end, +1);
12138 Py_DECREF(substring);
12139 if (result) {
12140 Py_RETURN_TRUE;
12141 }
12142 }
12143 Py_RETURN_FALSE;
12144 }
12145 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012146 if (substring == NULL) {
12147 if (PyErr_ExceptionMatches(PyExc_TypeError))
12148 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12149 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012150 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012151 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012152 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012153 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012154 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012155}
12156
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012157#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000012158
12159PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012160 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012161\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012162Return a formatted version of S, using substitutions from args and kwargs.\n\
12163The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000012164
Eric Smith27bbca62010-11-04 17:06:58 +000012165PyDoc_STRVAR(format_map__doc__,
12166 "S.format_map(mapping) -> str\n\
12167\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012168Return a formatted version of S, using substitutions from mapping.\n\
12169The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000012170
Eric Smith4a7d76d2008-05-30 18:10:19 +000012171static PyObject *
12172unicode__format__(PyObject* self, PyObject* args)
12173{
12174 PyObject *format_spec;
12175
12176 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12177 return NULL;
12178
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012179 return _PyUnicode_FormatAdvanced(self, format_spec, 0,
12180 PyUnicode_GET_LENGTH(format_spec));
Eric Smith4a7d76d2008-05-30 18:10:19 +000012181}
12182
Eric Smith8c663262007-08-25 02:26:07 +000012183PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012184 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012185\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012186Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000012187
12188static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012189unicode__sizeof__(PyUnicodeObject *v)
12190{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012191 Py_ssize_t size;
12192
12193 /* If it's a compact object, account for base structure +
12194 character data. */
12195 if (PyUnicode_IS_COMPACT_ASCII(v))
12196 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12197 else if (PyUnicode_IS_COMPACT(v))
12198 size = sizeof(PyCompactUnicodeObject) +
12199 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_CHARACTER_SIZE(v);
12200 else {
12201 /* If it is a two-block object, account for base object, and
12202 for character block if present. */
12203 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020012204 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012205 size += (PyUnicode_GET_LENGTH(v) + 1) *
12206 PyUnicode_CHARACTER_SIZE(v);
12207 }
12208 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020012209 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020012210 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012211 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020012212 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020012213 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012214
12215 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012216}
12217
12218PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012219 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012220
12221static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020012222unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012223{
Victor Stinner034f6cf2011-09-30 02:26:44 +020012224 PyObject *copy = PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012225 if (!copy)
12226 return NULL;
12227 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012228}
12229
Guido van Rossumd57fd912000-03-10 22:53:23 +000012230static PyMethodDef unicode_methods[] = {
12231
12232 /* Order is according to common usage: often used methods should
12233 appear first, since lookup is done sequentially. */
12234
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000012235 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012236 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
12237 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012238 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012239 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
12240 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
12241 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
12242 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
12243 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
12244 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
12245 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012246 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012247 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
12248 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
12249 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012250 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012251 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
12252 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
12253 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012254 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012255 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012256 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012257 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012258 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
12259 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
12260 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
12261 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
12262 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
12263 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
12264 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
12265 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
12266 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
12267 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
12268 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
12269 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
12270 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
12271 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000012272 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000012273 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012274 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000012275 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000012276 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000012277 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000012278 {"maketrans", (PyCFunction) unicode_maketrans,
12279 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012280 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000012281#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012282 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012283#endif
12284
12285#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012286 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012287 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012288#endif
12289
Benjamin Peterson14339b62009-01-31 16:36:08 +000012290 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012291 {NULL, NULL}
12292};
12293
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012294static PyObject *
12295unicode_mod(PyObject *v, PyObject *w)
12296{
Brian Curtindfc80e32011-08-10 20:28:54 -050012297 if (!PyUnicode_Check(v))
12298 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000012299 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012300}
12301
12302static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012303 0, /*nb_add*/
12304 0, /*nb_subtract*/
12305 0, /*nb_multiply*/
12306 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012307};
12308
Guido van Rossumd57fd912000-03-10 22:53:23 +000012309static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012310 (lenfunc) unicode_length, /* sq_length */
12311 PyUnicode_Concat, /* sq_concat */
12312 (ssizeargfunc) unicode_repeat, /* sq_repeat */
12313 (ssizeargfunc) unicode_getitem, /* sq_item */
12314 0, /* sq_slice */
12315 0, /* sq_ass_item */
12316 0, /* sq_ass_slice */
12317 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012318};
12319
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012320static PyObject*
12321unicode_subscript(PyUnicodeObject* self, PyObject* item)
12322{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012323 if (PyUnicode_READY(self) == -1)
12324 return NULL;
12325
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000012326 if (PyIndex_Check(item)) {
12327 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012328 if (i == -1 && PyErr_Occurred())
12329 return NULL;
12330 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012331 i += PyUnicode_GET_LENGTH(self);
Victor Stinner2fe5ced2011-10-02 00:25:40 +020012332 return unicode_getitem((PyObject*)self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012333 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000012334 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012335 PyObject *result;
12336 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012337 int src_kind, dest_kind;
12338 Py_UCS4 ch, max_char;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012339
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012340 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000012341 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012342 return NULL;
12343 }
12344
12345 if (slicelength <= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012346 return PyUnicode_New(0, 0);
12347 } else if (start == 0 && step == 1 &&
12348 slicelength == PyUnicode_GET_LENGTH(self) &&
Thomas Woutersed03b412007-08-28 21:37:11 +000012349 PyUnicode_CheckExact(self)) {
12350 Py_INCREF(self);
12351 return (PyObject *)self;
12352 } else if (step == 1) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012353 return PyUnicode_Substring((PyObject*)self,
12354 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012355 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012356 /* General case */
12357 max_char = 127;
12358 src_kind = PyUnicode_KIND(self);
12359 src_data = PyUnicode_DATA(self);
12360 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
12361 ch = PyUnicode_READ(src_kind, src_data, cur);
12362 if (ch > max_char)
12363 max_char = ch;
12364 }
12365 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012366 if (result == NULL)
12367 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012368 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012369 dest_data = PyUnicode_DATA(result);
12370
12371 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012372 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
12373 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012374 }
12375 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012376 } else {
12377 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
12378 return NULL;
12379 }
12380}
12381
12382static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012383 (lenfunc)unicode_length, /* mp_length */
12384 (binaryfunc)unicode_subscript, /* mp_subscript */
12385 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012386};
12387
Guido van Rossumd57fd912000-03-10 22:53:23 +000012388
Guido van Rossumd57fd912000-03-10 22:53:23 +000012389/* Helpers for PyUnicode_Format() */
12390
12391static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000012392getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012393{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012394 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012395 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012396 (*p_argidx)++;
12397 if (arglen < 0)
12398 return args;
12399 else
12400 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012401 }
12402 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012403 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012404 return NULL;
12405}
12406
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012407/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012408
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012409static PyObject *
12410formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012411{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012412 char *p;
12413 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012414 double x;
Tim Petersced69f82003-09-16 20:30:58 +000012415
Guido van Rossumd57fd912000-03-10 22:53:23 +000012416 x = PyFloat_AsDouble(v);
12417 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012418 return NULL;
12419
Guido van Rossumd57fd912000-03-10 22:53:23 +000012420 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012421 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000012422
Eric Smith0923d1d2009-04-16 20:16:10 +000012423 p = PyOS_double_to_string(x, type, prec,
12424 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012425 if (p == NULL)
12426 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012427 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000012428 PyMem_Free(p);
12429 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012430}
12431
Tim Peters38fd5b62000-09-21 05:43:11 +000012432static PyObject*
12433formatlong(PyObject *val, int flags, int prec, int type)
12434{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012435 char *buf;
12436 int len;
12437 PyObject *str; /* temporary string object. */
12438 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000012439
Benjamin Peterson14339b62009-01-31 16:36:08 +000012440 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
12441 if (!str)
12442 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012443 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012444 Py_DECREF(str);
12445 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000012446}
12447
Guido van Rossumd57fd912000-03-10 22:53:23 +000012448static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012449formatchar(Py_UCS4 *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012450 size_t buflen,
12451 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012452{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000012453 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012454 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012455 if (PyUnicode_GET_LENGTH(v) == 1) {
12456 buf[0] = PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000012457 buf[1] = '\0';
12458 return 1;
12459 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012460 goto onError;
12461 }
12462 else {
12463 /* Integer input truncated to a character */
12464 long x;
12465 x = PyLong_AsLong(v);
12466 if (x == -1 && PyErr_Occurred())
12467 goto onError;
12468
12469 if (x < 0 || x > 0x10ffff) {
12470 PyErr_SetString(PyExc_OverflowError,
12471 "%c arg not in range(0x110000)");
12472 return -1;
12473 }
12474
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012475 buf[0] = (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012476 buf[1] = '\0';
12477 return 1;
12478 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000012479
Benjamin Peterson29060642009-01-31 22:14:21 +000012480 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012481 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012482 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012483 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012484}
12485
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...).

   FORMATBUFLEN is the length, in elements, of the buffer in which single
   characters are formatted.  The replacement list is fully parenthesized
   so the macro acts as one operand wherever it is used, for example in
   "sizeof FORMATBUFLEN".
*/
#define FORMATBUFLEN ((size_t)10)
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012490
Alexander Belopolsky40018472011-02-26 01:02:56 +000012491PyObject *
12492PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012493{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012494 void *fmt;
12495 int fmtkind;
12496 PyObject *result;
12497 Py_UCS4 *res, *res0;
12498 Py_UCS4 max;
12499 int kind;
12500 Py_ssize_t fmtcnt, fmtpos, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012501 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012502 PyObject *dict = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012503 PyUnicodeObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +000012504
Guido van Rossumd57fd912000-03-10 22:53:23 +000012505 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012506 PyErr_BadInternalCall();
12507 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012508 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012509 uformat = (PyUnicodeObject*)PyUnicode_FromObject(format);
12510 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012511 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012512 fmt = PyUnicode_DATA(uformat);
12513 fmtkind = PyUnicode_KIND(uformat);
12514 fmtcnt = PyUnicode_GET_LENGTH(uformat);
12515 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012516
12517 reslen = rescnt = fmtcnt + 100;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012518 res = res0 = PyMem_Malloc(reslen * sizeof(Py_UCS4));
12519 if (res0 == NULL) {
12520 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012521 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012522 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012523
12524 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012525 arglen = PyTuple_Size(args);
12526 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012527 }
12528 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012529 arglen = -1;
12530 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012531 }
Christian Heimes90aa7642007-12-19 02:45:37 +000012532 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000012533 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000012534 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012535
12536 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012537 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012538 if (--rescnt < 0) {
12539 rescnt = fmtcnt + 100;
12540 reslen += rescnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012541 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12542 if (res0 == NULL){
12543 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012544 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012545 }
12546 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000012547 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012548 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012549 *res++ = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012550 }
12551 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012552 /* Got a format specifier */
12553 int flags = 0;
12554 Py_ssize_t width = -1;
12555 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012556 Py_UCS4 c = '\0';
12557 Py_UCS4 fill;
Benjamin Peterson29060642009-01-31 22:14:21 +000012558 int isnumok;
12559 PyObject *v = NULL;
12560 PyObject *temp = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012561 void *pbuf;
12562 Py_ssize_t pindex;
Benjamin Peterson29060642009-01-31 22:14:21 +000012563 Py_UNICODE sign;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012564 Py_ssize_t len, len1;
12565 Py_UCS4 formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012566
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012567 fmtpos++;
12568 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
12569 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000012570 Py_ssize_t keylen;
12571 PyObject *key;
12572 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000012573
Benjamin Peterson29060642009-01-31 22:14:21 +000012574 if (dict == NULL) {
12575 PyErr_SetString(PyExc_TypeError,
12576 "format requires a mapping");
12577 goto onError;
12578 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012579 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012580 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012581 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012582 /* Skip over balanced parentheses */
12583 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012584 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000012585 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012586 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000012587 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012588 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000012589 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012590 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000012591 if (fmtcnt < 0 || pcount > 0) {
12592 PyErr_SetString(PyExc_ValueError,
12593 "incomplete format key");
12594 goto onError;
12595 }
Victor Stinner12bab6d2011-10-01 01:53:49 +020012596 key = PyUnicode_Substring((PyObject*)uformat,
12597 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000012598 if (key == NULL)
12599 goto onError;
12600 if (args_owned) {
12601 Py_DECREF(args);
12602 args_owned = 0;
12603 }
12604 args = PyObject_GetItem(dict, key);
12605 Py_DECREF(key);
12606 if (args == NULL) {
12607 goto onError;
12608 }
12609 args_owned = 1;
12610 arglen = -1;
12611 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012612 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012613 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012614 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012615 case '-': flags |= F_LJUST; continue;
12616 case '+': flags |= F_SIGN; continue;
12617 case ' ': flags |= F_BLANK; continue;
12618 case '#': flags |= F_ALT; continue;
12619 case '0': flags |= F_ZERO; continue;
12620 }
12621 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012622 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012623 if (c == '*') {
12624 v = getnextarg(args, arglen, &argidx);
12625 if (v == NULL)
12626 goto onError;
12627 if (!PyLong_Check(v)) {
12628 PyErr_SetString(PyExc_TypeError,
12629 "* wants int");
12630 goto onError;
12631 }
12632 width = PyLong_AsLong(v);
12633 if (width == -1 && PyErr_Occurred())
12634 goto onError;
12635 if (width < 0) {
12636 flags |= F_LJUST;
12637 width = -width;
12638 }
12639 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012640 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012641 }
12642 else if (c >= '0' && c <= '9') {
12643 width = c - '0';
12644 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012645 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012646 if (c < '0' || c > '9')
12647 break;
12648 if ((width*10) / 10 != width) {
12649 PyErr_SetString(PyExc_ValueError,
12650 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000012651 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000012652 }
12653 width = width*10 + (c - '0');
12654 }
12655 }
12656 if (c == '.') {
12657 prec = 0;
12658 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012659 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012660 if (c == '*') {
12661 v = getnextarg(args, arglen, &argidx);
12662 if (v == NULL)
12663 goto onError;
12664 if (!PyLong_Check(v)) {
12665 PyErr_SetString(PyExc_TypeError,
12666 "* wants int");
12667 goto onError;
12668 }
12669 prec = PyLong_AsLong(v);
12670 if (prec == -1 && PyErr_Occurred())
12671 goto onError;
12672 if (prec < 0)
12673 prec = 0;
12674 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012675 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012676 }
12677 else if (c >= '0' && c <= '9') {
12678 prec = c - '0';
12679 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012680 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012681 if (c < '0' || c > '9')
12682 break;
12683 if ((prec*10) / 10 != prec) {
12684 PyErr_SetString(PyExc_ValueError,
12685 "prec too big");
12686 goto onError;
12687 }
12688 prec = prec*10 + (c - '0');
12689 }
12690 }
12691 } /* prec */
12692 if (fmtcnt >= 0) {
12693 if (c == 'h' || c == 'l' || c == 'L') {
12694 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012695 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012696 }
12697 }
12698 if (fmtcnt < 0) {
12699 PyErr_SetString(PyExc_ValueError,
12700 "incomplete format");
12701 goto onError;
12702 }
12703 if (c != '%') {
12704 v = getnextarg(args, arglen, &argidx);
12705 if (v == NULL)
12706 goto onError;
12707 }
12708 sign = 0;
12709 fill = ' ';
12710 switch (c) {
12711
12712 case '%':
12713 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012714 kind = PyUnicode_4BYTE_KIND;
Benjamin Peterson29060642009-01-31 22:14:21 +000012715 /* presume that buffer length is at least 1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012716 PyUnicode_WRITE(kind, pbuf, 0, '%');
Benjamin Peterson29060642009-01-31 22:14:21 +000012717 len = 1;
12718 break;
12719
12720 case 's':
12721 case 'r':
12722 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000012723 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012724 temp = v;
12725 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012726 }
12727 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012728 if (c == 's')
12729 temp = PyObject_Str(v);
12730 else if (c == 'r')
12731 temp = PyObject_Repr(v);
12732 else
12733 temp = PyObject_ASCII(v);
12734 if (temp == NULL)
12735 goto onError;
12736 if (PyUnicode_Check(temp))
12737 /* nothing to do */;
12738 else {
12739 Py_DECREF(temp);
12740 PyErr_SetString(PyExc_TypeError,
12741 "%s argument has non-string str()");
12742 goto onError;
12743 }
12744 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012745 if (PyUnicode_READY(temp) == -1) {
12746 Py_CLEAR(temp);
12747 goto onError;
12748 }
12749 pbuf = PyUnicode_DATA(temp);
12750 kind = PyUnicode_KIND(temp);
12751 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012752 if (prec >= 0 && len > prec)
12753 len = prec;
12754 break;
12755
12756 case 'i':
12757 case 'd':
12758 case 'u':
12759 case 'o':
12760 case 'x':
12761 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000012762 isnumok = 0;
12763 if (PyNumber_Check(v)) {
12764 PyObject *iobj=NULL;
12765
12766 if (PyLong_Check(v)) {
12767 iobj = v;
12768 Py_INCREF(iobj);
12769 }
12770 else {
12771 iobj = PyNumber_Long(v);
12772 }
12773 if (iobj!=NULL) {
12774 if (PyLong_Check(iobj)) {
12775 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070012776 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000012777 Py_DECREF(iobj);
12778 if (!temp)
12779 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012780 if (PyUnicode_READY(temp) == -1) {
12781 Py_CLEAR(temp);
12782 goto onError;
12783 }
12784 pbuf = PyUnicode_DATA(temp);
12785 kind = PyUnicode_KIND(temp);
12786 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012787 sign = 1;
12788 }
12789 else {
12790 Py_DECREF(iobj);
12791 }
12792 }
12793 }
12794 if (!isnumok) {
12795 PyErr_Format(PyExc_TypeError,
12796 "%%%c format: a number is required, "
12797 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
12798 goto onError;
12799 }
12800 if (flags & F_ZERO)
12801 fill = '0';
12802 break;
12803
12804 case 'e':
12805 case 'E':
12806 case 'f':
12807 case 'F':
12808 case 'g':
12809 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012810 temp = formatfloat(v, flags, prec, c);
12811 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000012812 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012813 if (PyUnicode_READY(temp) == -1) {
12814 Py_CLEAR(temp);
12815 goto onError;
12816 }
12817 pbuf = PyUnicode_DATA(temp);
12818 kind = PyUnicode_KIND(temp);
12819 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012820 sign = 1;
12821 if (flags & F_ZERO)
12822 fill = '0';
12823 break;
12824
12825 case 'c':
12826 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012827 kind = PyUnicode_4BYTE_KIND;
Victor Stinnerb9dcffb2011-09-29 00:39:24 +020012828 len = formatchar(pbuf, Py_ARRAY_LENGTH(formatbuf), v);
Benjamin Peterson29060642009-01-31 22:14:21 +000012829 if (len < 0)
12830 goto onError;
12831 break;
12832
12833 default:
12834 PyErr_Format(PyExc_ValueError,
12835 "unsupported format character '%c' (0x%x) "
12836 "at index %zd",
12837 (31<=c && c<=126) ? (char)c : '?',
12838 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012839 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000012840 goto onError;
12841 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012842 /* pbuf is initialized here. */
12843 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000012844 if (sign) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012845 if (PyUnicode_READ(kind, pbuf, pindex) == '-' ||
12846 PyUnicode_READ(kind, pbuf, pindex) == '+') {
12847 sign = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012848 len--;
12849 }
12850 else if (flags & F_SIGN)
12851 sign = '+';
12852 else if (flags & F_BLANK)
12853 sign = ' ';
12854 else
12855 sign = 0;
12856 }
12857 if (width < len)
12858 width = len;
12859 if (rescnt - (sign != 0) < width) {
12860 reslen -= rescnt;
12861 rescnt = width + fmtcnt + 100;
12862 reslen += rescnt;
12863 if (reslen < 0) {
12864 Py_XDECREF(temp);
12865 PyErr_NoMemory();
12866 goto onError;
12867 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012868 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12869 if (res0 == 0) {
12870 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012871 Py_XDECREF(temp);
12872 goto onError;
12873 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012874 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000012875 }
12876 if (sign) {
12877 if (fill != ' ')
12878 *res++ = sign;
12879 rescnt--;
12880 if (width > len)
12881 width--;
12882 }
12883 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012884 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12885 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000012886 if (fill != ' ') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012887 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12888 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012889 }
12890 rescnt -= 2;
12891 width -= 2;
12892 if (width < 0)
12893 width = 0;
12894 len -= 2;
12895 }
12896 if (width > len && !(flags & F_LJUST)) {
12897 do {
12898 --rescnt;
12899 *res++ = fill;
12900 } while (--width > len);
12901 }
12902 if (fill == ' ') {
12903 if (sign)
12904 *res++ = sign;
12905 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012906 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12907 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
12908 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12909 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012910 }
12911 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012912 /* Copy all characters, preserving len */
12913 len1 = len;
12914 while (len1--) {
12915 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12916 rescnt--;
12917 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012918 while (--width >= len) {
12919 --rescnt;
12920 *res++ = ' ';
12921 }
12922 if (dict && (argidx < arglen) && c != '%') {
12923 PyErr_SetString(PyExc_TypeError,
12924 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +000012925 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012926 goto onError;
12927 }
12928 Py_XDECREF(temp);
12929 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012930 } /* until end */
12931 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012932 PyErr_SetString(PyExc_TypeError,
12933 "not all arguments converted during string formatting");
12934 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012935 }
12936
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012937
12938 for (max=0, res = res0; res < res0+reslen-rescnt; res++)
12939 if (*res > max)
12940 max = *res;
12941 result = PyUnicode_New(reslen - rescnt, max);
12942 if (!result)
Benjamin Peterson29060642009-01-31 22:14:21 +000012943 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012944 kind = PyUnicode_KIND(result);
12945 for (res = res0; res < res0+reslen-rescnt; res++)
12946 PyUnicode_WRITE(kind, PyUnicode_DATA(result), res-res0, *res);
12947 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012948 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012949 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012950 }
12951 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012952 return (PyObject *)result;
12953
Benjamin Peterson29060642009-01-31 22:14:21 +000012954 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012955 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012956 Py_DECREF(uformat);
12957 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012958 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012959 }
12960 return NULL;
12961}
12962
Jeremy Hylton938ace62002-07-17 16:30:39 +000012963static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000012964unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
12965
Tim Peters6d6c1a32001-08-02 04:15:00 +000012966static PyObject *
12967unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12968{
Benjamin Peterson29060642009-01-31 22:14:21 +000012969 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012970 static char *kwlist[] = {"object", "encoding", "errors", 0};
12971 char *encoding = NULL;
12972 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000012973
Benjamin Peterson14339b62009-01-31 16:36:08 +000012974 if (type != &PyUnicode_Type)
12975 return unicode_subtype_new(type, args, kwds);
12976 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000012977 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012978 return NULL;
12979 if (x == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012980 return (PyObject *)PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012981 if (encoding == NULL && errors == NULL)
12982 return PyObject_Str(x);
12983 else
Benjamin Peterson29060642009-01-31 22:14:21 +000012984 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000012985}
12986
Guido van Rossume023fe02001-08-30 03:12:59 +000012987static PyObject *
12988unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12989{
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012990 PyUnicodeObject *unicode, *self;
12991 Py_ssize_t length, char_size;
12992 int share_wstr, share_utf8;
12993 unsigned int kind;
12994 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000012995
Benjamin Peterson14339b62009-01-31 16:36:08 +000012996 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012997
12998 unicode = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
12999 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013000 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013001 assert(_PyUnicode_CHECK(unicode));
Victor Stinnere06e1452011-10-04 20:52:31 +020013002 if (PyUnicode_READY(unicode))
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013003 return NULL;
13004
13005 self = (PyUnicodeObject *) type->tp_alloc(type, 0);
13006 if (self == NULL) {
13007 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013008 return NULL;
13009 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013010 kind = PyUnicode_KIND(unicode);
13011 length = PyUnicode_GET_LENGTH(unicode);
13012
13013 _PyUnicode_LENGTH(self) = length;
13014 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13015 _PyUnicode_STATE(self).interned = 0;
13016 _PyUnicode_STATE(self).kind = kind;
13017 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020013018 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013019 _PyUnicode_STATE(self).ready = 1;
13020 _PyUnicode_WSTR(self) = NULL;
13021 _PyUnicode_UTF8_LENGTH(self) = 0;
13022 _PyUnicode_UTF8(self) = NULL;
13023 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020013024 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013025
13026 share_utf8 = 0;
13027 share_wstr = 0;
13028 if (kind == PyUnicode_1BYTE_KIND) {
13029 char_size = 1;
13030 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
13031 share_utf8 = 1;
13032 }
13033 else if (kind == PyUnicode_2BYTE_KIND) {
13034 char_size = 2;
13035 if (sizeof(wchar_t) == 2)
13036 share_wstr = 1;
13037 }
13038 else {
13039 assert(kind == PyUnicode_4BYTE_KIND);
13040 char_size = 4;
13041 if (sizeof(wchar_t) == 4)
13042 share_wstr = 1;
13043 }
13044
13045 /* Ensure we won't overflow the length. */
13046 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
13047 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013048 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013049 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013050 data = PyObject_MALLOC((length + 1) * char_size);
13051 if (data == NULL) {
13052 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013053 goto onError;
13054 }
13055
Victor Stinnerc3c74152011-10-02 20:39:55 +020013056 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013057 if (share_utf8) {
13058 _PyUnicode_UTF8_LENGTH(self) = length;
13059 _PyUnicode_UTF8(self) = data;
13060 }
13061 if (share_wstr) {
13062 _PyUnicode_WSTR_LENGTH(self) = length;
13063 _PyUnicode_WSTR(self) = (wchar_t *)data;
13064 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013065
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013066 Py_MEMCPY(data, PyUnicode_DATA(unicode),
13067 PyUnicode_KIND_SIZE(kind, length + 1));
13068 Py_DECREF(unicode);
13069 return (PyObject *)self;
13070
13071onError:
13072 Py_DECREF(unicode);
13073 Py_DECREF(self);
13074 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000013075}
13076
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013077PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000013078 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000013079\n\
Collin Winterd474ce82007-08-07 19:42:11 +000013080Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000013081encoding defaults to the current default string encoding.\n\
13082errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000013083
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013084static PyObject *unicode_iter(PyObject *seq);
13085
Guido van Rossumd57fd912000-03-10 22:53:23 +000013086PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000013087 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013088 "str", /* tp_name */
13089 sizeof(PyUnicodeObject), /* tp_size */
13090 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013091 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013092 (destructor)unicode_dealloc, /* tp_dealloc */
13093 0, /* tp_print */
13094 0, /* tp_getattr */
13095 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000013096 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013097 unicode_repr, /* tp_repr */
13098 &unicode_as_number, /* tp_as_number */
13099 &unicode_as_sequence, /* tp_as_sequence */
13100 &unicode_as_mapping, /* tp_as_mapping */
13101 (hashfunc) unicode_hash, /* tp_hash*/
13102 0, /* tp_call*/
13103 (reprfunc) unicode_str, /* tp_str */
13104 PyObject_GenericGetAttr, /* tp_getattro */
13105 0, /* tp_setattro */
13106 0, /* tp_as_buffer */
13107 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000013108 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013109 unicode_doc, /* tp_doc */
13110 0, /* tp_traverse */
13111 0, /* tp_clear */
13112 PyUnicode_RichCompare, /* tp_richcompare */
13113 0, /* tp_weaklistoffset */
13114 unicode_iter, /* tp_iter */
13115 0, /* tp_iternext */
13116 unicode_methods, /* tp_methods */
13117 0, /* tp_members */
13118 0, /* tp_getset */
13119 &PyBaseObject_Type, /* tp_base */
13120 0, /* tp_dict */
13121 0, /* tp_descr_get */
13122 0, /* tp_descr_set */
13123 0, /* tp_dictoffset */
13124 0, /* tp_init */
13125 0, /* tp_alloc */
13126 unicode_new, /* tp_new */
13127 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013128};
13129
13130/* Initialize the Unicode implementation */
13131
Thomas Wouters78890102000-07-22 19:25:51 +000013132void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013133{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013134 int i;
13135
Thomas Wouters477c8d52006-05-27 19:21:47 +000013136 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013137 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000013138 0x000A, /* LINE FEED */
13139 0x000D, /* CARRIAGE RETURN */
13140 0x001C, /* FILE SEPARATOR */
13141 0x001D, /* GROUP SEPARATOR */
13142 0x001E, /* RECORD SEPARATOR */
13143 0x0085, /* NEXT LINE */
13144 0x2028, /* LINE SEPARATOR */
13145 0x2029, /* PARAGRAPH SEPARATOR */
13146 };
13147
Fred Drakee4315f52000-05-09 19:53:39 +000013148 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020013149 unicode_empty = PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013150 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013151 Py_FatalError("Can't create empty string");
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013152
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013153 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000013154 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000013155 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013156 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000013157
13158 /* initialize the linebreak bloom filter */
13159 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013160 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020013161 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013162
13163 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013164}
13165
13166/* Finalize the Unicode implementation */
13167
/* Currently a no-op that always reports 0.
   NOTE(review): presumably the return value is the number of entries
   released from a free list -- confirm against callers. */
int
PyUnicode_ClearFreeList(void)
{
    return 0;
}
13173
Guido van Rossumd57fd912000-03-10 22:53:23 +000013174void
Thomas Wouters78890102000-07-22 19:25:51 +000013175_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013176{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013177 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013178
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000013179 Py_XDECREF(unicode_empty);
13180 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000013181
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013182 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013183 if (unicode_latin1[i]) {
13184 Py_DECREF(unicode_latin1[i]);
13185 unicode_latin1[i] = NULL;
13186 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013187 }
Christian Heimesa156e092008-02-16 07:38:31 +000013188 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000013189}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000013190
Walter Dörwald16807132007-05-25 13:52:07 +000013191void
13192PyUnicode_InternInPlace(PyObject **p)
13193{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013194 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
13195 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020013196#ifdef Py_DEBUG
13197 assert(s != NULL);
13198 assert(_PyUnicode_CHECK(s));
13199#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000013200 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020013201 return;
13202#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000013203 /* If it's a subclass, we don't really know what putting
13204 it in the interned dict might do. */
13205 if (!PyUnicode_CheckExact(s))
13206 return;
13207 if (PyUnicode_CHECK_INTERNED(s))
13208 return;
Victor Stinner1b4f9ce2011-10-03 13:28:14 +020013209 if (_PyUnicode_READY_REPLACE(p)) {
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013210 assert(0 && "_PyUnicode_READY_REPLACE fail in PyUnicode_InternInPlace");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013211 return;
13212 }
Victor Stinner1b4f9ce2011-10-03 13:28:14 +020013213 s = (PyUnicodeObject *)(*p);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013214 if (interned == NULL) {
13215 interned = PyDict_New();
13216 if (interned == NULL) {
13217 PyErr_Clear(); /* Don't leave an exception */
13218 return;
13219 }
13220 }
13221 /* It might be that the GetItem call fails even
13222 though the key is present in the dictionary,
13223 namely when this happens during a stack overflow. */
13224 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +000013225 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013226 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000013227
Benjamin Peterson29060642009-01-31 22:14:21 +000013228 if (t) {
13229 Py_INCREF(t);
13230 Py_DECREF(*p);
13231 *p = t;
13232 return;
13233 }
Walter Dörwald16807132007-05-25 13:52:07 +000013234
Benjamin Peterson14339b62009-01-31 16:36:08 +000013235 PyThreadState_GET()->recursion_critical = 1;
13236 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
13237 PyErr_Clear();
13238 PyThreadState_GET()->recursion_critical = 0;
13239 return;
13240 }
13241 PyThreadState_GET()->recursion_critical = 0;
13242 /* The two references in interned are not counted by refcnt.
13243 The deallocator will take care of this */
13244 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013245 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000013246}
13247
13248void
13249PyUnicode_InternImmortal(PyObject **p)
13250{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013251 PyUnicodeObject *u = (PyUnicodeObject *)*p;
13252
Benjamin Peterson14339b62009-01-31 16:36:08 +000013253 PyUnicode_InternInPlace(p);
13254 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013255 _PyUnicode_STATE(u).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013256 Py_INCREF(*p);
13257 }
Walter Dörwald16807132007-05-25 13:52:07 +000013258}
13259
13260PyObject *
13261PyUnicode_InternFromString(const char *cp)
13262{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013263 PyObject *s = PyUnicode_FromString(cp);
13264 if (s == NULL)
13265 return NULL;
13266 PyUnicode_InternInPlace(&s);
13267 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000013268}
13269
Alexander Belopolsky40018472011-02-26 01:02:56 +000013270void
13271_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000013272{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013273 PyObject *keys;
13274 PyUnicodeObject *s;
13275 Py_ssize_t i, n;
13276 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000013277
Benjamin Peterson14339b62009-01-31 16:36:08 +000013278 if (interned == NULL || !PyDict_Check(interned))
13279 return;
13280 keys = PyDict_Keys(interned);
13281 if (keys == NULL || !PyList_Check(keys)) {
13282 PyErr_Clear();
13283 return;
13284 }
Walter Dörwald16807132007-05-25 13:52:07 +000013285
Benjamin Peterson14339b62009-01-31 16:36:08 +000013286 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
13287 detector, interned unicode strings are not forcibly deallocated;
13288 rather, we give them their stolen references back, and then clear
13289 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000013290
Benjamin Peterson14339b62009-01-31 16:36:08 +000013291 n = PyList_GET_SIZE(keys);
13292 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000013293 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013294 for (i = 0; i < n; i++) {
13295 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013296 if (PyUnicode_READY(s) == -1) {
13297 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013298 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013299 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013300 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013301 case SSTATE_NOT_INTERNED:
13302 /* XXX Shouldn't happen */
13303 break;
13304 case SSTATE_INTERNED_IMMORTAL:
13305 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013306 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013307 break;
13308 case SSTATE_INTERNED_MORTAL:
13309 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013310 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013311 break;
13312 default:
13313 Py_FatalError("Inconsistent interned string state.");
13314 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013315 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013316 }
13317 fprintf(stderr, "total size of all interned strings: "
13318 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
13319 "mortal/immortal\n", mortal_size, immortal_size);
13320 Py_DECREF(keys);
13321 PyDict_Clear(interned);
13322 Py_DECREF(interned);
13323 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000013324}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013325
13326
13327/********************* Unicode Iterator **************************/
13328
13329typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013330 PyObject_HEAD
13331 Py_ssize_t it_index;
13332 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013333} unicodeiterobject;
13334
13335static void
13336unicodeiter_dealloc(unicodeiterobject *it)
13337{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013338 _PyObject_GC_UNTRACK(it);
13339 Py_XDECREF(it->it_seq);
13340 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013341}
13342
13343static int
13344unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
13345{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013346 Py_VISIT(it->it_seq);
13347 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013348}
13349
13350static PyObject *
13351unicodeiter_next(unicodeiterobject *it)
13352{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013353 PyUnicodeObject *seq;
13354 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013355
Benjamin Peterson14339b62009-01-31 16:36:08 +000013356 assert(it != NULL);
13357 seq = it->it_seq;
13358 if (seq == NULL)
13359 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013360 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013361
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013362 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
13363 int kind = PyUnicode_KIND(seq);
13364 void *data = PyUnicode_DATA(seq);
13365 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
13366 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013367 if (item != NULL)
13368 ++it->it_index;
13369 return item;
13370 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013371
Benjamin Peterson14339b62009-01-31 16:36:08 +000013372 Py_DECREF(seq);
13373 it->it_seq = NULL;
13374 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013375}
13376
13377static PyObject *
13378unicodeiter_len(unicodeiterobject *it)
13379{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013380 Py_ssize_t len = 0;
13381 if (it->it_seq)
13382 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
13383 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013384}
13385
13386PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
13387
13388static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013389 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000013390 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000013391 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013392};
13393
13394PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013395 PyVarObject_HEAD_INIT(&PyType_Type, 0)
13396 "str_iterator", /* tp_name */
13397 sizeof(unicodeiterobject), /* tp_basicsize */
13398 0, /* tp_itemsize */
13399 /* methods */
13400 (destructor)unicodeiter_dealloc, /* tp_dealloc */
13401 0, /* tp_print */
13402 0, /* tp_getattr */
13403 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000013404 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013405 0, /* tp_repr */
13406 0, /* tp_as_number */
13407 0, /* tp_as_sequence */
13408 0, /* tp_as_mapping */
13409 0, /* tp_hash */
13410 0, /* tp_call */
13411 0, /* tp_str */
13412 PyObject_GenericGetAttr, /* tp_getattro */
13413 0, /* tp_setattro */
13414 0, /* tp_as_buffer */
13415 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
13416 0, /* tp_doc */
13417 (traverseproc)unicodeiter_traverse, /* tp_traverse */
13418 0, /* tp_clear */
13419 0, /* tp_richcompare */
13420 0, /* tp_weaklistoffset */
13421 PyObject_SelfIter, /* tp_iter */
13422 (iternextfunc)unicodeiter_next, /* tp_iternext */
13423 unicodeiter_methods, /* tp_methods */
13424 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013425};
13426
13427static PyObject *
13428unicode_iter(PyObject *seq)
13429{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013430 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013431
Benjamin Peterson14339b62009-01-31 16:36:08 +000013432 if (!PyUnicode_Check(seq)) {
13433 PyErr_BadInternalCall();
13434 return NULL;
13435 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013436 if (PyUnicode_READY(seq) == -1)
13437 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013438 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
13439 if (it == NULL)
13440 return NULL;
13441 it->it_index = 0;
13442 Py_INCREF(seq);
13443 it->it_seq = (PyUnicodeObject *)seq;
13444 _PyObject_GC_TRACK(it);
13445 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013446}
13447
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013448#define UNIOP(x) Py_UNICODE_##x
13449#define UNIOP_t Py_UNICODE
13450#include "uniops.h"
13451#undef UNIOP
13452#undef UNIOP_t
13453#define UNIOP(x) Py_UCS4_##x
13454#define UNIOP_t Py_UCS4
13455#include "uniops.h"
13456#undef UNIOP
13457#undef UNIOP_t
Victor Stinner331ea922010-08-10 16:37:20 +000013458
Victor Stinner71133ff2010-09-01 23:43:53 +000013459Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000013460PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000013461{
13462 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
13463 Py_UNICODE *copy;
13464 Py_ssize_t size;
13465
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013466 if (!PyUnicode_Check(unicode)) {
13467 PyErr_BadArgument();
13468 return NULL;
13469 }
Victor Stinner71133ff2010-09-01 23:43:53 +000013470 /* Ensure we won't overflow the size. */
13471 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
13472 PyErr_NoMemory();
13473 return NULL;
13474 }
13475 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
13476 size *= sizeof(Py_UNICODE);
13477 copy = PyMem_Malloc(size);
13478 if (copy == NULL) {
13479 PyErr_NoMemory();
13480 return NULL;
13481 }
13482 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
13483 return copy;
13484}
Martin v. Löwis5b222132007-06-10 09:51:05 +000013485
Georg Brandl66c221e2010-10-14 07:04:07 +000013486/* A _string module, to export formatter_parser and formatter_field_name_split
13487 to the string.Formatter class implemented in Python. */
13488
13489static PyMethodDef _string_methods[] = {
13490 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
13491 METH_O, PyDoc_STR("split the argument as a field name")},
13492 {"formatter_parser", (PyCFunction) formatter_parser,
13493 METH_O, PyDoc_STR("parse the argument as a format string")},
13494 {NULL, NULL}
13495};
13496
13497static struct PyModuleDef _string_module = {
13498 PyModuleDef_HEAD_INIT,
13499 "_string",
13500 PyDoc_STR("string helper module"),
13501 0,
13502 _string_methods,
13503 NULL,
13504 NULL,
13505 NULL,
13506 NULL
13507};
13508
13509PyMODINIT_FUNC
13510PyInit__string(void)
13511{
13512 return PyModule_Create(&_string_module);
13513}
13514
13515
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013516#ifdef __cplusplus
13517}
13518#endif