blob: b628eeb93a83e91d9960b5c2151dd5ef51ea03c3 [file] [log] [blame]
/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com>.

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000044
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000045#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000046#include <windows.h>
47#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000048
/* Limit for the Unicode object free list */

#define PyUnicode_MAXFREELIST       1024

/* Limit for the Unicode object free list stay alive optimization.

   The implementation will keep allocated Unicode memory intact for
   all objects on the free list having a size less than this
   limit. This reduces malloc() overhead for small Unicode objects.

   At worst this will result in PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT       9
Guido van Rossumd57fd912000-03-10 22:53:23 +000071
/* Endianness switches; defaults to little endian.

   Exactly one of BYTEORDER_IS_BIG_ENDIAN / BYTEORDER_IS_LITTLE_ENDIAN is
   defined, selected by whether WORDS_BIGENDIAN is defined. */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
79
/* --- Globals ------------------------------------------------------------

   The globals are initialized by the _PyUnicode_Init() API and should
   not be used before calling that API.

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000086
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000087
88#ifdef __cplusplus
89extern "C" {
90#endif
91
/* Object check used by the accessor macros below: debug builds run the
   full consistency checker, other builds only do the type check. */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif

/* Raw access to the utf8 field of a PyCompactUnicodeObject (no checks). */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)

/* Checked UTF-8 pointer: for compact ASCII objects this is the memory that
   immediately follows the PyASCIIObject header, otherwise the utf8 field. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))

/* Raw access to the utf8_length field (no checks). */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)

/* Checked UTF-8 length: for compact ASCII objects this is the `length`
   field of the PyASCIIObject, otherwise the utf8_length field. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))

/* Unchecked field accessors; each expands to the struct member itself and
   is also used as an assignment target in this file. */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)

/* Checked readers for state.kind and length. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)

/* Unchecked access to the data.any pointer of a PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200132
/* Local replacement for PyUnicode_READY(): asserts with _PyUnicode_CHECK()
   and evaluates to 0 when the object is already ready, otherwise to the
   result of _PyUnicode_Ready(). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready((PyObject *)(op))))

/* Same idea for a pointer-to-object argument: evaluates to 0 when *p_obj
   is already ready, otherwise to the result of _PyUnicode_ReadyReplace().
   The argument is parenthesized before being dereferenced so that
   expressions such as `array + 1` expand correctly. */
#define _PyUnicode_READY_REPLACE(p_obj)                 \
    (assert(_PyUnicode_CHECK(*(p_obj))),                \
     (PyUnicode_IS_READY(*(p_obj)) ?                    \
      0 : _PyUnicode_ReadyReplace((PyObject **)(p_obj))))
144
/* true if the utf8 pointer of the (non compact-ASCII) object is the same
   pointer as its character data */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* true if the wstr pointer of the object is the same pointer as its
   character data.  Uses the macro argument throughout, so it works no
   matter what the caller's variable is called. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
152
/* true if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data): the object is not compact ASCII, its utf8
   pointer is set, and that pointer differs from the character data */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (!PyUnicode_IS_COMPACT_ASCII(op) &&                \
      _PyUnicode_UTF8(op) &&                            \
      _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* true if the Unicode object has an allocated wstr memory block
   (not shared with other data): the wstr pointer is set and either the
   object is not ready or wstr differs from the character data */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
168
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   Each of begin, end and to is evaluated exactly once; every source
   element in [begin, end) is cast to to_type and stored in order. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        const from_type *iter_ = (begin);               \
        const from_type *end_ = (end);                  \
        to_type *to_ = (to_type *)(to);                 \
        for (; iter_ < end_; ++iter_, ++to_) {          \
            *to_ = (to_type)*iter_;                     \
        }                                               \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200183
/* The Unicode string has been modified: reset the cached hash to -1 */
#define _PyUnicode_DIRTY(op)                            \
    do {                                                \
        _PyUnicode_HASH(op) = -1;                       \
    } while (0)
186
Walter Dörwald16807132007-05-25 13:52:07 +0000187/* This dictionary holds all interned unicode strings. Note that references
188 to strings in this dictionary are *not* counted in the string's ob_refcnt.
189 When the interned string reaches a refcnt of 0 the string deallocation
190 function will delete the reference from this dictionary.
191
192 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000193 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000194*/
195static PyObject *interned;
196
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000197/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200198static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000199
200/* Single character Unicode strings in the Latin-1 range are being
201 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200202static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000203
/* Fast detection of the most frequent whitespace characters.

   Lookup table indexed by code point 0..127; an entry is 1 when the
   character is treated as whitespace and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x00 - 0x07 */
/*     case 0x0009: * CHARACTER TABULATION */
/*     case 0x000A: * LINE FEED */
/*     case 0x000B: * LINE TABULATION */
/*     case 0x000C: * FORM FEED */
/*     case 0x000D: * CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,     /* 0x08 - 0x0F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x10 - 0x17 */
/*     case 0x001C: * FILE SEPARATOR */
/*     case 0x001D: * GROUP SEPARATOR */
/*     case 0x001E: * RECORD SEPARATOR */
/*     case 0x001F: * UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,     /* 0x18 - 0x1F */
/*     case 0x0020: * SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,     /* 0x20 - 0x27 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x28 - 0x2F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x30 - 0x37 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x38 - 0x3F */

    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x40 - 0x47 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x48 - 0x4F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x50 - 0x57 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x58 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x60 - 0x67 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x68 - 0x6F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x70 - 0x77 */
    0, 0, 0, 0, 0, 0, 0, 0      /* 0x78 - 0x7F */
};
234
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200235/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200236static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200237static PyObject* get_latin1_char(unsigned char ch);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200238
Alexander Belopolsky40018472011-02-26 01:02:56 +0000239static PyObject *
240unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000241 PyObject **errorHandler,const char *encoding, const char *reason,
242 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
243 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
244
Alexander Belopolsky40018472011-02-26 01:02:56 +0000245static void
246raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300247 const char *encoding,
248 const Py_UNICODE *unicode, Py_ssize_t size,
249 Py_ssize_t startpos, Py_ssize_t endpos,
250 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000251
/* Same for linebreaks.

   Lookup table indexed by code point 0..127; an entry is 1 when the
   character is treated as a line break and 0 otherwise.  The table is
   only ever read (see BLOOM_LINEBREAK), hence const. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x00 - 0x07 */
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,     /* 0x08 - 0x0F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x10 - 0x17 */
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,     /* 0x18 - 0x1F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x20 - 0x27 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x28 - 0x2F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x30 - 0x37 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x38 - 0x3F */

    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x40 - 0x47 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x48 - 0x4F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x50 - 0x57 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x58 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x60 - 0x67 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x68 - 0x6F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x70 - 0x77 */
    0, 0, 0, 0, 0, 0, 0, 0      /* 0x78 - 0x7F */
};
279
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300280/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
281 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000282Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000283PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000284{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000285#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000286 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000287#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000288 /* This is actually an illegal character, so it should
289 not be passed to unichr. */
290 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000291#endif
292}
293
#ifdef Py_DEBUG
/* Debug-build invariant checker.  Asserts that the state bits (kind,
   compact, ascii, ready) of the object agree with its data, utf8 and wstr
   pointers and their recorded lengths.  Always returns 1, so it can be
   used inside assert(). */
static int
_PyUnicode_CheckConsistency(void *op)
{
    PyASCIIObject *ascii;
    PyCompactUnicodeObject *compact;
    unsigned int kind;
    void *data;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    /* Compact ASCII: only the kind and the ready flag are checked. */
    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
        return 1;
    }

    compact = (PyCompactUnicodeObject *)op;

    if (ascii->state.compact == 1) {
        /* Compact, non-ASCII: the data starts right after the header. */
        data = compact + 1;
        assert(kind == PyUnicode_1BYTE_KIND
               || kind == PyUnicode_2BYTE_KIND
               || kind == PyUnicode_4BYTE_KIND);
        assert(ascii->state.ascii == 0);
        assert(ascii->state.ready == 1);
        assert (compact->utf8 != data);
    }
    else {
        PyUnicodeObject *legacy = (PyUnicodeObject *)op;

        data = legacy->data.any;
        if (kind == PyUnicode_WCHAR_KIND) {
            /* Not ready yet: only wstr is set. */
            assert(ascii->state.compact == 0);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 0);
            assert(ascii->wstr != NULL);
            assert(data == NULL);
            assert(compact->utf8 == NULL);
            assert(ascii->state.interned == SSTATE_NOT_INTERNED);
        }
        else {
            /* Ready, non-compact string. */
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.compact == 0);
            assert(ascii->state.ready == 1);
            assert(data != NULL);
            if (ascii->state.ascii) {
                assert (compact->utf8 == data);
                assert (compact->utf8_length == ascii->length);
            }
            else {
                assert (compact->utf8 != data);
            }
        }
    }

    if (kind != PyUnicode_WCHAR_KIND) {
        /* wstr must alias the data exactly when the kind has the same
           width as wchar_t. */
        if (
#if SIZEOF_WCHAR_T == 2
            kind == PyUnicode_2BYTE_KIND
#else
            kind == PyUnicode_4BYTE_KIND
#endif
           )
        {
            assert(ascii->wstr == data);
            assert(compact->wstr_length == ascii->length);
        }
        else {
            assert(ascii->wstr != data);
        }
    }

    /* A missing buffer must have a recorded length of zero. */
    if (compact->utf8 == NULL) {
        assert(compact->utf8_length == 0);
    }
    if (ascii->wstr == NULL) {
        assert(compact->wstr_length == 0);
    }
    return 1;
}
#else
/* Non-debug builds: no checks are performed, always returns 1. */
static int
_PyUnicode_CheckConsistency(void *op)
{
    return 1;
}
#endif
379
Thomas Wouters477c8d52006-05-27 19:21:47 +0000380/* --- Bloom Filters ----------------------------------------------------- */
381
382/* stuff to implement simple "bloom filters" for Unicode characters.
383 to keep things simple, we use a single bitmask, using the least 5
384 bits from each unicode characters as the bit index. */
385
386/* the linebreak mask is set up by Unicode_Init below */
387
Antoine Pitrouf068f942010-01-13 14:19:12 +0000388#if LONG_BIT >= 128
389#define BLOOM_WIDTH 128
390#elif LONG_BIT >= 64
391#define BLOOM_WIDTH 64
392#elif LONG_BIT >= 32
393#define BLOOM_WIDTH 32
394#else
395#error "LONG_BIT is smaller than 32"
396#endif
397
Thomas Wouters477c8d52006-05-27 19:21:47 +0000398#define BLOOM_MASK unsigned long
399
400static BLOOM_MASK bloom_linebreak;
401
Antoine Pitrouf068f942010-01-13 14:19:12 +0000402#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
403#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000404
Benjamin Peterson29060642009-01-31 22:14:21 +0000405#define BLOOM_LINEBREAK(ch) \
406 ((ch) < 128U ? ascii_linebreak[(ch)] : \
407 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000408
Alexander Belopolsky40018472011-02-26 01:02:56 +0000409Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200410make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000411{
412 /* calculate simple bloom-style bitmask for a given unicode string */
413
Antoine Pitrouf068f942010-01-13 14:19:12 +0000414 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000415 Py_ssize_t i;
416
417 mask = 0;
418 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200419 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000420
421 return mask;
422}
423
/* Membership test for chr in str: check the bloom mask first and only on
   a mask hit search the whole string with PyUnicode_FindChar(). */
#define BLOOM_MEMBER(mask, chr, str)                                    \
    (BLOOM(mask, chr) &&                                                \
     (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000427
Guido van Rossumd57fd912000-03-10 22:53:23 +0000428/* --- Unicode Object ----------------------------------------------------- */
429
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200430static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200431fixup(PyUnicodeObject *self, Py_UCS4 (*fixfct)(PyUnicodeObject *s));
432
433Py_LOCAL_INLINE(char *) findchar(void *s, int kind,
434 Py_ssize_t size, Py_UCS4 ch,
435 int direction)
436{
437 /* like wcschr, but doesn't stop at NULL characters */
438 Py_ssize_t i;
439 if (direction == 1) {
440 for(i = 0; i < size; i++)
441 if (PyUnicode_READ(kind, s, i) == ch)
442 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
443 }
444 else {
445 for(i = size-1; i >= 0; i--)
446 if (PyUnicode_READ(kind, s, i) == ch)
447 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
448 }
449 return NULL;
450}
451
Victor Stinnerfe226c02011-10-03 03:52:20 +0200452static PyObject*
453resize_compact(PyObject *unicode, Py_ssize_t length)
454{
455 Py_ssize_t char_size;
456 Py_ssize_t struct_size;
457 Py_ssize_t new_size;
458 int share_wstr;
459
460 assert(PyUnicode_IS_READY(unicode));
461 char_size = PyUnicode_CHARACTER_SIZE(unicode);
462 if (PyUnicode_IS_COMPACT_ASCII(unicode))
463 struct_size = sizeof(PyASCIIObject);
464 else
465 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200466 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200467
468 _Py_DEC_REFTOTAL;
469 _Py_ForgetReference(unicode);
470
471 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
472 PyErr_NoMemory();
473 return NULL;
474 }
475 new_size = (struct_size + (length + 1) * char_size);
476
477 unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
478 if (unicode == NULL) {
479 PyObject_Del(unicode);
480 PyErr_NoMemory();
481 return NULL;
482 }
483 _Py_NewReference(unicode);
484 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200485 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200486 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200487 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
488 _PyUnicode_WSTR_LENGTH(unicode) = length;
489 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200490 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
491 length, 0);
492 return unicode;
493}
494
Alexander Belopolsky40018472011-02-26 01:02:56 +0000495static int
Victor Stinner95663112011-10-04 01:03:50 +0200496resize_inplace(PyUnicodeObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000497{
Victor Stinner95663112011-10-04 01:03:50 +0200498 wchar_t *wstr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200499 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200500 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000501
Victor Stinner95663112011-10-04 01:03:50 +0200502 _PyUnicode_DIRTY(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200503
504 if (PyUnicode_IS_READY(unicode)) {
505 Py_ssize_t char_size;
506 Py_ssize_t new_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200507 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200508 void *data;
509
510 data = _PyUnicode_DATA_ANY(unicode);
511 assert(data != NULL);
512 char_size = PyUnicode_CHARACTER_SIZE(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200513 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
514 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinner95663112011-10-04 01:03:50 +0200515 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
516 {
517 PyObject_DEL(_PyUnicode_UTF8(unicode));
518 _PyUnicode_UTF8(unicode) = NULL;
519 _PyUnicode_UTF8_LENGTH(unicode) = 0;
520 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200521
522 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
523 PyErr_NoMemory();
524 return -1;
525 }
526 new_size = (length + 1) * char_size;
527
528 data = (PyObject *)PyObject_REALLOC(data, new_size);
529 if (data == NULL) {
530 PyErr_NoMemory();
531 return -1;
532 }
533 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200534 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200535 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200536 _PyUnicode_WSTR_LENGTH(unicode) = length;
537 }
538 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200539 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200540 _PyUnicode_UTF8_LENGTH(unicode) = length;
541 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200542 _PyUnicode_LENGTH(unicode) = length;
543 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinner95663112011-10-04 01:03:50 +0200544 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400545 _PyUnicode_CheckConsistency(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200546 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200547 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200548 }
Victor Stinner95663112011-10-04 01:03:50 +0200549 assert(_PyUnicode_WSTR(unicode) != NULL);
550
551 /* check for integer overflow */
552 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
553 PyErr_NoMemory();
554 return -1;
555 }
556 wstr = _PyUnicode_WSTR(unicode);
557 wstr = PyObject_REALLOC(wstr, sizeof(wchar_t) * (length + 1));
558 if (!wstr) {
559 PyErr_NoMemory();
560 return -1;
561 }
562 _PyUnicode_WSTR(unicode) = wstr;
563 _PyUnicode_WSTR(unicode)[length] = 0;
564 _PyUnicode_WSTR_LENGTH(unicode) = length;
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400565 _PyUnicode_CheckConsistency(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000566 return 0;
567}
568
Victor Stinnerfe226c02011-10-03 03:52:20 +0200569static PyObject*
570resize_copy(PyObject *unicode, Py_ssize_t length)
571{
572 Py_ssize_t copy_length;
573 if (PyUnicode_IS_COMPACT(unicode)) {
574 PyObject *copy;
575 assert(PyUnicode_IS_READY(unicode));
576
577 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
578 if (copy == NULL)
579 return NULL;
580
581 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
582 if (PyUnicode_CopyCharacters(copy, 0,
583 unicode, 0,
584 copy_length) < 0)
585 {
586 Py_DECREF(copy);
587 return NULL;
588 }
589 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200590 }
591 else {
Victor Stinner2fd82272011-10-03 04:06:05 +0200592 PyUnicodeObject *w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200593 assert(_PyUnicode_WSTR(unicode) != NULL);
594 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinner2fd82272011-10-03 04:06:05 +0200595 w = _PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200596 if (w == NULL)
597 return NULL;
598 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
599 copy_length = Py_MIN(copy_length, length);
600 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
601 copy_length);
602 return (PyObject*)w;
603 }
604}
605
Guido van Rossumd57fd912000-03-10 22:53:23 +0000606/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000607 Ux0000 terminated; some code (e.g. new_identifier)
608 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000609
610 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000611 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000612
613*/
614
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200615#ifdef Py_DEBUG
616int unicode_old_new_calls = 0;
617#endif
618
Alexander Belopolsky40018472011-02-26 01:02:56 +0000619static PyUnicodeObject *
620_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000621{
622 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200623 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000624
Thomas Wouters477c8d52006-05-27 19:21:47 +0000625 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000626 if (length == 0 && unicode_empty != NULL) {
627 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200628 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000629 }
630
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000631 /* Ensure we won't overflow the size. */
632 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
633 return (PyUnicodeObject *)PyErr_NoMemory();
634 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200635 if (length < 0) {
636 PyErr_SetString(PyExc_SystemError,
637 "Negative size passed to _PyUnicode_New");
638 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000639 }
640
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200641#ifdef Py_DEBUG
642 ++unicode_old_new_calls;
643#endif
644
645 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
646 if (unicode == NULL)
647 return NULL;
648 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
649 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
650 if (!_PyUnicode_WSTR(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000651 PyErr_NoMemory();
652 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000653 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200654
Jeremy Hyltond8082792003-09-16 19:41:39 +0000655 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000656 * the caller fails before initializing str -- unicode_resize()
657 * reads str[0], and the Keep-Alive optimization can keep memory
658 * allocated for str alive across a call to unicode_dealloc(unicode).
659 * We don't want unicode_resize to read uninitialized memory in
660 * that case.
661 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200662 _PyUnicode_WSTR(unicode)[0] = 0;
663 _PyUnicode_WSTR(unicode)[length] = 0;
664 _PyUnicode_WSTR_LENGTH(unicode) = length;
665 _PyUnicode_HASH(unicode) = -1;
666 _PyUnicode_STATE(unicode).interned = 0;
667 _PyUnicode_STATE(unicode).kind = 0;
668 _PyUnicode_STATE(unicode).compact = 0;
669 _PyUnicode_STATE(unicode).ready = 0;
670 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200671 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200672 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200673 _PyUnicode_UTF8(unicode) = NULL;
674 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000675 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000676
Benjamin Peterson29060642009-01-31 22:14:21 +0000677 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000678 /* XXX UNREF/NEWREF interface should be more symmetrical */
679 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000680 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000681 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000682 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000683}
684
Victor Stinnerf42dc442011-10-02 23:33:16 +0200685static const char*
686unicode_kind_name(PyObject *unicode)
687{
Victor Stinner42dfd712011-10-03 14:41:45 +0200688 /* don't check consistency: unicode_kind_name() is called from
689 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200690 if (!PyUnicode_IS_COMPACT(unicode))
691 {
692 if (!PyUnicode_IS_READY(unicode))
693 return "wstr";
694 switch(PyUnicode_KIND(unicode))
695 {
696 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200697 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200698 return "legacy ascii";
699 else
700 return "legacy latin1";
701 case PyUnicode_2BYTE_KIND:
702 return "legacy UCS2";
703 case PyUnicode_4BYTE_KIND:
704 return "legacy UCS4";
705 default:
706 return "<legacy invalid kind>";
707 }
708 }
709 assert(PyUnicode_IS_READY(unicode));
710 switch(PyUnicode_KIND(unicode))
711 {
712 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200713 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200714 return "ascii";
715 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200716 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200717 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200718 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200719 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200720 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200721 default:
722 return "<invalid compact kind>";
723 }
724}
725
#ifdef Py_DEBUG
/* Debug-build counter, incremented by PyUnicode_New() on every call that
   gets past its empty-string shortcut. */
int unicode_new_new_calls = 0;

/* Functions wrapping macros for use in debugger */

/* Evaluate the PyUnicode_UTF8() macro on `unicode`. */
char *_PyUnicode_utf8(void *unicode){
    return PyUnicode_UTF8(unicode);
}

/* Evaluate the _PyUnicode_COMPACT_DATA() macro on `unicode`. */
void *_PyUnicode_compact_data(void *unicode) {
    return _PyUnicode_COMPACT_DATA(unicode);
}

/* Print the addresses relevant to locating the character data of `unicode`
   on stdout, then return the result of PyUnicode_DATA(). */
void *_PyUnicode_data(void *unicode){
    printf("obj %p\n", unicode);
    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
    printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
    printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
    return PyUnicode_DATA(unicode);
}

/* Print a one-line description of the string object `op` on stdout: its
   kind name, length, wstr pointer, (for everything except compact ASCII)
   the wstr length and the utf8 pointer/length, and the data pointer.
   "shared " is printed in front of a pointer that aliases the data buffer. */
void
_PyUnicode_Dump(PyObject *op)
{
    PyASCIIObject *ascii = (PyASCIIObject *)op;
    PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
    PyUnicodeObject *unicode = (PyUnicodeObject *)op;
    void *data;

    /* Lengths are signed Py_ssize_t values, hence %zd rather than %zu. */
    printf("%s: len=%zd, ", unicode_kind_name(op), ascii->length);

    if (ascii->state.compact) {
        /* Compact ASCII objects use the smaller PyASCIIObject header, so
           their characters start right after it (as PyUnicode_New() lays
           them out); other compact objects start after the
           PyCompactUnicodeObject header. */
        if (ascii->state.ascii)
            data = (ascii + 1);
        else
            data = (compact + 1);
    }
    else
        data = unicode->data.any;

    if ((void *)ascii->wstr == data)
        printf("shared ");
    /* %p requires a void* argument. */
    printf("wstr=%p", (void *)ascii->wstr);

    /* wstr_length/utf8/utf8_length do not exist in the compact ASCII
       layout, so only read them for the other layouts. */
    if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
        printf(" (%zd), ", compact->wstr_length);
        if (!ascii->state.compact
            && (void *)compact->utf8 == unicode->data.any)
            printf("shared ");
        printf("utf8=%p (%zd)", (void *)compact->utf8, compact->utf8_length);
    }
    printf(", data=%p\n", data);
}
#endif
771
772PyObject *
773PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
774{
775 PyObject *obj;
776 PyCompactUnicodeObject *unicode;
777 void *data;
778 int kind_state;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200779 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200780 Py_ssize_t char_size;
781 Py_ssize_t struct_size;
782
783 /* Optimization for empty strings */
784 if (size == 0 && unicode_empty != NULL) {
785 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200786 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200787 }
788
789#ifdef Py_DEBUG
790 ++unicode_new_new_calls;
791#endif
792
Victor Stinner9e9d6892011-10-04 01:02:02 +0200793 is_ascii = 0;
794 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200795 struct_size = sizeof(PyCompactUnicodeObject);
796 if (maxchar < 128) {
797 kind_state = PyUnicode_1BYTE_KIND;
798 char_size = 1;
799 is_ascii = 1;
800 struct_size = sizeof(PyASCIIObject);
801 }
802 else if (maxchar < 256) {
803 kind_state = PyUnicode_1BYTE_KIND;
804 char_size = 1;
805 }
806 else if (maxchar < 65536) {
807 kind_state = PyUnicode_2BYTE_KIND;
808 char_size = 2;
809 if (sizeof(wchar_t) == 2)
810 is_sharing = 1;
811 }
812 else {
813 kind_state = PyUnicode_4BYTE_KIND;
814 char_size = 4;
815 if (sizeof(wchar_t) == 4)
816 is_sharing = 1;
817 }
818
819 /* Ensure we won't overflow the size. */
820 if (size < 0) {
821 PyErr_SetString(PyExc_SystemError,
822 "Negative size passed to PyUnicode_New");
823 return NULL;
824 }
825 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
826 return PyErr_NoMemory();
827
828 /* Duplicated allocation code from _PyObject_New() instead of a call to
829 * PyObject_New() so we are able to allocate space for the object and
830 * it's data buffer.
831 */
832 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
833 if (obj == NULL)
834 return PyErr_NoMemory();
835 obj = PyObject_INIT(obj, &PyUnicode_Type);
836 if (obj == NULL)
837 return NULL;
838
839 unicode = (PyCompactUnicodeObject *)obj;
840 if (is_ascii)
841 data = ((PyASCIIObject*)obj) + 1;
842 else
843 data = unicode + 1;
844 _PyUnicode_LENGTH(unicode) = size;
845 _PyUnicode_HASH(unicode) = -1;
846 _PyUnicode_STATE(unicode).interned = 0;
847 _PyUnicode_STATE(unicode).kind = kind_state;
848 _PyUnicode_STATE(unicode).compact = 1;
849 _PyUnicode_STATE(unicode).ready = 1;
850 _PyUnicode_STATE(unicode).ascii = is_ascii;
851 if (is_ascii) {
852 ((char*)data)[size] = 0;
853 _PyUnicode_WSTR(unicode) = NULL;
854 }
855 else if (kind_state == PyUnicode_1BYTE_KIND) {
856 ((char*)data)[size] = 0;
857 _PyUnicode_WSTR(unicode) = NULL;
858 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200859 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200860 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200861 }
862 else {
863 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200864 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200865 if (kind_state == PyUnicode_2BYTE_KIND)
866 ((Py_UCS2*)data)[size] = 0;
867 else /* kind_state == PyUnicode_4BYTE_KIND */
868 ((Py_UCS4*)data)[size] = 0;
869 if (is_sharing) {
870 _PyUnicode_WSTR_LENGTH(unicode) = size;
871 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
872 }
873 else {
874 _PyUnicode_WSTR_LENGTH(unicode) = 0;
875 _PyUnicode_WSTR(unicode) = NULL;
876 }
877 }
878 return obj;
879}
880
#if SIZEOF_WCHAR_T == 2
/* Helper function to convert a 16-bit wchar_t representation to UCS4.  It
   decodes surrogate pairs; the other conversions are implemented as macros
   for efficiency.

   Input is the range [begin, end); output goes to the 4-byte data buffer of
   `unicode`.  A high surrogate immediately followed by a low surrogate is
   combined into one code point; every other unit (including a lone
   surrogate) is copied as is.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyUnicodeObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if ((src + 1) < end
            && src[0] >= 0xD800 && src[0] <= 0xDBFF
            && src[1] >= 0xDC00 && src[1] <= 0xDFFF)
        {
            /* Surrogate pair: two input units, one output code point. */
            *dst++ = (((src[0] & 0x3FF)<<10) | (src[1] & 0x3FF)) + 0x10000;
            src += 2;
        }
        else {
            *dst++ = *src;
            src++;
        }
    }
    /* The output must have been filled exactly. */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
919
Victor Stinnercd9950f2011-10-02 00:34:53 +0200920static int
921_PyUnicode_Dirty(PyObject *unicode)
922{
Victor Stinner910337b2011-10-03 03:20:16 +0200923 assert(_PyUnicode_CHECK(unicode));
Victor Stinnercd9950f2011-10-02 00:34:53 +0200924 if (Py_REFCNT(unicode) != 1) {
Victor Stinner01698042011-10-04 00:04:26 +0200925 PyErr_SetString(PyExc_SystemError,
Victor Stinnercd9950f2011-10-02 00:34:53 +0200926 "Cannot modify a string having more than 1 reference");
927 return -1;
928 }
929 _PyUnicode_DIRTY(unicode);
930 return 0;
931}
932
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200933Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200934PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
935 PyObject *from, Py_ssize_t from_start,
936 Py_ssize_t how_many)
937{
Victor Stinnera0702ab2011-09-29 14:14:38 +0200938 unsigned int from_kind, to_kind;
939 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200940
Victor Stinnerb1536152011-09-30 02:26:10 +0200941 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
942 PyErr_BadInternalCall();
943 return -1;
944 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200945
946 if (PyUnicode_READY(from))
947 return -1;
948 if (PyUnicode_READY(to))
949 return -1;
950
Victor Stinnerff9e50f2011-09-28 22:17:19 +0200951 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200952 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
Victor Stinner01698042011-10-04 00:04:26 +0200953 PyErr_Format(PyExc_SystemError,
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200954 "Cannot write %zi characters at %zi "
955 "in a string of %zi characters",
956 how_many, to_start, PyUnicode_GET_LENGTH(to));
957 return -1;
958 }
Victor Stinnerf5ca1a22011-09-28 23:54:59 +0200959 if (how_many == 0)
960 return 0;
961
Victor Stinnercd9950f2011-10-02 00:34:53 +0200962 if (_PyUnicode_Dirty(to))
Victor Stinnerf5ca1a22011-09-28 23:54:59 +0200963 return -1;
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200964
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200965 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200966 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200967 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200968 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200969
Victor Stinnerf42dc442011-10-02 23:33:16 +0200970 if (from_kind == to_kind
971 /* deny latin1 => ascii */
972 && PyUnicode_MAX_CHAR_VALUE(to) >= PyUnicode_MAX_CHAR_VALUE(from))
973 {
Victor Stinnera0702ab2011-09-29 14:14:38 +0200974 Py_MEMCPY((char*)to_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200975 + PyUnicode_KIND_SIZE(to_kind, to_start),
Victor Stinnera0702ab2011-09-29 14:14:38 +0200976 (char*)from_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200977 + PyUnicode_KIND_SIZE(from_kind, from_start),
978 PyUnicode_KIND_SIZE(to_kind, how_many));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200979 }
Victor Stinnera0702ab2011-09-29 14:14:38 +0200980 else if (from_kind == PyUnicode_1BYTE_KIND
981 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200982 {
983 _PyUnicode_CONVERT_BYTES(
984 Py_UCS1, Py_UCS2,
985 PyUnicode_1BYTE_DATA(from) + from_start,
986 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
987 PyUnicode_2BYTE_DATA(to) + to_start
988 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200989 }
Victor Stinner157f83f2011-09-28 21:41:31 +0200990 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200991 && to_kind == PyUnicode_4BYTE_KIND)
992 {
993 _PyUnicode_CONVERT_BYTES(
994 Py_UCS1, Py_UCS4,
995 PyUnicode_1BYTE_DATA(from) + from_start,
996 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
997 PyUnicode_4BYTE_DATA(to) + to_start
998 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200999 }
1000 else if (from_kind == PyUnicode_2BYTE_KIND
1001 && to_kind == PyUnicode_4BYTE_KIND)
1002 {
1003 _PyUnicode_CONVERT_BYTES(
1004 Py_UCS2, Py_UCS4,
1005 PyUnicode_2BYTE_DATA(from) + from_start,
1006 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1007 PyUnicode_4BYTE_DATA(to) + to_start
1008 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001009 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001010 else {
1011 int invalid_kinds;
Victor Stinnerf42dc442011-10-02 23:33:16 +02001012
1013 /* check if max_char(from substring) <= max_char(to) */
1014 if (from_kind > to_kind
1015 /* latin1 => ascii */
Victor Stinnera3b334d2011-10-03 13:53:37 +02001016 || (PyUnicode_IS_ASCII(to)
Victor Stinnerf42dc442011-10-02 23:33:16 +02001017 && to_kind == PyUnicode_1BYTE_KIND
Victor Stinnera3b334d2011-10-03 13:53:37 +02001018 && !PyUnicode_IS_ASCII(from)))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001019 {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001020 /* slow path to check for character overflow */
1021 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1022 Py_UCS4 ch, maxchar;
1023 Py_ssize_t i;
1024
1025 maxchar = 0;
1026 invalid_kinds = 0;
1027 for (i=0; i < how_many; i++) {
1028 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1029 if (ch > maxchar) {
1030 maxchar = ch;
1031 if (maxchar > to_maxchar) {
1032 invalid_kinds = 1;
1033 break;
1034 }
1035 }
1036 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1037 }
1038 }
1039 else
1040 invalid_kinds = 1;
1041 if (invalid_kinds) {
Victor Stinner01698042011-10-04 00:04:26 +02001042 PyErr_Format(PyExc_SystemError,
Victor Stinnerf42dc442011-10-02 23:33:16 +02001043 "Cannot copy %s characters "
1044 "into a string of %s characters",
1045 unicode_kind_name(from),
1046 unicode_kind_name(to));
Victor Stinnera0702ab2011-09-29 14:14:38 +02001047 return -1;
1048 }
1049 }
1050 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001051}
1052
Victor Stinner17222162011-09-28 22:15:37 +02001053/* Find the maximum code point and count the number of surrogate pairs so a
1054 correct string length can be computed before converting a string to UCS4.
1055 This function counts single surrogates as a character and not as a pair.
1056
1057 Return 0 on success, or -1 on error. */
1058static int
1059find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1060 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001061{
1062 const wchar_t *iter;
1063
Victor Stinnerc53be962011-10-02 21:33:54 +02001064 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001065 if (num_surrogates == NULL || maxchar == NULL) {
1066 PyErr_SetString(PyExc_SystemError,
1067 "unexpected NULL arguments to "
1068 "PyUnicode_FindMaxCharAndNumSurrogatePairs");
1069 return -1;
1070 }
1071
1072 *num_surrogates = 0;
1073 *maxchar = 0;
1074
1075 for (iter = begin; iter < end; ) {
1076 if (*iter > *maxchar)
1077 *maxchar = *iter;
1078#if SIZEOF_WCHAR_T == 2
1079 if (*iter >= 0xD800 && *iter <= 0xDBFF
1080 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
1081 {
1082 Py_UCS4 surrogate_val;
1083 surrogate_val = (((iter[0] & 0x3FF)<<10)
1084 | (iter[1] & 0x3FF)) + 0x10000;
1085 ++(*num_surrogates);
1086 if (surrogate_val > *maxchar)
1087 *maxchar = surrogate_val;
1088 iter += 2;
1089 }
1090 else
1091 iter++;
1092#else
1093 iter++;
1094#endif
1095 }
1096 return 0;
1097}
1098
1099#ifdef Py_DEBUG
1100int unicode_ready_calls = 0;
1101#endif
1102
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001103static int
1104unicode_ready(PyObject **p_obj, int replace)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001105{
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001106 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001107 wchar_t *end;
1108 Py_UCS4 maxchar = 0;
1109 Py_ssize_t num_surrogates;
1110#if SIZEOF_WCHAR_T == 2
1111 Py_ssize_t length_wo_surrogates;
1112#endif
1113
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001114 assert(p_obj != NULL);
1115 unicode = (PyUnicodeObject *)*p_obj;
1116
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001117 /* _PyUnicode_Ready() is only intented for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001118 strings were created using _PyObject_New() and where no canonical
1119 representation (the str field) has been set yet aka strings
1120 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001121 assert(_PyUnicode_CHECK(unicode));
1122 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001123 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001124 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001125 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001126 /* Actually, it should neither be interned nor be anything else: */
1127 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001128
1129#ifdef Py_DEBUG
1130 ++unicode_ready_calls;
1131#endif
1132
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001133#ifdef Py_DEBUG
1134 assert(!replace || Py_REFCNT(unicode) == 1);
1135#else
1136 if (replace && Py_REFCNT(unicode) != 1)
1137 replace = 0;
1138#endif
1139 if (replace) {
1140 Py_ssize_t len = _PyUnicode_WSTR_LENGTH(unicode);
1141 wchar_t *wstr = _PyUnicode_WSTR(unicode);
1142 /* Optimization for empty strings */
1143 if (len == 0) {
1144 Py_INCREF(unicode_empty);
1145 Py_DECREF(*p_obj);
1146 *p_obj = unicode_empty;
1147 return 0;
1148 }
1149 if (len == 1 && wstr[0] < 256) {
1150 PyObject *latin1_char = get_latin1_char((unsigned char)wstr[0]);
1151 if (latin1_char == NULL)
1152 return -1;
1153 Py_DECREF(*p_obj);
1154 *p_obj = latin1_char;
1155 return 0;
1156 }
1157 }
1158
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001159 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001160 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001161 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001162 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001163
1164 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001165 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1166 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001167 PyErr_NoMemory();
1168 return -1;
1169 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001170 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001171 _PyUnicode_WSTR(unicode), end,
1172 PyUnicode_1BYTE_DATA(unicode));
1173 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1174 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1175 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1176 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001177 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001178 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001179 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001180 }
1181 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001182 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001183 _PyUnicode_UTF8(unicode) = NULL;
1184 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001185 }
1186 PyObject_FREE(_PyUnicode_WSTR(unicode));
1187 _PyUnicode_WSTR(unicode) = NULL;
1188 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1189 }
1190 /* In this case we might have to convert down from 4-byte native
1191 wchar_t to 2-byte unicode. */
1192 else if (maxchar < 65536) {
1193 assert(num_surrogates == 0 &&
1194 "FindMaxCharAndNumSurrogatePairs() messed up");
1195
Victor Stinner506f5922011-09-28 22:34:18 +02001196#if SIZEOF_WCHAR_T == 2
1197 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001198 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001199 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1200 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1201 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001202 _PyUnicode_UTF8(unicode) = NULL;
1203 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001204#else
1205 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001206 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001207 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001208 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001209 PyErr_NoMemory();
1210 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001211 }
Victor Stinner506f5922011-09-28 22:34:18 +02001212 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1213 _PyUnicode_WSTR(unicode), end,
1214 PyUnicode_2BYTE_DATA(unicode));
1215 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1216 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1217 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001218 _PyUnicode_UTF8(unicode) = NULL;
1219 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001220 PyObject_FREE(_PyUnicode_WSTR(unicode));
1221 _PyUnicode_WSTR(unicode) = NULL;
1222 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1223#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001224 }
1225 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1226 else {
1227#if SIZEOF_WCHAR_T == 2
1228 /* in case the native representation is 2-bytes, we need to allocate a
1229 new normalized 4-byte version. */
1230 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001231 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1232 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001233 PyErr_NoMemory();
1234 return -1;
1235 }
1236 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1237 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001238 _PyUnicode_UTF8(unicode) = NULL;
1239 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001240 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1241 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001242 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001243 PyObject_FREE(_PyUnicode_WSTR(unicode));
1244 _PyUnicode_WSTR(unicode) = NULL;
1245 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1246#else
1247 assert(num_surrogates == 0);
1248
Victor Stinnerc3c74152011-10-02 20:39:55 +02001249 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001250 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001251 _PyUnicode_UTF8(unicode) = NULL;
1252 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001253 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1254#endif
1255 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1256 }
1257 _PyUnicode_STATE(unicode).ready = 1;
1258 return 0;
1259}
1260
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001261int
1262_PyUnicode_ReadyReplace(PyObject **op)
1263{
1264 return unicode_ready(op, 1);
1265}
1266
1267int
1268_PyUnicode_Ready(PyObject *op)
1269{
1270 return unicode_ready(&op, 0);
1271}
1272
Alexander Belopolsky40018472011-02-26 01:02:56 +00001273static void
1274unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001275{
Walter Dörwald16807132007-05-25 13:52:07 +00001276 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001277 case SSTATE_NOT_INTERNED:
1278 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001279
Benjamin Peterson29060642009-01-31 22:14:21 +00001280 case SSTATE_INTERNED_MORTAL:
1281 /* revive dead object temporarily for DelItem */
1282 Py_REFCNT(unicode) = 3;
1283 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
1284 Py_FatalError(
1285 "deletion of interned string failed");
1286 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001287
Benjamin Peterson29060642009-01-31 22:14:21 +00001288 case SSTATE_INTERNED_IMMORTAL:
1289 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001290
Benjamin Peterson29060642009-01-31 22:14:21 +00001291 default:
1292 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001293 }
1294
Victor Stinner03490912011-10-03 23:45:12 +02001295 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001296 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001297 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001298 PyObject_DEL(_PyUnicode_UTF8(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001299
1300 if (PyUnicode_IS_COMPACT(unicode)) {
1301 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001302 }
1303 else {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001304 if (_PyUnicode_DATA_ANY(unicode))
1305 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Benjamin Peterson29060642009-01-31 22:14:21 +00001306 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001307 }
1308}
1309
Alexander Belopolsky40018472011-02-26 01:02:56 +00001310static int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001311unicode_resizable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001312{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001313 if (Py_REFCNT(unicode) != 1)
1314 return 0;
1315 if (PyUnicode_CHECK_INTERNED(unicode))
1316 return 0;
Benjamin Peterson7f3140e2011-10-03 19:37:29 -04001317 assert(unicode != unicode_empty);
Victor Stinner77bb47b2011-10-03 20:06:05 +02001318#ifdef Py_DEBUG
1319 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND
1320 && PyUnicode_GET_LENGTH(unicode) == 1)
1321 {
1322 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001323 if (ch < 256 && unicode_latin1[ch] == unicode)
1324 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001325 }
Victor Stinner77bb47b2011-10-03 20:06:05 +02001326#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001327 return 1;
1328}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001329
Victor Stinnerfe226c02011-10-03 03:52:20 +02001330static int
1331unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1332{
1333 PyObject *unicode;
1334 Py_ssize_t old_length;
1335
1336 assert(p_unicode != NULL);
1337 unicode = *p_unicode;
1338
1339 assert(unicode != NULL);
1340 assert(PyUnicode_Check(unicode));
1341 assert(0 <= length);
1342
Victor Stinner910337b2011-10-03 03:20:16 +02001343 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001344 old_length = PyUnicode_WSTR_LENGTH(unicode);
1345 else
1346 old_length = PyUnicode_GET_LENGTH(unicode);
1347 if (old_length == length)
1348 return 0;
1349
Victor Stinnerfe226c02011-10-03 03:52:20 +02001350 if (!unicode_resizable(unicode)) {
1351 PyObject *copy = resize_copy(unicode, length);
1352 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001353 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001354 Py_DECREF(*p_unicode);
1355 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001356 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001357 }
1358
Victor Stinnerfe226c02011-10-03 03:52:20 +02001359 if (PyUnicode_IS_COMPACT(unicode)) {
1360 *p_unicode = resize_compact(unicode, length);
1361 if (*p_unicode == NULL)
1362 return -1;
Benjamin Petersonccc51c12011-10-03 19:34:12 -04001363 _PyUnicode_CheckConsistency(*p_unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001364 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001365 }
1366 return resize_inplace((PyUnicodeObject*)unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001367}
1368
Alexander Belopolsky40018472011-02-26 01:02:56 +00001369int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001370PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001371{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001372 PyObject *unicode;
1373 if (p_unicode == NULL) {
1374 PyErr_BadInternalCall();
1375 return -1;
1376 }
1377 unicode = *p_unicode;
1378 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0
1379 || _PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND)
1380 {
1381 PyErr_BadInternalCall();
1382 return -1;
1383 }
1384 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001385}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001386
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001387static PyObject*
1388get_latin1_char(unsigned char ch)
1389{
Victor Stinnera464fc12011-10-02 20:39:30 +02001390 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001391 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001392 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001393 if (!unicode)
1394 return NULL;
1395 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
1396 unicode_latin1[ch] = unicode;
1397 }
1398 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001399 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001400}
1401
Alexander Belopolsky40018472011-02-26 01:02:56 +00001402PyObject *
1403PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001404{
1405 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001406 Py_UCS4 maxchar = 0;
1407 Py_ssize_t num_surrogates;
1408
1409 if (u == NULL)
1410 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001411
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001412 /* If the Unicode data is known at construction time, we can apply
1413 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001414
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001415 /* Optimization for empty strings */
1416 if (size == 0 && unicode_empty != NULL) {
1417 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001418 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001419 }
Tim Petersced69f82003-09-16 20:30:58 +00001420
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001421 /* Single character Unicode objects in the Latin-1 range are
1422 shared when using this constructor */
1423 if (size == 1 && *u < 256)
1424 return get_latin1_char((unsigned char)*u);
1425
1426 /* If not empty and not single character, copy the Unicode data
1427 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001428 if (find_maxchar_surrogates(u, u + size,
1429 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001430 return NULL;
1431
1432 unicode = (PyUnicodeObject *) PyUnicode_New(size - num_surrogates,
1433 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001434 if (!unicode)
1435 return NULL;
1436
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001437 switch (PyUnicode_KIND(unicode)) {
1438 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001439 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001440 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1441 break;
1442 case PyUnicode_2BYTE_KIND:
1443#if Py_UNICODE_SIZE == 2
1444 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1445#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001446 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001447 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1448#endif
1449 break;
1450 case PyUnicode_4BYTE_KIND:
1451#if SIZEOF_WCHAR_T == 2
1452 /* This is the only case which has to process surrogates, thus
1453 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001454 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001455#else
1456 assert(num_surrogates == 0);
1457 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1458#endif
1459 break;
1460 default:
1461 assert(0 && "Impossible state");
1462 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001463
1464 return (PyObject *)unicode;
1465}
1466
Alexander Belopolsky40018472011-02-26 01:02:56 +00001467PyObject *
1468PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001469{
1470 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +00001471
Benjamin Peterson14339b62009-01-31 16:36:08 +00001472 if (size < 0) {
1473 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001474 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001475 return NULL;
1476 }
Christian Heimes33fe8092008-04-13 13:53:33 +00001477
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001478 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +00001479 some optimizations which share commonly used objects.
1480 Also, this means the input must be UTF-8, so fall back to the
1481 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001482 if (u != NULL) {
1483
Benjamin Peterson29060642009-01-31 22:14:21 +00001484 /* Optimization for empty strings */
1485 if (size == 0 && unicode_empty != NULL) {
1486 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001487 return unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001488 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001489
1490 /* Single characters are shared when using this constructor.
1491 Restrict to ASCII, since the input must be UTF-8. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001492 if (size == 1 && Py_CHARMASK(*u) < 128)
1493 return get_latin1_char(Py_CHARMASK(*u));
Martin v. Löwis9c121062007-08-05 20:26:11 +00001494
1495 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001496 }
1497
Walter Dörwald55507312007-05-18 13:12:10 +00001498 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001499 if (!unicode)
1500 return NULL;
1501
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001502 return (PyObject *)unicode;
1503}
1504
Alexander Belopolsky40018472011-02-26 01:02:56 +00001505PyObject *
1506PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001507{
1508 size_t size = strlen(u);
1509 if (size > PY_SSIZE_T_MAX) {
1510 PyErr_SetString(PyExc_OverflowError, "input too long");
1511 return NULL;
1512 }
1513
1514 return PyUnicode_FromStringAndSize(u, size);
1515}
1516
Victor Stinnere57b1c02011-09-28 22:20:48 +02001517static PyObject*
1518_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001519{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001520 PyObject *res;
1521 unsigned char max = 127;
1522 Py_ssize_t i;
1523 for (i = 0; i < size; i++) {
1524 if (u[i] & 0x80) {
1525 max = 255;
1526 break;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001527 }
1528 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001529 res = PyUnicode_New(size, max);
1530 if (!res)
1531 return NULL;
1532 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
1533 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001534}
1535
Victor Stinnere57b1c02011-09-28 22:20:48 +02001536static PyObject*
1537_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001538{
1539 PyObject *res;
1540 Py_UCS2 max = 0;
1541 Py_ssize_t i;
1542 for (i = 0; i < size; i++)
1543 if (u[i] > max)
1544 max = u[i];
1545 res = PyUnicode_New(size, max);
1546 if (!res)
1547 return NULL;
1548 if (max >= 256)
1549 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
1550 else
1551 for (i = 0; i < size; i++)
1552 PyUnicode_1BYTE_DATA(res)[i] = (Py_UCS1)u[i];
1553 return res;
1554}
1555
Victor Stinnere57b1c02011-09-28 22:20:48 +02001556static PyObject*
1557_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001558{
1559 PyObject *res;
1560 Py_UCS4 max = 0;
1561 Py_ssize_t i;
1562 for (i = 0; i < size; i++)
1563 if (u[i] > max)
1564 max = u[i];
1565 res = PyUnicode_New(size, max);
1566 if (!res)
1567 return NULL;
1568 if (max >= 0x10000)
1569 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
1570 else {
1571 int kind = PyUnicode_KIND(res);
1572 void *data = PyUnicode_DATA(res);
1573 for (i = 0; i < size; i++)
1574 PyUnicode_WRITE(kind, data, i, u[i]);
1575 }
1576 return res;
1577}
1578
1579PyObject*
1580PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1581{
1582 switch(kind) {
1583 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001584 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001585 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001586 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001587 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001588 return _PyUnicode_FromUCS4(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001589 }
Victor Stinner01698042011-10-04 00:04:26 +02001590 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001591 return NULL;
1592}
1593
Victor Stinner034f6cf2011-09-30 02:26:44 +02001594PyObject*
1595PyUnicode_Copy(PyObject *unicode)
1596{
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001597 Py_ssize_t size;
1598 PyObject *copy;
1599 void *data;
1600
Victor Stinner034f6cf2011-09-30 02:26:44 +02001601 if (!PyUnicode_Check(unicode)) {
1602 PyErr_BadInternalCall();
1603 return NULL;
1604 }
1605 if (PyUnicode_READY(unicode))
1606 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001607
1608 size = PyUnicode_GET_LENGTH(unicode);
1609 copy = PyUnicode_New(size, PyUnicode_MAX_CHAR_VALUE(unicode));
1610 if (!copy)
1611 return NULL;
1612 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1613
1614 data = PyUnicode_DATA(unicode);
1615 switch (PyUnicode_KIND(unicode))
1616 {
1617 case PyUnicode_1BYTE_KIND:
1618 memcpy(PyUnicode_1BYTE_DATA(copy), data, size);
1619 break;
1620 case PyUnicode_2BYTE_KIND:
1621 memcpy(PyUnicode_2BYTE_DATA(copy), data, sizeof(Py_UCS2) * size);
1622 break;
1623 case PyUnicode_4BYTE_KIND:
1624 memcpy(PyUnicode_4BYTE_DATA(copy), data, sizeof(Py_UCS4) * size);
1625 break;
1626 default:
1627 assert(0);
1628 break;
1629 }
1630 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02001631}
1632
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001633
Victor Stinnerbc603d12011-10-02 01:00:40 +02001634/* Widen Unicode objects to larger buffers. Don't write terminating null
1635 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001636
1637void*
1638_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1639{
Victor Stinnerbc603d12011-10-02 01:00:40 +02001640 Py_ssize_t len;
1641 void *result;
1642 unsigned int skind;
1643
1644 if (PyUnicode_READY(s))
1645 return NULL;
1646
1647 len = PyUnicode_GET_LENGTH(s);
1648 skind = PyUnicode_KIND(s);
1649 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02001650 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001651 return NULL;
1652 }
1653 switch(kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02001654 case PyUnicode_2BYTE_KIND:
1655 result = PyMem_Malloc(len * sizeof(Py_UCS2));
1656 if (!result)
1657 return PyErr_NoMemory();
1658 assert(skind == PyUnicode_1BYTE_KIND);
1659 _PyUnicode_CONVERT_BYTES(
1660 Py_UCS1, Py_UCS2,
1661 PyUnicode_1BYTE_DATA(s),
1662 PyUnicode_1BYTE_DATA(s) + len,
1663 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001664 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001665 case PyUnicode_4BYTE_KIND:
1666 result = PyMem_Malloc(len * sizeof(Py_UCS4));
1667 if (!result)
1668 return PyErr_NoMemory();
1669 if (skind == PyUnicode_2BYTE_KIND) {
1670 _PyUnicode_CONVERT_BYTES(
1671 Py_UCS2, Py_UCS4,
1672 PyUnicode_2BYTE_DATA(s),
1673 PyUnicode_2BYTE_DATA(s) + len,
1674 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001675 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02001676 else {
1677 assert(skind == PyUnicode_1BYTE_KIND);
1678 _PyUnicode_CONVERT_BYTES(
1679 Py_UCS1, Py_UCS4,
1680 PyUnicode_1BYTE_DATA(s),
1681 PyUnicode_1BYTE_DATA(s) + len,
1682 result);
1683 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001684 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001685 default:
1686 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001687 }
Victor Stinner01698042011-10-04 00:04:26 +02001688 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001689 return NULL;
1690}
1691
1692static Py_UCS4*
1693as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1694 int copy_null)
1695{
1696 int kind;
1697 void *data;
1698 Py_ssize_t len, targetlen;
1699 if (PyUnicode_READY(string) == -1)
1700 return NULL;
1701 kind = PyUnicode_KIND(string);
1702 data = PyUnicode_DATA(string);
1703 len = PyUnicode_GET_LENGTH(string);
1704 targetlen = len;
1705 if (copy_null)
1706 targetlen++;
1707 if (!target) {
1708 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
1709 PyErr_NoMemory();
1710 return NULL;
1711 }
1712 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
1713 if (!target) {
1714 PyErr_NoMemory();
1715 return NULL;
1716 }
1717 }
1718 else {
1719 if (targetsize < targetlen) {
1720 PyErr_Format(PyExc_SystemError,
1721 "string is longer than the buffer");
1722 if (copy_null && 0 < targetsize)
1723 target[0] = 0;
1724 return NULL;
1725 }
1726 }
1727 if (kind != PyUnicode_4BYTE_KIND) {
1728 Py_ssize_t i;
1729 for (i = 0; i < len; i++)
1730 target[i] = PyUnicode_READ(kind, data, i);
1731 }
1732 else
1733 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
1734 if (copy_null)
1735 target[len] = 0;
1736 return target;
1737}
1738
1739Py_UCS4*
1740PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1741 int copy_null)
1742{
1743 if (target == NULL || targetsize < 1) {
1744 PyErr_BadInternalCall();
1745 return NULL;
1746 }
1747 return as_ucs4(string, target, targetsize, copy_null);
1748}
1749
1750Py_UCS4*
1751PyUnicode_AsUCS4Copy(PyObject *string)
1752{
1753 return as_ucs4(string, NULL, 0, 1);
1754}
1755
#ifdef HAVE_WCHAR_H

/* Create a Unicode object from the wchar_t buffer `w`.

   A `size` of -1 means "measure with wcslen()".  With w == NULL the
   only accepted size is 0, which yields PyUnicode_New(0, 0); any other
   size is reported through PyErr_BadInternalCall().  The conversion
   itself is delegated to PyUnicode_FromUnicode(). */
PyObject *
PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
{
    if (w != NULL) {
        Py_ssize_t length = size;
        if (length == -1)
            length = wcslen(w);
        return PyUnicode_FromUnicode(w, length);
    }

    if (size != 0) {
        PyErr_BadInternalCall();
        return NULL;
    }
    return PyUnicode_New(0, 0);
}

#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00001776
Walter Dörwald346737f2007-05-31 10:44:43 +00001777static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001778makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
1779 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00001780{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001781 *fmt++ = '%';
1782 if (width) {
1783 if (zeropad)
1784 *fmt++ = '0';
1785 fmt += sprintf(fmt, "%d", width);
1786 }
1787 if (precision)
1788 fmt += sprintf(fmt, ".%d", precision);
1789 if (longflag)
1790 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001791 else if (longlongflag) {
1792 /* longlongflag should only ever be nonzero on machines with
1793 HAVE_LONG_LONG defined */
1794#ifdef HAVE_LONG_LONG
1795 char *f = PY_FORMAT_LONG_LONG;
1796 while (*f)
1797 *fmt++ = *f++;
1798#else
1799 /* we shouldn't ever get here */
1800 assert(0);
1801 *fmt++ = 'l';
1802#endif
1803 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001804 else if (size_tflag) {
1805 char *f = PY_FORMAT_SIZE_T;
1806 while (*f)
1807 *fmt++ = *f++;
1808 }
1809 *fmt++ = c;
1810 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00001811}
1812
Victor Stinner96865452011-03-01 23:44:09 +00001813/* helper for PyUnicode_FromFormatV() */
1814
1815static const char*
1816parse_format_flags(const char *f,
1817 int *p_width, int *p_precision,
1818 int *p_longflag, int *p_longlongflag, int *p_size_tflag)
1819{
1820 int width, precision, longflag, longlongflag, size_tflag;
1821
1822 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
1823 f++;
1824 width = 0;
1825 while (Py_ISDIGIT((unsigned)*f))
1826 width = (width*10) + *f++ - '0';
1827 precision = 0;
1828 if (*f == '.') {
1829 f++;
1830 while (Py_ISDIGIT((unsigned)*f))
1831 precision = (precision*10) + *f++ - '0';
1832 if (*f == '%') {
1833 /* "%.3%s" => f points to "3" */
1834 f--;
1835 }
1836 }
1837 if (*f == '\0') {
1838 /* bogus format "%.1" => go backward, f points to "1" */
1839 f--;
1840 }
1841 if (p_width != NULL)
1842 *p_width = width;
1843 if (p_precision != NULL)
1844 *p_precision = precision;
1845
1846 /* Handle %ld, %lu, %lld and %llu. */
1847 longflag = 0;
1848 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00001849 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00001850
1851 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00001852 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00001853 longflag = 1;
1854 ++f;
1855 }
1856#ifdef HAVE_LONG_LONG
1857 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00001858 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00001859 longlongflag = 1;
1860 f += 2;
1861 }
1862#endif
1863 }
1864 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00001865 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00001866 size_tflag = 1;
1867 ++f;
1868 }
1869 if (p_longflag != NULL)
1870 *p_longflag = longflag;
1871 if (p_longlongflag != NULL)
1872 *p_longlongflag = longlongflag;
1873 if (p_size_tflag != NULL)
1874 *p_size_tflag = size_tflag;
1875 return f;
1876}
1877
/* Upper bound on the number of characters produced by %ld: 21 covers a
   64-bit integer printed in decimal together with an optional sign. */
#define MAX_LONG_CHARS 21
/* Upper bound on the number of characters produced by %lld.
   At most ceil(log10(256) * SIZEOF_LONG_LONG) digits are needed, plus
   one character for the sign.  53/22 is an upper bound for log10(256);
   the ceiling is computed as 1 + (x - 1) / 22 and the sign accounts
   for the second 1 in the leading 2. */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG * 53 - 1) / 22)
1885
Walter Dörwaldd2034312007-05-18 16:29:38 +00001886PyObject *
1887PyUnicode_FromFormatV(const char *format, va_list vargs)
1888{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001889 va_list count;
1890 Py_ssize_t callcount = 0;
1891 PyObject **callresults = NULL;
1892 PyObject **callresult = NULL;
1893 Py_ssize_t n = 0;
1894 int width = 0;
1895 int precision = 0;
1896 int zeropad;
1897 const char* f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001898 PyUnicodeObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001899 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001900 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001901 Py_UCS4 maxchar = 127; /* result is ASCII by default */
1902 Py_UCS4 argmaxchar;
1903 Py_ssize_t numbersize = 0;
1904 char *numberresults = NULL;
1905 char *numberresult = NULL;
1906 Py_ssize_t i;
1907 int kind;
1908 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001909
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001910 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001911 /* step 1: count the number of %S/%R/%A/%s format specifications
1912 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
1913 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001914 * result in an array)
1915 * also esimate a upper bound for all the number formats in the string,
1916 * numbers will be formated in step 3 and be keept in a '\0'-separated
1917 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00001918 for (f = format; *f; f++) {
1919 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00001920 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001921 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
1922 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
1923 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
1924 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001925
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001926 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001927#ifdef HAVE_LONG_LONG
1928 if (longlongflag) {
1929 if (width < MAX_LONG_LONG_CHARS)
1930 width = MAX_LONG_LONG_CHARS;
1931 }
1932 else
1933#endif
1934 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
1935 including sign. Decimal takes the most space. This
1936 isn't enough for octal. If a width is specified we
1937 need more (which we allocate later). */
1938 if (width < MAX_LONG_CHARS)
1939 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001940
1941 /* account for the size + '\0' to separate numbers
1942 inside of the numberresults buffer */
1943 numbersize += (width + 1);
1944 }
1945 }
1946 else if ((unsigned char)*f > 127) {
1947 PyErr_Format(PyExc_ValueError,
1948 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
1949 "string, got a non-ASCII byte: 0x%02x",
1950 (unsigned char)*f);
1951 return NULL;
1952 }
1953 }
1954 /* step 2: allocate memory for the results of
1955 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
1956 if (callcount) {
1957 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
1958 if (!callresults) {
1959 PyErr_NoMemory();
1960 return NULL;
1961 }
1962 callresult = callresults;
1963 }
1964 /* step 2.5: allocate memory for the results of formating numbers */
1965 if (numbersize) {
1966 numberresults = PyObject_Malloc(numbersize);
1967 if (!numberresults) {
1968 PyErr_NoMemory();
1969 goto fail;
1970 }
1971 numberresult = numberresults;
1972 }
1973
1974 /* step 3: format numbers and figure out how large a buffer we need */
1975 for (f = format; *f; f++) {
1976 if (*f == '%') {
1977 const char* p;
1978 int longflag;
1979 int longlongflag;
1980 int size_tflag;
1981 int numprinted;
1982
1983 p = f;
1984 zeropad = (f[1] == '0');
1985 f = parse_format_flags(f, &width, &precision,
1986 &longflag, &longlongflag, &size_tflag);
1987 switch (*f) {
1988 case 'c':
1989 {
1990 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001991 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001992 n++;
1993 break;
1994 }
1995 case '%':
1996 n++;
1997 break;
1998 case 'i':
1999 case 'd':
2000 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2001 width, precision, *f);
2002 if (longflag)
2003 numprinted = sprintf(numberresult, fmt,
2004 va_arg(count, long));
2005#ifdef HAVE_LONG_LONG
2006 else if (longlongflag)
2007 numprinted = sprintf(numberresult, fmt,
2008 va_arg(count, PY_LONG_LONG));
2009#endif
2010 else if (size_tflag)
2011 numprinted = sprintf(numberresult, fmt,
2012 va_arg(count, Py_ssize_t));
2013 else
2014 numprinted = sprintf(numberresult, fmt,
2015 va_arg(count, int));
2016 n += numprinted;
2017 /* advance by +1 to skip over the '\0' */
2018 numberresult += (numprinted + 1);
2019 assert(*(numberresult - 1) == '\0');
2020 assert(*(numberresult - 2) != '\0');
2021 assert(numprinted >= 0);
2022 assert(numberresult <= numberresults + numbersize);
2023 break;
2024 case 'u':
2025 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2026 width, precision, 'u');
2027 if (longflag)
2028 numprinted = sprintf(numberresult, fmt,
2029 va_arg(count, unsigned long));
2030#ifdef HAVE_LONG_LONG
2031 else if (longlongflag)
2032 numprinted = sprintf(numberresult, fmt,
2033 va_arg(count, unsigned PY_LONG_LONG));
2034#endif
2035 else if (size_tflag)
2036 numprinted = sprintf(numberresult, fmt,
2037 va_arg(count, size_t));
2038 else
2039 numprinted = sprintf(numberresult, fmt,
2040 va_arg(count, unsigned int));
2041 n += numprinted;
2042 numberresult += (numprinted + 1);
2043 assert(*(numberresult - 1) == '\0');
2044 assert(*(numberresult - 2) != '\0');
2045 assert(numprinted >= 0);
2046 assert(numberresult <= numberresults + numbersize);
2047 break;
2048 case 'x':
2049 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2050 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2051 n += numprinted;
2052 numberresult += (numprinted + 1);
2053 assert(*(numberresult - 1) == '\0');
2054 assert(*(numberresult - 2) != '\0');
2055 assert(numprinted >= 0);
2056 assert(numberresult <= numberresults + numbersize);
2057 break;
2058 case 'p':
2059 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2060 /* %p is ill-defined: ensure leading 0x. */
2061 if (numberresult[1] == 'X')
2062 numberresult[1] = 'x';
2063 else if (numberresult[1] != 'x') {
2064 memmove(numberresult + 2, numberresult,
2065 strlen(numberresult) + 1);
2066 numberresult[0] = '0';
2067 numberresult[1] = 'x';
2068 numprinted += 2;
2069 }
2070 n += numprinted;
2071 numberresult += (numprinted + 1);
2072 assert(*(numberresult - 1) == '\0');
2073 assert(*(numberresult - 2) != '\0');
2074 assert(numprinted >= 0);
2075 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002076 break;
2077 case 's':
2078 {
2079 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002080 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002081 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
2082 if (!str)
2083 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002084 /* since PyUnicode_DecodeUTF8 returns already flexible
2085 unicode objects, there is no need to call ready on them */
2086 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002087 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002088 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002089 /* Remember the str and switch to the next slot */
2090 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002091 break;
2092 }
2093 case 'U':
2094 {
2095 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002096 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002097 if (PyUnicode_READY(obj) == -1)
2098 goto fail;
2099 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002100 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002101 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002102 break;
2103 }
2104 case 'V':
2105 {
2106 PyObject *obj = va_arg(count, PyObject *);
2107 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002108 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002109 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002110 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002111 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002112 if (PyUnicode_READY(obj) == -1)
2113 goto fail;
2114 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002115 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002116 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002117 *callresult++ = NULL;
2118 }
2119 else {
2120 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
2121 if (!str_obj)
2122 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002123 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002124 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002125 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002126 *callresult++ = str_obj;
2127 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002128 break;
2129 }
2130 case 'S':
2131 {
2132 PyObject *obj = va_arg(count, PyObject *);
2133 PyObject *str;
2134 assert(obj);
2135 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002136 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002137 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002138 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002139 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002140 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002141 /* Remember the str and switch to the next slot */
2142 *callresult++ = str;
2143 break;
2144 }
2145 case 'R':
2146 {
2147 PyObject *obj = va_arg(count, PyObject *);
2148 PyObject *repr;
2149 assert(obj);
2150 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002151 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002152 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002153 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002154 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002155 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002156 /* Remember the repr and switch to the next slot */
2157 *callresult++ = repr;
2158 break;
2159 }
2160 case 'A':
2161 {
2162 PyObject *obj = va_arg(count, PyObject *);
2163 PyObject *ascii;
2164 assert(obj);
2165 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002166 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002167 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002168 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002169 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002170 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002171 /* Remember the repr and switch to the next slot */
2172 *callresult++ = ascii;
2173 break;
2174 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002175 default:
2176 /* if we stumble upon an unknown
2177 formatting code, copy the rest of
2178 the format string to the output
2179 string. (we cannot just skip the
2180 code, since there's no way to know
2181 what's in the argument list) */
2182 n += strlen(p);
2183 goto expand;
2184 }
2185 } else
2186 n++;
2187 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002188 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002189 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002190 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002191 we don't have to resize the string.
2192 There can be no errors beyond this point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002193 string = (PyUnicodeObject *)PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002194 if (!string)
2195 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002196 kind = PyUnicode_KIND(string);
2197 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002198 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002199 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002200
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002201 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002202 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002203 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002204
2205 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002206 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2207 /* checking for == because the last argument could be a empty
2208 string, which causes i to point to end, the assert at the end of
2209 the loop */
2210 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002211
Benjamin Peterson14339b62009-01-31 16:36:08 +00002212 switch (*f) {
2213 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002214 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002215 const int ordinal = va_arg(vargs, int);
2216 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002217 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002218 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002219 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002220 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002221 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002222 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002223 case 'p':
2224 /* unused, since we already have the result */
2225 if (*f == 'p')
2226 (void) va_arg(vargs, void *);
2227 else
2228 (void) va_arg(vargs, int);
2229 /* extract the result from numberresults and append. */
2230 for (; *numberresult; ++i, ++numberresult)
2231 PyUnicode_WRITE(kind, data, i, *numberresult);
2232 /* skip over the separating '\0' */
2233 assert(*numberresult == '\0');
2234 numberresult++;
2235 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002236 break;
2237 case 's':
2238 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002239 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002240 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002241 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002242 size = PyUnicode_GET_LENGTH(*callresult);
2243 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002244 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2245 *callresult, 0,
2246 size) < 0)
2247 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002248 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002249 /* We're done with the unicode()/repr() => forget it */
2250 Py_DECREF(*callresult);
2251 /* switch to next unicode()/repr() result */
2252 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002253 break;
2254 }
2255 case 'U':
2256 {
2257 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002258 Py_ssize_t size;
2259 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2260 size = PyUnicode_GET_LENGTH(obj);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002261 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2262 obj, 0,
2263 size) < 0)
2264 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002265 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002266 break;
2267 }
2268 case 'V':
2269 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002270 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002271 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002272 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002273 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002274 size = PyUnicode_GET_LENGTH(obj);
2275 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002276 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2277 obj, 0,
2278 size) < 0)
2279 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002280 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002281 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002282 size = PyUnicode_GET_LENGTH(*callresult);
2283 assert(PyUnicode_KIND(*callresult) <=
2284 PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002285 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2286 *callresult,
2287 0, size) < 0)
2288 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002289 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002290 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002291 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002292 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002293 break;
2294 }
2295 case 'S':
2296 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002297 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002298 {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002299 /* unused, since we already have the result */
2300 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002301 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002302 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2303 *callresult, 0,
2304 PyUnicode_GET_LENGTH(*callresult)) < 0)
2305 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002306 i += PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002307 /* We're done with the unicode()/repr() => forget it */
2308 Py_DECREF(*callresult);
2309 /* switch to next unicode()/repr() result */
2310 ++callresult;
2311 break;
2312 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002313 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002314 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002315 break;
2316 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002317 for (; *p; ++p, ++i)
2318 PyUnicode_WRITE(kind, data, i, *p);
2319 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002320 goto end;
2321 }
Victor Stinner1205f272010-09-11 00:54:47 +00002322 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002323 else {
2324 assert(i < PyUnicode_GET_LENGTH(string));
2325 PyUnicode_WRITE(kind, data, i++, *f);
2326 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002327 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002328 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002329
Benjamin Peterson29060642009-01-31 22:14:21 +00002330 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002331 if (callresults)
2332 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002333 if (numberresults)
2334 PyObject_Free(numberresults);
2335 return (PyObject *)string;
Benjamin Peterson29060642009-01-31 22:14:21 +00002336 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002337 if (callresults) {
2338 PyObject **callresult2 = callresults;
2339 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002340 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002341 ++callresult2;
2342 }
2343 PyObject_Free(callresults);
2344 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002345 if (numberresults)
2346 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002347 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002348}
2349
Walter Dörwaldd2034312007-05-18 16:29:38 +00002350PyObject *
2351PyUnicode_FromFormat(const char *format, ...)
2352{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002353 PyObject* ret;
2354 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002355
2356#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002357 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002358#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002359 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002360#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002361 ret = PyUnicode_FromFormatV(format, vargs);
2362 va_end(vargs);
2363 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002364}
2365
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002366#ifdef HAVE_WCHAR_H
2367
Victor Stinner5593d8a2010-10-02 11:11:27 +00002368/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2369 convert a Unicode object to a wide character string.
2370
Victor Stinnerd88d9832011-09-06 02:00:05 +02002371 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002372 character) required to convert the unicode object. Ignore size argument.
2373
Victor Stinnerd88d9832011-09-06 02:00:05 +02002374 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002375 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002376 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002377static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00002378unicode_aswidechar(PyUnicodeObject *unicode,
2379 wchar_t *w,
2380 Py_ssize_t size)
2381{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002382 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002383 const wchar_t *wstr;
2384
2385 wstr = PyUnicode_AsUnicodeAndSize((PyObject *)unicode, &res);
2386 if (wstr == NULL)
2387 return -1;
2388
Victor Stinner5593d8a2010-10-02 11:11:27 +00002389 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002390 if (size > res)
2391 size = res + 1;
2392 else
2393 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002394 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002395 return res;
2396 }
2397 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002398 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002399}
2400
2401Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002402PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002403 wchar_t *w,
2404 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002405{
2406 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002407 PyErr_BadInternalCall();
2408 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002409 }
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002410 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002411}
2412
Victor Stinner137c34c2010-09-29 10:25:54 +00002413wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002414PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002415 Py_ssize_t *size)
2416{
2417 wchar_t* buffer;
2418 Py_ssize_t buflen;
2419
2420 if (unicode == NULL) {
2421 PyErr_BadInternalCall();
2422 return NULL;
2423 }
2424
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002425 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002426 if (buflen == -1)
2427 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002428 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002429 PyErr_NoMemory();
2430 return NULL;
2431 }
2432
Victor Stinner137c34c2010-09-29 10:25:54 +00002433 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2434 if (buffer == NULL) {
2435 PyErr_NoMemory();
2436 return NULL;
2437 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002438 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002439 if (buflen == -1)
2440 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002441 if (size != NULL)
2442 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002443 return buffer;
2444}
2445
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002446#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002447
Alexander Belopolsky40018472011-02-26 01:02:56 +00002448PyObject *
2449PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002450{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002451 PyObject *v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002452 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002453 PyErr_SetString(PyExc_ValueError,
2454 "chr() arg not in range(0x110000)");
2455 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002456 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002457
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002458 if (ordinal < 256)
2459 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002460
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002461 v = PyUnicode_New(1, ordinal);
2462 if (v == NULL)
2463 return NULL;
2464 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
2465 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002466}
2467
Alexander Belopolsky40018472011-02-26 01:02:56 +00002468PyObject *
2469PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002470{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002471 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002472 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002473 if (PyUnicode_CheckExact(obj)) {
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002474 if (PyUnicode_READY(obj))
2475 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002476 Py_INCREF(obj);
2477 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002478 }
2479 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002480 /* For a Unicode subtype that's not a Unicode object,
2481 return a true Unicode object with the same data. */
Victor Stinner2219e0a2011-10-01 01:16:59 +02002482 return PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002483 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002484 PyErr_Format(PyExc_TypeError,
2485 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002486 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002487 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002488}
2489
Alexander Belopolsky40018472011-02-26 01:02:56 +00002490PyObject *
2491PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002492 const char *encoding,
2493 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002494{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002495 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002496 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002497
Guido van Rossumd57fd912000-03-10 22:53:23 +00002498 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002499 PyErr_BadInternalCall();
2500 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002501 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002502
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002503 /* Decoding bytes objects is the most common case and should be fast */
2504 if (PyBytes_Check(obj)) {
2505 if (PyBytes_GET_SIZE(obj) == 0) {
2506 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002507 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002508 }
2509 else {
2510 v = PyUnicode_Decode(
2511 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2512 encoding, errors);
2513 }
2514 return v;
2515 }
2516
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002517 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002518 PyErr_SetString(PyExc_TypeError,
2519 "decoding str is not supported");
2520 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002521 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002522
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002523 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2524 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2525 PyErr_Format(PyExc_TypeError,
2526 "coercing to str: need bytes, bytearray "
2527 "or buffer-like object, %.80s found",
2528 Py_TYPE(obj)->tp_name);
2529 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002530 }
Tim Petersced69f82003-09-16 20:30:58 +00002531
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002532 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002533 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002534 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002535 }
Tim Petersced69f82003-09-16 20:30:58 +00002536 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002537 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002538
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002539 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002540 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002541}
2542
/* Write a normalized copy of encoding into lower: upper case letters become
   lower case and '_' becomes '-', so that spellings such as UTF_8 are
   recognized.  lower_len is the capacity of lower including the terminating
   null byte.  Return 1 on success, 0 if encoding is longer than
   lower_len-1. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src = encoding;
    char *dst = lower;
    /* Last usable slot; it is reserved for the terminating null byte. */
    char *const last = &lower[lower_len - 1];

    for (; *src != '\0'; src++, dst++) {
        if (dst == last)
            return 0;
        if (Py_ISUPPER(*src))
            *dst = Py_TOLOWER(*src);
        else if (*src == '_')
            *dst = '-';
        else
            *dst = *src;
    }
    *dst = '\0';
    return 1;
}
2575
Alexander Belopolsky40018472011-02-26 01:02:56 +00002576PyObject *
2577PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002578 Py_ssize_t size,
2579 const char *encoding,
2580 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002581{
2582 PyObject *buffer = NULL, *unicode;
2583 Py_buffer info;
2584 char lower[11]; /* Enough for any encoding shortcut */
2585
2586 if (encoding == NULL)
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002587 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00002588
2589 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002590 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002591 if ((strcmp(lower, "utf-8") == 0) ||
2592 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002593 return PyUnicode_DecodeUTF8(s, size, errors);
2594 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002595 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002596 (strcmp(lower, "iso-8859-1") == 0))
2597 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002598#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002599 else if (strcmp(lower, "mbcs") == 0)
2600 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002601#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002602 else if (strcmp(lower, "ascii") == 0)
2603 return PyUnicode_DecodeASCII(s, size, errors);
2604 else if (strcmp(lower, "utf-16") == 0)
2605 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2606 else if (strcmp(lower, "utf-32") == 0)
2607 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2608 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002609
2610 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002611 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002612 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002613 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002614 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002615 if (buffer == NULL)
2616 goto onError;
2617 unicode = PyCodec_Decode(buffer, encoding, errors);
2618 if (unicode == NULL)
2619 goto onError;
2620 if (!PyUnicode_Check(unicode)) {
2621 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002622 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002623 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002624 Py_DECREF(unicode);
2625 goto onError;
2626 }
2627 Py_DECREF(buffer);
Victor Stinner17efeed2011-10-04 20:05:46 +02002628#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02002629 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002630 Py_DECREF(unicode);
2631 return NULL;
2632 }
Victor Stinner17efeed2011-10-04 20:05:46 +02002633#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002634 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002635
Benjamin Peterson29060642009-01-31 22:14:21 +00002636 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002637 Py_XDECREF(buffer);
2638 return NULL;
2639}
2640
Alexander Belopolsky40018472011-02-26 01:02:56 +00002641PyObject *
2642PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002643 const char *encoding,
2644 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002645{
2646 PyObject *v;
2647
2648 if (!PyUnicode_Check(unicode)) {
2649 PyErr_BadArgument();
2650 goto onError;
2651 }
2652
2653 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002654 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002655
2656 /* Decode via the codec registry */
2657 v = PyCodec_Decode(unicode, encoding, errors);
2658 if (v == NULL)
2659 goto onError;
2660 return v;
2661
Benjamin Peterson29060642009-01-31 22:14:21 +00002662 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002663 return NULL;
2664}
2665
Alexander Belopolsky40018472011-02-26 01:02:56 +00002666PyObject *
2667PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002668 const char *encoding,
2669 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002670{
2671 PyObject *v;
2672
2673 if (!PyUnicode_Check(unicode)) {
2674 PyErr_BadArgument();
2675 goto onError;
2676 }
2677
2678 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002679 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002680
2681 /* Decode via the codec registry */
2682 v = PyCodec_Decode(unicode, encoding, errors);
2683 if (v == NULL)
2684 goto onError;
2685 if (!PyUnicode_Check(v)) {
2686 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002687 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002688 Py_TYPE(v)->tp_name);
2689 Py_DECREF(v);
2690 goto onError;
2691 }
2692 return v;
2693
Benjamin Peterson29060642009-01-31 22:14:21 +00002694 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002695 return NULL;
2696}
2697
Alexander Belopolsky40018472011-02-26 01:02:56 +00002698PyObject *
2699PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002700 Py_ssize_t size,
2701 const char *encoding,
2702 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002703{
2704 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002705
Guido van Rossumd57fd912000-03-10 22:53:23 +00002706 unicode = PyUnicode_FromUnicode(s, size);
2707 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002708 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002709 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
2710 Py_DECREF(unicode);
2711 return v;
2712}
2713
Alexander Belopolsky40018472011-02-26 01:02:56 +00002714PyObject *
2715PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002716 const char *encoding,
2717 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002718{
2719 PyObject *v;
2720
2721 if (!PyUnicode_Check(unicode)) {
2722 PyErr_BadArgument();
2723 goto onError;
2724 }
2725
2726 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002727 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002728
2729 /* Encode via the codec registry */
2730 v = PyCodec_Encode(unicode, encoding, errors);
2731 if (v == NULL)
2732 goto onError;
2733 return v;
2734
Benjamin Peterson29060642009-01-31 22:14:21 +00002735 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002736 return NULL;
2737}
2738
Victor Stinnerad158722010-10-27 00:25:46 +00002739PyObject *
2740PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00002741{
Victor Stinner99b95382011-07-04 14:23:54 +02002742#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002743 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2744 PyUnicode_GET_SIZE(unicode),
2745 NULL);
2746#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002747 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00002748#else
Victor Stinner793b5312011-04-27 00:24:21 +02002749 PyInterpreterState *interp = PyThreadState_GET()->interp;
2750 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2751 cannot use it to encode and decode filenames before it is loaded. Load
2752 the Python codec requires to encode at least its own filename. Use the C
2753 version of the locale codec until the codec registry is initialized and
2754 the Python codec is loaded.
2755
2756 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2757 cannot only rely on it: check also interp->fscodec_initialized for
2758 subinterpreters. */
2759 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00002760 return PyUnicode_AsEncodedString(unicode,
2761 Py_FileSystemDefaultEncoding,
2762 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00002763 }
2764 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002765 /* locale encoding with surrogateescape */
2766 wchar_t *wchar;
2767 char *bytes;
2768 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00002769 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002770
2771 wchar = PyUnicode_AsWideCharString(unicode, NULL);
2772 if (wchar == NULL)
2773 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002774 bytes = _Py_wchar2char(wchar, &error_pos);
2775 if (bytes == NULL) {
2776 if (error_pos != (size_t)-1) {
2777 char *errmsg = strerror(errno);
2778 PyObject *exc = NULL;
2779 if (errmsg == NULL)
2780 errmsg = "Py_wchar2char() failed";
2781 raise_encode_exception(&exc,
2782 "filesystemencoding",
2783 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
2784 error_pos, error_pos+1,
2785 errmsg);
2786 Py_XDECREF(exc);
2787 }
2788 else
2789 PyErr_NoMemory();
2790 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002791 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002792 }
2793 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002794
2795 bytes_obj = PyBytes_FromString(bytes);
2796 PyMem_Free(bytes);
2797 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00002798 }
Victor Stinnerad158722010-10-27 00:25:46 +00002799#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00002800}
2801
Alexander Belopolsky40018472011-02-26 01:02:56 +00002802PyObject *
2803PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002804 const char *encoding,
2805 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002806{
2807 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00002808 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00002809
Guido van Rossumd57fd912000-03-10 22:53:23 +00002810 if (!PyUnicode_Check(unicode)) {
2811 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002812 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002813 }
Fred Drakee4315f52000-05-09 19:53:39 +00002814
Victor Stinner2f283c22011-03-02 01:21:46 +00002815 if (encoding == NULL) {
2816 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002817 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002818 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002819 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner2f283c22011-03-02 01:21:46 +00002820 }
Fred Drakee4315f52000-05-09 19:53:39 +00002821
2822 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002823 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002824 if ((strcmp(lower, "utf-8") == 0) ||
2825 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00002826 {
Victor Stinner2f283c22011-03-02 01:21:46 +00002827 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002828 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002829 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002830 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00002831 }
Victor Stinner37296e82010-06-10 13:36:23 +00002832 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002833 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002834 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002835 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002836#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002837 else if (strcmp(lower, "mbcs") == 0)
2838 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2839 PyUnicode_GET_SIZE(unicode),
2840 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002841#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002842 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002843 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00002844 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002845
2846 /* Encode via the codec registry */
2847 v = PyCodec_Encode(unicode, encoding, errors);
2848 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002849 return NULL;
2850
2851 /* The normal path */
2852 if (PyBytes_Check(v))
2853 return v;
2854
2855 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002856 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002857 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002858 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002859
2860 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
2861 "encoder %s returned bytearray instead of bytes",
2862 encoding);
2863 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002864 Py_DECREF(v);
2865 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002866 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002867
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002868 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
2869 Py_DECREF(v);
2870 return b;
2871 }
2872
2873 PyErr_Format(PyExc_TypeError,
2874 "encoder did not return a bytes object (type=%.400s)",
2875 Py_TYPE(v)->tp_name);
2876 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002877 return NULL;
2878}
2879
Alexander Belopolsky40018472011-02-26 01:02:56 +00002880PyObject *
2881PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002882 const char *encoding,
2883 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002884{
2885 PyObject *v;
2886
2887 if (!PyUnicode_Check(unicode)) {
2888 PyErr_BadArgument();
2889 goto onError;
2890 }
2891
2892 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002893 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002894
2895 /* Encode via the codec registry */
2896 v = PyCodec_Encode(unicode, encoding, errors);
2897 if (v == NULL)
2898 goto onError;
2899 if (!PyUnicode_Check(v)) {
2900 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002901 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002902 Py_TYPE(v)->tp_name);
2903 Py_DECREF(v);
2904 goto onError;
2905 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002906 return v;
Tim Petersced69f82003-09-16 20:30:58 +00002907
Benjamin Peterson29060642009-01-31 22:14:21 +00002908 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002909 return NULL;
2910}
2911
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002912PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00002913PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002914 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00002915 return PyUnicode_DecodeFSDefaultAndSize(s, size);
2916}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002917
Christian Heimes5894ba72007-11-04 11:43:14 +00002918PyObject*
2919PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
2920{
Victor Stinner99b95382011-07-04 14:23:54 +02002921#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002922 return PyUnicode_DecodeMBCS(s, size, NULL);
2923#elif defined(__APPLE__)
2924 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
2925#else
Victor Stinner793b5312011-04-27 00:24:21 +02002926 PyInterpreterState *interp = PyThreadState_GET()->interp;
2927 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2928 cannot use it to encode and decode filenames before it is loaded. Load
2929 the Python codec requires to encode at least its own filename. Use the C
2930 version of the locale codec until the codec registry is initialized and
2931 the Python codec is loaded.
2932
2933 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2934 cannot only rely on it: check also interp->fscodec_initialized for
2935 subinterpreters. */
2936 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002937 return PyUnicode_Decode(s, size,
2938 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00002939 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002940 }
2941 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002942 /* locale encoding with surrogateescape */
2943 wchar_t *wchar;
2944 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00002945 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002946
2947 if (s[size] != '\0' || size != strlen(s)) {
2948 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
2949 return NULL;
2950 }
2951
Victor Stinner168e1172010-10-16 23:16:16 +00002952 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002953 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00002954 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002955
Victor Stinner168e1172010-10-16 23:16:16 +00002956 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002957 PyMem_Free(wchar);
2958 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002959 }
Victor Stinnerad158722010-10-27 00:25:46 +00002960#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002961}
2962
Martin v. Löwis011e8422009-05-05 04:43:17 +00002963
2964int
2965PyUnicode_FSConverter(PyObject* arg, void* addr)
2966{
2967 PyObject *output = NULL;
2968 Py_ssize_t size;
2969 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00002970 if (arg == NULL) {
2971 Py_DECREF(*(PyObject**)addr);
2972 return 1;
2973 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00002974 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00002975 output = arg;
2976 Py_INCREF(output);
2977 }
2978 else {
2979 arg = PyUnicode_FromObject(arg);
2980 if (!arg)
2981 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00002982 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00002983 Py_DECREF(arg);
2984 if (!output)
2985 return 0;
2986 if (!PyBytes_Check(output)) {
2987 Py_DECREF(output);
2988 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
2989 return 0;
2990 }
2991 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00002992 size = PyBytes_GET_SIZE(output);
2993 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00002994 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05002995 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00002996 Py_DECREF(output);
2997 return 0;
2998 }
2999 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003000 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003001}
3002
3003
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003004int
3005PyUnicode_FSDecoder(PyObject* arg, void* addr)
3006{
3007 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003008 if (arg == NULL) {
3009 Py_DECREF(*(PyObject**)addr);
3010 return 1;
3011 }
3012 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003013 if (PyUnicode_READY(arg))
3014 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003015 output = arg;
3016 Py_INCREF(output);
3017 }
3018 else {
3019 arg = PyBytes_FromObject(arg);
3020 if (!arg)
3021 return 0;
3022 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3023 PyBytes_GET_SIZE(arg));
3024 Py_DECREF(arg);
3025 if (!output)
3026 return 0;
3027 if (!PyUnicode_Check(output)) {
3028 Py_DECREF(output);
3029 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3030 return 0;
3031 }
3032 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003033 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
3034 PyUnicode_GET_LENGTH(output), 0, 1)) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003035 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3036 Py_DECREF(output);
3037 return 0;
3038 }
3039 *(PyObject**)addr = output;
3040 return Py_CLEANUP_SUPPORTED;
3041}
3042
3043
Martin v. Löwis5b222132007-06-10 09:51:05 +00003044char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003045PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003046{
Christian Heimesf3863112007-11-22 07:46:41 +00003047 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003048 PyUnicodeObject *u = (PyUnicodeObject *)unicode;
3049
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003050 if (!PyUnicode_Check(unicode)) {
3051 PyErr_BadArgument();
3052 return NULL;
3053 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003054 if (PyUnicode_READY(u) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003055 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003056
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003057 if (PyUnicode_UTF8(unicode) == NULL) {
3058 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003059 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3060 if (bytes == NULL)
3061 return NULL;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003062 _PyUnicode_UTF8(u) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3063 if (_PyUnicode_UTF8(u) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003064 Py_DECREF(bytes);
3065 return NULL;
3066 }
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003067 _PyUnicode_UTF8_LENGTH(u) = PyBytes_GET_SIZE(bytes);
3068 Py_MEMCPY(_PyUnicode_UTF8(u), PyBytes_AS_STRING(bytes), _PyUnicode_UTF8_LENGTH(u) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003069 Py_DECREF(bytes);
3070 }
3071
3072 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003073 *psize = PyUnicode_UTF8_LENGTH(unicode);
3074 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003075}
3076
3077char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003078PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003079{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003080 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3081}
3082
3083#ifdef Py_DEBUG
3084int unicode_as_unicode_calls = 0;
3085#endif
3086
3087
3088Py_UNICODE *
3089PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3090{
3091 PyUnicodeObject *u;
3092 const unsigned char *one_byte;
3093#if SIZEOF_WCHAR_T == 4
3094 const Py_UCS2 *two_bytes;
3095#else
3096 const Py_UCS4 *four_bytes;
3097 const Py_UCS4 *ucs4_end;
3098 Py_ssize_t num_surrogates;
3099#endif
3100 wchar_t *w;
3101 wchar_t *wchar_end;
3102
3103 if (!PyUnicode_Check(unicode)) {
3104 PyErr_BadArgument();
3105 return NULL;
3106 }
3107 u = (PyUnicodeObject*)unicode;
3108 if (_PyUnicode_WSTR(u) == NULL) {
3109 /* Non-ASCII compact unicode object */
3110 assert(_PyUnicode_KIND(u) != 0);
3111 assert(PyUnicode_IS_READY(u));
3112
3113#ifdef Py_DEBUG
3114 ++unicode_as_unicode_calls;
3115#endif
3116
3117 if (PyUnicode_KIND(u) == PyUnicode_4BYTE_KIND) {
3118#if SIZEOF_WCHAR_T == 2
3119 four_bytes = PyUnicode_4BYTE_DATA(u);
3120 ucs4_end = four_bytes + _PyUnicode_LENGTH(u);
3121 num_surrogates = 0;
3122
3123 for (; four_bytes < ucs4_end; ++four_bytes) {
3124 if (*four_bytes > 0xFFFF)
3125 ++num_surrogates;
3126 }
3127
3128 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(
3129 sizeof(wchar_t) * (_PyUnicode_LENGTH(u) + 1 + num_surrogates));
3130 if (!_PyUnicode_WSTR(u)) {
3131 PyErr_NoMemory();
3132 return NULL;
3133 }
3134 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u) + num_surrogates;
3135
3136 w = _PyUnicode_WSTR(u);
3137 wchar_end = w + _PyUnicode_WSTR_LENGTH(u);
3138 four_bytes = PyUnicode_4BYTE_DATA(u);
3139 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3140 if (*four_bytes > 0xFFFF) {
3141 /* encode surrogate pair in this case */
3142 *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10);
3143 *w = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF);
3144 }
3145 else
3146 *w = *four_bytes;
3147
3148 if (w > wchar_end) {
3149 assert(0 && "Miscalculated string end");
3150 }
3151 }
3152 *w = 0;
3153#else
3154 /* sizeof(wchar_t) == 4 */
3155 Py_FatalError("Impossible unicode object state, wstr and str "
3156 "should share memory already.");
3157 return NULL;
3158#endif
3159 }
3160 else {
3161 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3162 (_PyUnicode_LENGTH(u) + 1));
3163 if (!_PyUnicode_WSTR(u)) {
3164 PyErr_NoMemory();
3165 return NULL;
3166 }
3167 if (!PyUnicode_IS_COMPACT_ASCII(u))
3168 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u);
3169 w = _PyUnicode_WSTR(u);
3170 wchar_end = w + _PyUnicode_LENGTH(u);
3171
3172 if (PyUnicode_KIND(u) == PyUnicode_1BYTE_KIND) {
3173 one_byte = PyUnicode_1BYTE_DATA(u);
3174 for (; w < wchar_end; ++one_byte, ++w)
3175 *w = *one_byte;
3176 /* null-terminate the wstr */
3177 *w = 0;
3178 }
3179 else if (PyUnicode_KIND(u) == PyUnicode_2BYTE_KIND) {
3180#if SIZEOF_WCHAR_T == 4
3181 two_bytes = PyUnicode_2BYTE_DATA(u);
3182 for (; w < wchar_end; ++two_bytes, ++w)
3183 *w = *two_bytes;
3184 /* null-terminate the wstr */
3185 *w = 0;
3186#else
3187 /* sizeof(wchar_t) == 2 */
3188 PyObject_FREE(_PyUnicode_WSTR(u));
3189 _PyUnicode_WSTR(u) = NULL;
3190 Py_FatalError("Impossible unicode object state, wstr "
3191 "and str should share memory already.");
3192 return NULL;
3193#endif
3194 }
3195 else {
3196 assert(0 && "This should never happen.");
3197 }
3198 }
3199 }
3200 if (size != NULL)
3201 *size = PyUnicode_WSTR_LENGTH(u);
3202 return _PyUnicode_WSTR(u);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003203}
3204
Alexander Belopolsky40018472011-02-26 01:02:56 +00003205Py_UNICODE *
3206PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003207{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003208 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003209}
3210
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003211
Alexander Belopolsky40018472011-02-26 01:02:56 +00003212Py_ssize_t
3213PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003214{
3215 if (!PyUnicode_Check(unicode)) {
3216 PyErr_BadArgument();
3217 goto onError;
3218 }
3219 return PyUnicode_GET_SIZE(unicode);
3220
Benjamin Peterson29060642009-01-31 22:14:21 +00003221 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003222 return -1;
3223}
3224
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003225Py_ssize_t
3226PyUnicode_GetLength(PyObject *unicode)
3227{
Victor Stinner5a706cf2011-10-02 00:36:53 +02003228 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003229 PyErr_BadArgument();
3230 return -1;
3231 }
3232
3233 return PyUnicode_GET_LENGTH(unicode);
3234}
3235
3236Py_UCS4
3237PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3238{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003239 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3240 PyErr_BadArgument();
3241 return (Py_UCS4)-1;
3242 }
3243 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3244 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003245 return (Py_UCS4)-1;
3246 }
3247 return PyUnicode_READ_CHAR(unicode, index);
3248}
3249
3250int
3251PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3252{
3253 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003254 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003255 return -1;
3256 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02003257 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3258 PyErr_SetString(PyExc_IndexError, "string index out of range");
3259 return -1;
3260 }
3261 if (_PyUnicode_Dirty(unicode))
3262 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003263 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3264 index, ch);
3265 return 0;
3266}
3267
/* Return the name of the default encoding; it is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3273
Victor Stinner554f3f02010-06-16 23:33:54 +00003274/* create or adjust a UnicodeDecodeError */
3275static void
3276make_decode_exception(PyObject **exceptionObject,
3277 const char *encoding,
3278 const char *input, Py_ssize_t length,
3279 Py_ssize_t startpos, Py_ssize_t endpos,
3280 const char *reason)
3281{
3282 if (*exceptionObject == NULL) {
3283 *exceptionObject = PyUnicodeDecodeError_Create(
3284 encoding, input, length, startpos, endpos, reason);
3285 }
3286 else {
3287 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3288 goto onError;
3289 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3290 goto onError;
3291 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3292 goto onError;
3293 }
3294 return;
3295
3296onError:
3297 Py_DECREF(*exceptionObject);
3298 *exceptionObject = NULL;
3299}
3300
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003301/* error handling callback helper:
3302 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00003303 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003304 and adjust various state variables.
3305 return 0 on success, -1 on error
3306*/
3307
Alexander Belopolsky40018472011-02-26 01:02:56 +00003308static int
3309unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003310 const char *encoding, const char *reason,
3311 const char **input, const char **inend, Py_ssize_t *startinpos,
3312 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
3313 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003314{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003315 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003316
3317 PyObject *restuple = NULL;
3318 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003319 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003320 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003321 Py_ssize_t requiredsize;
3322 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003323 const Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003324 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003325 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003326 int res = -1;
3327
3328 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003329 *errorHandler = PyCodec_LookupError(errors);
3330 if (*errorHandler == NULL)
3331 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003332 }
3333
Victor Stinner554f3f02010-06-16 23:33:54 +00003334 make_decode_exception(exceptionObject,
3335 encoding,
3336 *input, *inend - *input,
3337 *startinpos, *endinpos,
3338 reason);
3339 if (*exceptionObject == NULL)
3340 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003341
3342 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3343 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003344 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003345 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00003346 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00003347 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003348 }
3349 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003350 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003351
3352 /* Copy back the bytes variables, which might have been modified by the
3353 callback */
3354 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3355 if (!inputobj)
3356 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00003357 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003358 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00003359 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003360 *input = PyBytes_AS_STRING(inputobj);
3361 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003362 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00003363 /* we can DECREF safely, as the exception has another reference,
3364 so the object won't go away. */
3365 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003366
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003367 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003368 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003369 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003370 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3371 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003372 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003373
3374 /* need more space? (at least enough for what we
3375 have+the replacement+the rest of the string (starting
3376 at the new input position), so we won't have to check space
3377 when there are no errors in the rest of the string) */
3378 repptr = PyUnicode_AS_UNICODE(repunicode);
3379 repsize = PyUnicode_GET_SIZE(repunicode);
3380 requiredsize = *outpos + repsize + insize-newpos;
3381 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003382 if (requiredsize<2*outsize)
3383 requiredsize = 2*outsize;
Victor Stinnerfe226c02011-10-03 03:52:20 +02003384 if (PyUnicode_Resize((PyObject**)output, requiredsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003385 goto onError;
3386 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003387 }
3388 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003389 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003390 Py_UNICODE_COPY(*outptr, repptr, repsize);
3391 *outptr += repsize;
3392 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003393
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003394 /* we made it! */
3395 res = 0;
3396
Benjamin Peterson29060642009-01-31 22:14:21 +00003397 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003398 Py_XDECREF(restuple);
3399 return res;
3400}
3401
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003402/* --- UTF-7 Codec -------------------------------------------------------- */
3403
Antoine Pitrou244651a2009-05-04 18:56:13 +00003404/* See RFC2152 for details. We encode conservatively and decode liberally. */
3405
3406/* Three simple macros defining base-64. */
3407
3408/* Is c a base-64 character? */
3409
#define IS_BASE64(c) \
    ((c) == '+' || (c) == '/' || \
     ((c) >= '0' && (c) <= '9') || \
     ((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z'))

/* Value (0..63) of the base-64 character c; c must satisfy IS_BASE64(). */

#define FROM_BASE64(c) \
    (((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 : \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 : \
     (c) == '+' ? 62 : 63)

/* Base-64 character for the low 6 bits of n. */

#define TO_BASE64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 63])

/* DECODE_DIRECT: a byte met outside a shift sequence that decodes as
 * itself.  Decoding is permissive: every ASCII byte qualifies except
 * '+', which opens a base-64 section. */

#define DECODE_DIRECT(c) \
    ((c) != '+' && (c) <= 127)
3436
3437/* The UTF-7 encoder treats ASCII characters differently according to
3438 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
3439 * the above). See RFC2152. This array identifies these different
3440 * sets:
3441 * 0 : "Set D"
3442 * alphanumeric and '(),-./:?
3443 * 1 : "Set O"
3444 * !"#$%&*;<=>@[]^_`{|}
3445 * 2 : "whitespace"
3446 * ht nl cr sp
3447 * 3 : special (must be base64 encoded)
3448 * everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
3449 */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003450
/* Read-only lookup table: RFC 2152 category (0..3) of each ASCII code. */
static const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c is tested against the range 1..127 before it is used as a table
 * index; directO and directWS are treated as boolean flags. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003482
Alexander Belopolsky40018472011-02-26 01:02:56 +00003483PyObject *
3484PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003485 Py_ssize_t size,
3486 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003487{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003488 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3489}
3490
Antoine Pitrou244651a2009-05-04 18:56:13 +00003491/* The decoder. The only state we preserve is our read position,
3492 * i.e. how many characters we have consumed. So if we end in the
3493 * middle of a shift sequence we have to back off the read position
3494 * and the output to the beginning of the sequence, otherwise we lose
3495 * all the shift state (seen bits, number of bits seen, high
3496 * surrogate). */
3497
Alexander Belopolsky40018472011-02-26 01:02:56 +00003498PyObject *
3499PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003500 Py_ssize_t size,
3501 const char *errors,
3502 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003503{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003504 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003505 Py_ssize_t startinpos;
3506 Py_ssize_t endinpos;
3507 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003508 const char *e;
3509 PyUnicodeObject *unicode;
3510 Py_UNICODE *p;
3511 const char *errmsg = "";
3512 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003513 Py_UNICODE *shiftOutStart;
3514 unsigned int base64bits = 0;
3515 unsigned long base64buffer = 0;
3516 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003517 PyObject *errorHandler = NULL;
3518 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003519
3520 unicode = _PyUnicode_New(size);
3521 if (!unicode)
3522 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003523 if (size == 0) {
3524 if (consumed)
3525 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003526 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003527 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003528
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003529 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitrou244651a2009-05-04 18:56:13 +00003530 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003531 e = s + size;
3532
3533 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003534 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00003535 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00003536 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003537
Antoine Pitrou244651a2009-05-04 18:56:13 +00003538 if (inShift) { /* in a base-64 section */
3539 if (IS_BASE64(ch)) { /* consume a base-64 character */
3540 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3541 base64bits += 6;
3542 s++;
3543 if (base64bits >= 16) {
3544 /* we have enough bits for a UTF-16 value */
3545 Py_UNICODE outCh = (Py_UNICODE)
3546 (base64buffer >> (base64bits-16));
3547 base64bits -= 16;
3548 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3549 if (surrogate) {
3550 /* expecting a second surrogate */
3551 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3552#ifdef Py_UNICODE_WIDE
3553 *p++ = (((surrogate & 0x3FF)<<10)
3554 | (outCh & 0x3FF)) + 0x10000;
3555#else
3556 *p++ = surrogate;
3557 *p++ = outCh;
3558#endif
3559 surrogate = 0;
3560 }
3561 else {
3562 surrogate = 0;
3563 errmsg = "second surrogate missing";
3564 goto utf7Error;
3565 }
3566 }
3567 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
3568 /* first surrogate */
3569 surrogate = outCh;
3570 }
3571 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3572 errmsg = "unexpected second surrogate";
3573 goto utf7Error;
3574 }
3575 else {
3576 *p++ = outCh;
3577 }
3578 }
3579 }
3580 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003581 inShift = 0;
3582 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003583 if (surrogate) {
3584 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00003585 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003586 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003587 if (base64bits > 0) { /* left-over bits */
3588 if (base64bits >= 6) {
3589 /* We've seen at least one base-64 character */
3590 errmsg = "partial character in shift sequence";
3591 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003592 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003593 else {
3594 /* Some bits remain; they should be zero */
3595 if (base64buffer != 0) {
3596 errmsg = "non-zero padding bits in shift sequence";
3597 goto utf7Error;
3598 }
3599 }
3600 }
3601 if (ch != '-') {
3602 /* '-' is absorbed; other terminating
3603 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003604 *p++ = ch;
3605 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003606 }
3607 }
3608 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003609 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003610 s++; /* consume '+' */
3611 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003612 s++;
3613 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00003614 }
3615 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003616 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003617 shiftOutStart = p;
3618 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003619 }
3620 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003621 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003622 *p++ = ch;
3623 s++;
3624 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003625 else {
3626 startinpos = s-starts;
3627 s++;
3628 errmsg = "unexpected special character";
3629 goto utf7Error;
3630 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003631 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003632utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003633 outpos = p-PyUnicode_AS_UNICODE(unicode);
3634 endinpos = s-starts;
3635 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003636 errors, &errorHandler,
3637 "utf7", errmsg,
3638 &starts, &e, &startinpos, &endinpos, &exc, &s,
3639 &unicode, &outpos, &p))
3640 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003641 }
3642
Antoine Pitrou244651a2009-05-04 18:56:13 +00003643 /* end of string */
3644
3645 if (inShift && !consumed) { /* in shift sequence, no more to follow */
3646 /* if we're in an inconsistent state, that's an error */
3647 if (surrogate ||
3648 (base64bits >= 6) ||
3649 (base64bits > 0 && base64buffer != 0)) {
3650 outpos = p-PyUnicode_AS_UNICODE(unicode);
3651 endinpos = size;
3652 if (unicode_decode_call_errorhandler(
3653 errors, &errorHandler,
3654 "utf7", "unterminated shift sequence",
3655 &starts, &e, &startinpos, &endinpos, &exc, &s,
3656 &unicode, &outpos, &p))
3657 goto onError;
3658 if (s < e)
3659 goto restart;
3660 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003661 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003662
3663 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003664 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003665 if (inShift) {
3666 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003667 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003668 }
3669 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003670 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003671 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003672 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003673
Victor Stinnerfe226c02011-10-03 03:52:20 +02003674 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003675 goto onError;
3676
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003677 Py_XDECREF(errorHandler);
3678 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02003679#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02003680 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003681 Py_DECREF(unicode);
3682 return NULL;
3683 }
Victor Stinner17efeed2011-10-04 20:05:46 +02003684#endif
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003685 return (PyObject *)unicode;
3686
Benjamin Peterson29060642009-01-31 22:14:21 +00003687 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003688 Py_XDECREF(errorHandler);
3689 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003690 Py_DECREF(unicode);
3691 return NULL;
3692}
3693
3694
Alexander Belopolsky40018472011-02-26 01:02:56 +00003695PyObject *
3696PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003697 Py_ssize_t size,
3698 int base64SetO,
3699 int base64WhiteSpace,
3700 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003701{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003702 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003703 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003704 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003705 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003706 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003707 unsigned int base64bits = 0;
3708 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003709 char * out;
3710 char * start;
3711
3712 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003713 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003714
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003715 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003716 return PyErr_NoMemory();
3717
Antoine Pitrou244651a2009-05-04 18:56:13 +00003718 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003719 if (v == NULL)
3720 return NULL;
3721
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003722 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003723 for (;i < size; ++i) {
3724 Py_UNICODE ch = s[i];
3725
Antoine Pitrou244651a2009-05-04 18:56:13 +00003726 if (inShift) {
3727 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3728 /* shifting out */
3729 if (base64bits) { /* output remaining bits */
3730 *out++ = TO_BASE64(base64buffer << (6-base64bits));
3731 base64buffer = 0;
3732 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003733 }
3734 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003735 /* Characters not in the BASE64 set implicitly unshift the sequence
3736 so no '-' is required, except if the character is itself a '-' */
3737 if (IS_BASE64(ch) || ch == '-') {
3738 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003739 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003740 *out++ = (char) ch;
3741 }
3742 else {
3743 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00003744 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003745 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003746 else { /* not in a shift sequence */
3747 if (ch == '+') {
3748 *out++ = '+';
3749 *out++ = '-';
3750 }
3751 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3752 *out++ = (char) ch;
3753 }
3754 else {
3755 *out++ = '+';
3756 inShift = 1;
3757 goto encode_char;
3758 }
3759 }
3760 continue;
3761encode_char:
3762#ifdef Py_UNICODE_WIDE
3763 if (ch >= 0x10000) {
3764 /* code first surrogate */
3765 base64bits += 16;
3766 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
3767 while (base64bits >= 6) {
3768 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3769 base64bits -= 6;
3770 }
3771 /* prepare second surrogate */
3772 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
3773 }
3774#endif
3775 base64bits += 16;
3776 base64buffer = (base64buffer << 16) | ch;
3777 while (base64bits >= 6) {
3778 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3779 base64bits -= 6;
3780 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00003781 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003782 if (base64bits)
3783 *out++= TO_BASE64(base64buffer << (6-base64bits) );
3784 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003785 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003786 if (_PyBytes_Resize(&v, out - start) < 0)
3787 return NULL;
3788 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003789}
3790
Antoine Pitrou244651a2009-05-04 18:56:13 +00003791#undef IS_BASE64
3792#undef FROM_BASE64
3793#undef TO_BASE64
3794#undef DECODE_DIRECT
3795#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003796
Guido van Rossumd57fd912000-03-10 22:53:23 +00003797/* --- UTF-8 Codec -------------------------------------------------------- */
3798
/* Lookup table indexed by the first byte of a UTF-8 sequence; the value is
   the total length in bytes of the sequence that byte introduces.  A value
   of zero marks a byte that cannot start a sequence.  See RFC 3629. */
static char utf8_code_length[256] = {
    /* 00-7F: single-byte (ASCII) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10-1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 20-2F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 30-3F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 40-4F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 50-5F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 60-6F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    /* 80-BF: not valid as a first byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 90-9F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* A0-AF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    /* C0-C1 are illegal; C2-DF start 2-byte sequences */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    /* E0-EF start 3-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    /* F0-F4 start 4-byte sequences; F5-FF are illegal */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-FF */
};
3820
Alexander Belopolsky40018472011-02-26 01:02:56 +00003821PyObject *
3822PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003823 Py_ssize_t size,
3824 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003825{
Walter Dörwald69652032004-09-07 20:24:22 +00003826 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3827}
3828
Antoine Pitrouab868312009-01-10 15:40:25 +00003829/* Mask to check or force alignment of a pointer to C 'long' boundaries */
3830#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
3831
3832/* Mask to quickly check whether a C 'long' contains a
3833 non-ASCII, UTF8-encoded char. */
3834#if (SIZEOF_LONG == 8)
3835# define ASCII_CHAR_MASK 0x8080808080808080L
3836#elif (SIZEOF_LONG == 4)
3837# define ASCII_CHAR_MASK 0x80808080L
3838#else
3839# error C 'long' size should be either 4 or 8!
3840#endif
3841
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003842/* Scans a UTF-8 string and returns the maximum character to be expected,
3843 the size of the decoded unicode string and if any major errors were
3844 encountered.
3845
3846 This function does check basic UTF-8 sanity, it does however NOT CHECK
3847 if the string contains surrogates, and if all continuation bytes are
3848 within the correct ranges, these checks are performed in
3849 PyUnicode_DecodeUTF8Stateful.
3850
3851 If it sets has_errors to 1, it means the value of unicode_size and max_char
3852 will be bogus and you should not rely on useful information in them.
3853 */
3854static Py_UCS4
3855utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
3856 Py_ssize_t *unicode_size, Py_ssize_t* consumed,
3857 int *has_errors)
3858{
3859 Py_ssize_t n;
3860 Py_ssize_t char_count = 0;
3861 Py_UCS4 max_char = 127, new_max;
3862 Py_UCS4 upper_bound;
3863 const unsigned char *p = (const unsigned char *)s;
3864 const unsigned char *end = p + string_size;
3865 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
3866 int err = 0;
3867
3868 for (; p < end && !err; ++p, ++char_count) {
3869 /* Only check value if it's not a ASCII char... */
3870 if (*p < 0x80) {
3871 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
3872 an explanation. */
3873 if (!((size_t) p & LONG_PTR_MASK)) {
3874 /* Help register allocation */
3875 register const unsigned char *_p = p;
3876 while (_p < aligned_end) {
3877 unsigned long value = *(unsigned long *) _p;
3878 if (value & ASCII_CHAR_MASK)
3879 break;
3880 _p += SIZEOF_LONG;
3881 char_count += SIZEOF_LONG;
3882 }
3883 p = _p;
3884 if (p == end)
3885 break;
3886 }
3887 }
3888 if (*p >= 0x80) {
3889 n = utf8_code_length[*p];
3890 new_max = max_char;
3891 switch (n) {
3892 /* invalid start byte */
3893 case 0:
3894 err = 1;
3895 break;
3896 case 2:
3897 /* Code points between 0x00FF and 0x07FF inclusive.
3898 Approximate the upper bound of the code point,
3899 if this flips over 255 we can be sure it will be more
3900 than 255 and the string will need 2 bytes per code coint,
3901 if it stays under or equal to 255, we can be sure 1 byte
3902 is enough.
3903 ((*p & 0b00011111) << 6) | 0b00111111 */
3904 upper_bound = ((*p & 0x1F) << 6) | 0x3F;
3905 if (max_char < upper_bound)
3906 new_max = upper_bound;
3907 /* Ensure we track at least that we left ASCII space. */
3908 if (new_max < 128)
3909 new_max = 128;
3910 break;
3911 case 3:
3912 /* Between 0x0FFF and 0xFFFF inclusive, so values are
3913 always > 255 and <= 65535 and will always need 2 bytes. */
3914 if (max_char < 65535)
3915 new_max = 65535;
3916 break;
3917 case 4:
3918 /* Code point will be above 0xFFFF for sure in this case. */
3919 new_max = 65537;
3920 break;
3921 /* Internal error, this should be caught by the first if */
3922 case 1:
3923 default:
3924 assert(0 && "Impossible case in utf8_max_char_and_size");
3925 err = 1;
3926 }
3927 /* Instead of number of overall bytes for this code point,
3928 n containts the number of following bytes: */
3929 --n;
3930 /* Check if the follow up chars are all valid continuation bytes */
3931 if (n >= 1) {
3932 const unsigned char *cont;
3933 if ((p + n) >= end) {
3934 if (consumed == 0)
3935 /* incomplete data, non-incremental decoding */
3936 err = 1;
3937 break;
3938 }
3939 for (cont = p + 1; cont < (p + n); ++cont) {
3940 if ((*cont & 0xc0) != 0x80) {
3941 err = 1;
3942 break;
3943 }
3944 }
3945 p += n;
3946 }
3947 else
3948 err = 1;
3949 max_char = new_max;
3950 }
3951 }
3952
3953 if (unicode_size)
3954 *unicode_size = char_count;
3955 if (has_errors)
3956 *has_errors = err;
3957 return max_char;
3958}
3959
/* Store `value` at position `index` of `buf`, choosing the element type
   from `kind`.  Unlike PyUnicode_WRITE this also accepts
   PyUnicode_WCHAR_KIND, in which case `buf` is treated as a Py_UNICODE
   array (the wstr field of the legacy representation).  Any kind other
   than WCHAR, 1BYTE or 2BYTE is stored as Py_UCS4.  Each argument is
   evaluated exactly once. */
#define WRITE_FLEXIBLE_OR_WSTR(kind, buf, index, value)                  \
    do {                                                                 \
        switch ((kind)) {                                                \
        case PyUnicode_WCHAR_KIND:                                       \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value);        \
            break;                                                       \
        case PyUnicode_1BYTE_KIND:                                       \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value);  \
            break;                                                       \
        case PyUnicode_2BYTE_KIND:                                       \
            ((Py_UCS2 *)(buf))[(index)] = (Py_UCS2)(value);              \
            break;                                                       \
        default:                                                         \
            ((Py_UCS4 *)(buf))[(index)] = (Py_UCS4)(value);              \
            break;                                                       \
        }                                                                \
    } while (0)
3974
Alexander Belopolsky40018472011-02-26 01:02:56 +00003975PyObject *
3976PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003977 Py_ssize_t size,
3978 const char *errors,
3979 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00003980{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003981 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003982 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00003983 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003984 Py_ssize_t startinpos;
3985 Py_ssize_t endinpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00003986 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003987 PyUnicodeObject *unicode;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003988 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003989 PyObject *errorHandler = NULL;
3990 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003991 Py_UCS4 maxchar = 0;
3992 Py_ssize_t unicode_size;
3993 Py_ssize_t i;
3994 int kind;
3995 void *data;
3996 int has_errors;
3997 Py_UNICODE *error_outptr;
3998#if SIZEOF_WCHAR_T == 2
3999 Py_ssize_t wchar_offset = 0;
4000#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00004001
Walter Dörwald69652032004-09-07 20:24:22 +00004002 if (size == 0) {
4003 if (consumed)
4004 *consumed = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004005 return (PyObject *)PyUnicode_New(0, 0);
Walter Dörwald69652032004-09-07 20:24:22 +00004006 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004007 maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size,
4008 consumed, &has_errors);
4009 if (has_errors) {
4010 unicode = _PyUnicode_New(size);
4011 if (!unicode)
4012 return NULL;
4013 kind = PyUnicode_WCHAR_KIND;
4014 data = PyUnicode_AS_UNICODE(unicode);
4015 assert(data != NULL);
4016 }
4017 else {
4018 unicode = (PyUnicodeObject *)PyUnicode_New(unicode_size, maxchar);
4019 if (!unicode)
4020 return NULL;
4021 /* When the string is ASCII only, just use memcpy and return.
4022 unicode_size may be != size if there is an incomplete UTF-8
4023 sequence at the end of the ASCII block. */
4024 if (maxchar < 128 && size == unicode_size) {
4025 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
4026 return (PyObject *)unicode;
4027 }
4028 kind = PyUnicode_KIND(unicode);
4029 data = PyUnicode_DATA(unicode);
4030 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004031 /* Unpack UTF-8 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004032 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004033 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00004034 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004035
4036 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004037 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004038
4039 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00004040 /* Fast path for runs of ASCII characters. Given that common UTF-8
4041 input will consist of an overwhelming majority of ASCII
4042 characters, we try to optimize for this case by checking
4043 as many characters as a C 'long' can contain.
4044 First, check if we can do an aligned read, as most CPUs have
4045 a penalty for unaligned reads.
4046 */
4047 if (!((size_t) s & LONG_PTR_MASK)) {
4048 /* Help register allocation */
4049 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004050 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004051 while (_s < aligned_end) {
4052 /* Read a whole long at a time (either 4 or 8 bytes),
4053 and do a fast unrolled copy if it only contains ASCII
4054 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004055 unsigned long value = *(unsigned long *) _s;
4056 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00004057 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004058 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+0, _s[0]);
4059 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+1, _s[1]);
4060 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+2, _s[2]);
4061 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004062#if (SIZEOF_LONG == 8)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004063 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+4, _s[4]);
4064 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+5, _s[5]);
4065 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+6, _s[6]);
4066 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004067#endif
4068 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004069 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00004070 }
4071 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004072 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004073 if (s == e)
4074 break;
4075 ch = (unsigned char)*s;
4076 }
4077 }
4078
4079 if (ch < 0x80) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004080 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004081 s++;
4082 continue;
4083 }
4084
4085 n = utf8_code_length[ch];
4086
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004087 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004088 if (consumed)
4089 break;
4090 else {
4091 errmsg = "unexpected end of data";
4092 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004093 endinpos = startinpos+1;
4094 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
4095 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004096 goto utf8Error;
4097 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004098 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004099
4100 switch (n) {
4101
4102 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00004103 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004104 startinpos = s-starts;
4105 endinpos = startinpos+1;
4106 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004107
4108 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004109 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00004110 startinpos = s-starts;
4111 endinpos = startinpos+1;
4112 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004113
4114 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004115 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00004116 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004117 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004118 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00004119 goto utf8Error;
4120 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004121 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004122 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004123 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004124 break;
4125
4126 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00004127 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4128 will result in surrogates in range d800-dfff. Surrogates are
4129 not valid UTF-8 so they are rejected.
4130 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4131 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00004132 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004133 (s[2] & 0xc0) != 0x80 ||
4134 ((unsigned char)s[0] == 0xE0 &&
4135 (unsigned char)s[1] < 0xA0) ||
4136 ((unsigned char)s[0] == 0xED &&
4137 (unsigned char)s[1] > 0x9F)) {
4138 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004139 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004140 endinpos = startinpos + 1;
4141
4142 /* if s[1] first two bits are 1 and 0, then the invalid
4143 continuation byte is s[2], so increment endinpos by 1,
4144 if not, s[1] is invalid and endinpos doesn't need to
4145 be incremented. */
4146 if ((s[1] & 0xC0) == 0x80)
4147 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004148 goto utf8Error;
4149 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004150 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004151 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004152 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004153 break;
4154
4155 case 4:
4156 if ((s[1] & 0xc0) != 0x80 ||
4157 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004158 (s[3] & 0xc0) != 0x80 ||
4159 ((unsigned char)s[0] == 0xF0 &&
4160 (unsigned char)s[1] < 0x90) ||
4161 ((unsigned char)s[0] == 0xF4 &&
4162 (unsigned char)s[1] > 0x8F)) {
4163 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004164 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004165 endinpos = startinpos + 1;
4166 if ((s[1] & 0xC0) == 0x80) {
4167 endinpos++;
4168 if ((s[2] & 0xC0) == 0x80)
4169 endinpos++;
4170 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004171 goto utf8Error;
4172 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004173 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00004174 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4175 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4176
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004177 /* If the string is flexible or we have native UCS-4, write
4178 directly.. */
4179 if (sizeof(Py_UNICODE) > 2 || kind != PyUnicode_WCHAR_KIND)
4180 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Tim Petersced69f82003-09-16 20:30:58 +00004181
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004182 else {
4183 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00004184
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004185 /* translate from 10000..10FFFF to 0..FFFF */
4186 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00004187
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004188 /* high surrogate = top 10 bits added to D800 */
4189 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4190 (Py_UNICODE)(0xD800 + (ch >> 10)));
4191
4192 /* low surrogate = bottom 10 bits added to DC00 */
4193 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4194 (Py_UNICODE)(0xDC00 + (ch & 0x03FF)));
4195 }
4196#if SIZEOF_WCHAR_T == 2
4197 wchar_offset++;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004198#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00004199 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004200 }
4201 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00004202 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004203
Benjamin Peterson29060642009-01-31 22:14:21 +00004204 utf8Error:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004205 /* If this is not yet a resizable string, make it one.. */
4206 if (kind != PyUnicode_WCHAR_KIND) {
4207 const Py_UNICODE *u;
4208 PyUnicodeObject *new_unicode = _PyUnicode_New(size);
4209 if (!new_unicode)
4210 goto onError;
4211 u = PyUnicode_AsUnicode((PyObject *)unicode);
4212 if (!u)
4213 goto onError;
4214#if SIZEOF_WCHAR_T == 2
4215 i += wchar_offset;
4216#endif
4217 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(new_unicode), u, i);
4218 Py_DECREF(unicode);
4219 unicode = new_unicode;
4220 kind = 0;
4221 data = PyUnicode_AS_UNICODE(new_unicode);
4222 assert(data != NULL);
4223 }
4224 error_outptr = PyUnicode_AS_UNICODE(unicode) + i;
Benjamin Peterson29060642009-01-31 22:14:21 +00004225 if (unicode_decode_call_errorhandler(
4226 errors, &errorHandler,
4227 "utf8", errmsg,
4228 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004229 &unicode, &i, &error_outptr))
Benjamin Peterson29060642009-01-31 22:14:21 +00004230 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004231 /* Update data because unicode_decode_call_errorhandler might have
4232 re-created or resized the unicode object. */
4233 data = PyUnicode_AS_UNICODE(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00004234 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004235 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004236 /* Ensure the unicode_size calculation above was correct: */
4237 assert(kind == PyUnicode_WCHAR_KIND || i == unicode_size);
4238
Walter Dörwald69652032004-09-07 20:24:22 +00004239 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004240 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004241
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004242 /* Adjust length and ready string when it contained errors and
4243 is of the old resizable kind. */
4244 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004245 if (PyUnicode_Resize((PyObject**)&unicode, i) < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004246 goto onError;
4247 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004248
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004249 Py_XDECREF(errorHandler);
4250 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02004251#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004252 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004253 Py_DECREF(unicode);
4254 return NULL;
4255 }
Victor Stinner17efeed2011-10-04 20:05:46 +02004256#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00004257 return (PyObject *)unicode;
4258
Benjamin Peterson29060642009-01-31 22:14:21 +00004259 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004260 Py_XDECREF(errorHandler);
4261 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004262 Py_DECREF(unicode);
4263 return NULL;
4264}
4265
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004266#undef WRITE_FLEXIBLE_OR_WSTR
Antoine Pitrouab868312009-01-10 15:40:25 +00004267
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004268#ifdef __APPLE__
4269
4270/* Simplified UTF-8 decoder using surrogateescape error handler,
4271 used to decode the command line arguments on Mac OS X. */
4272
4273wchar_t*
4274_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
4275{
4276 int n;
4277 const char *e;
4278 wchar_t *unicode, *p;
4279
4280 /* Note: size will always be longer than the resulting Unicode
4281 character count */
4282 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
4283 PyErr_NoMemory();
4284 return NULL;
4285 }
4286 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
4287 if (!unicode)
4288 return NULL;
4289
4290 /* Unpack UTF-8 encoded data */
4291 p = unicode;
4292 e = s + size;
4293 while (s < e) {
4294 Py_UCS4 ch = (unsigned char)*s;
4295
4296 if (ch < 0x80) {
4297 *p++ = (wchar_t)ch;
4298 s++;
4299 continue;
4300 }
4301
4302 n = utf8_code_length[ch];
4303 if (s + n > e) {
4304 goto surrogateescape;
4305 }
4306
4307 switch (n) {
4308 case 0:
4309 case 1:
4310 goto surrogateescape;
4311
4312 case 2:
4313 if ((s[1] & 0xc0) != 0x80)
4314 goto surrogateescape;
4315 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
4316 assert ((ch > 0x007F) && (ch <= 0x07FF));
4317 *p++ = (wchar_t)ch;
4318 break;
4319
4320 case 3:
4321 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4322 will result in surrogates in range d800-dfff. Surrogates are
4323 not valid UTF-8 so they are rejected.
4324 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4325 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
4326 if ((s[1] & 0xc0) != 0x80 ||
4327 (s[2] & 0xc0) != 0x80 ||
4328 ((unsigned char)s[0] == 0xE0 &&
4329 (unsigned char)s[1] < 0xA0) ||
4330 ((unsigned char)s[0] == 0xED &&
4331 (unsigned char)s[1] > 0x9F)) {
4332
4333 goto surrogateescape;
4334 }
4335 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
4336 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004337 *p++ = (wchar_t)ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004338 break;
4339
4340 case 4:
4341 if ((s[1] & 0xc0) != 0x80 ||
4342 (s[2] & 0xc0) != 0x80 ||
4343 (s[3] & 0xc0) != 0x80 ||
4344 ((unsigned char)s[0] == 0xF0 &&
4345 (unsigned char)s[1] < 0x90) ||
4346 ((unsigned char)s[0] == 0xF4 &&
4347 (unsigned char)s[1] > 0x8F)) {
4348 goto surrogateescape;
4349 }
4350 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
4351 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4352 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4353
4354#if SIZEOF_WCHAR_T == 4
4355 *p++ = (wchar_t)ch;
4356#else
4357 /* compute and append the two surrogates: */
4358
4359 /* translate from 10000..10FFFF to 0..FFFF */
4360 ch -= 0x10000;
4361
4362 /* high surrogate = top 10 bits added to D800 */
4363 *p++ = (wchar_t)(0xD800 + (ch >> 10));
4364
4365 /* low surrogate = bottom 10 bits added to DC00 */
4366 *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
4367#endif
4368 break;
4369 }
4370 s += n;
4371 continue;
4372
4373 surrogateescape:
4374 *p++ = 0xDC00 + ch;
4375 s++;
4376 }
4377 *p = L'\0';
4378 return unicode;
4379}
4380
4381#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004382
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004383/* Primary internal function which creates utf8 encoded bytes objects.
4384
4385 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004386 and allocate exactly as much space needed at the end. Else allocate the
4387 maximum possible needed (4 result bytes per Unicode character), and return
4388 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004389*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004390PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004391_PyUnicode_AsUTF8String(PyObject *obj, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004392{
Tim Peters602f7402002-04-27 18:03:26 +00004393#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00004394
Guido van Rossum98297ee2007-11-06 21:34:58 +00004395 Py_ssize_t i; /* index into s of next input byte */
4396 PyObject *result; /* result string object */
4397 char *p; /* next free byte in output buffer */
4398 Py_ssize_t nallocated; /* number of result bytes allocated */
4399 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00004400 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004401 PyObject *errorHandler = NULL;
4402 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004403 int kind;
4404 void *data;
4405 Py_ssize_t size;
4406 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
4407#if SIZEOF_WCHAR_T == 2
4408 Py_ssize_t wchar_offset = 0;
4409#endif
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004410
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004411 if (!PyUnicode_Check(unicode)) {
4412 PyErr_BadArgument();
4413 return NULL;
4414 }
4415
4416 if (PyUnicode_READY(unicode) == -1)
4417 return NULL;
4418
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004419 if (PyUnicode_UTF8(unicode))
4420 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4421 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004422
4423 kind = PyUnicode_KIND(unicode);
4424 data = PyUnicode_DATA(unicode);
4425 size = PyUnicode_GET_LENGTH(unicode);
4426
Tim Peters602f7402002-04-27 18:03:26 +00004427 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004428
Tim Peters602f7402002-04-27 18:03:26 +00004429 if (size <= MAX_SHORT_UNICHARS) {
4430 /* Write into the stack buffer; nallocated can't overflow.
4431 * At the end, we'll allocate exactly as much heap space as it
4432 * turns out we need.
4433 */
4434 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004435 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00004436 p = stackbuf;
4437 }
4438 else {
4439 /* Overallocate on the heap, and give the excess back at the end. */
4440 nallocated = size * 4;
4441 if (nallocated / 4 != size) /* overflow! */
4442 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00004443 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004444 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00004445 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00004446 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004447 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004448
Tim Peters602f7402002-04-27 18:03:26 +00004449 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004450 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004451
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004452 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00004453 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004454 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004455
Guido van Rossumd57fd912000-03-10 22:53:23 +00004456 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00004457 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00004458 *p++ = (char)(0xc0 | (ch >> 6));
4459 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00004460 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004461 Py_ssize_t newpos;
4462 PyObject *rep;
4463 Py_ssize_t repsize, k, startpos;
4464 startpos = i-1;
4465#if SIZEOF_WCHAR_T == 2
4466 startpos += wchar_offset;
Victor Stinner445a6232010-04-22 20:01:57 +00004467#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004468 rep = unicode_encode_call_errorhandler(
4469 errors, &errorHandler, "utf-8", "surrogates not allowed",
4470 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
4471 &exc, startpos, startpos+1, &newpos);
4472 if (!rep)
4473 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004474
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004475 if (PyBytes_Check(rep))
4476 repsize = PyBytes_GET_SIZE(rep);
4477 else
4478 repsize = PyUnicode_GET_SIZE(rep);
4479
4480 if (repsize > 4) {
4481 Py_ssize_t offset;
4482
4483 if (result == NULL)
4484 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004485 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004486 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004487
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004488 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4489 /* integer overflow */
4490 PyErr_NoMemory();
4491 goto error;
4492 }
4493 nallocated += repsize - 4;
4494 if (result != NULL) {
4495 if (_PyBytes_Resize(&result, nallocated) < 0)
4496 goto error;
4497 } else {
4498 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004499 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004500 goto error;
4501 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4502 }
4503 p = PyBytes_AS_STRING(result) + offset;
4504 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004505
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004506 if (PyBytes_Check(rep)) {
4507 char *prep = PyBytes_AS_STRING(rep);
4508 for(k = repsize; k > 0; k--)
4509 *p++ = *prep++;
4510 } else /* rep is unicode */ {
4511 const Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
4512 Py_UNICODE c;
4513
4514 for(k=0; k<repsize; k++) {
4515 c = prep[k];
4516 if (0x80 <= c) {
4517 raise_encode_exception(&exc, "utf-8",
4518 PyUnicode_AS_UNICODE(unicode),
4519 size, i-1, i,
4520 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004521 goto error;
4522 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004523 *p++ = (char)prep[k];
Victor Stinner31be90b2010-04-22 19:38:16 +00004524 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004525 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004526 Py_DECREF(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004527 } else if (ch < 0x10000) {
4528 *p++ = (char)(0xe0 | (ch >> 12));
4529 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4530 *p++ = (char)(0x80 | (ch & 0x3f));
4531 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00004532 /* Encode UCS4 Unicode ordinals */
4533 *p++ = (char)(0xf0 | (ch >> 18));
4534 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4535 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4536 *p++ = (char)(0x80 | (ch & 0x3f));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004537#if SIZEOF_WCHAR_T == 2
4538 wchar_offset++;
4539#endif
Tim Peters602f7402002-04-27 18:03:26 +00004540 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004541 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004542
Guido van Rossum98297ee2007-11-06 21:34:58 +00004543 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004544 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004545 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004546 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004547 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004548 }
4549 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004550 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004551 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004552 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004553 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004554 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004555
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004556 Py_XDECREF(errorHandler);
4557 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004558 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004559 error:
4560 Py_XDECREF(errorHandler);
4561 Py_XDECREF(exc);
4562 Py_XDECREF(result);
4563 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004564
Tim Peters602f7402002-04-27 18:03:26 +00004565#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004566}
4567
Alexander Belopolsky40018472011-02-26 01:02:56 +00004568PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004569PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4570 Py_ssize_t size,
4571 const char *errors)
4572{
4573 PyObject *v, *unicode;
4574
4575 unicode = PyUnicode_FromUnicode(s, size);
4576 if (unicode == NULL)
4577 return NULL;
4578 v = _PyUnicode_AsUTF8String(unicode, errors);
4579 Py_DECREF(unicode);
4580 return v;
4581}
4582
4583PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004584PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004585{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004586 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004587}
4588
Walter Dörwald41980ca2007-08-16 21:55:45 +00004589/* --- UTF-32 Codec ------------------------------------------------------- */
4590
4591PyObject *
4592PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004593 Py_ssize_t size,
4594 const char *errors,
4595 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004596{
4597 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4598}
4599
4600PyObject *
4601PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004602 Py_ssize_t size,
4603 const char *errors,
4604 int *byteorder,
4605 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004606{
4607 const char *starts = s;
4608 Py_ssize_t startinpos;
4609 Py_ssize_t endinpos;
4610 Py_ssize_t outpos;
4611 PyUnicodeObject *unicode;
4612 Py_UNICODE *p;
4613#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004614 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004615 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004616#else
4617 const int pairs = 0;
4618#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00004619 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004620 int bo = 0; /* assume native ordering by default */
4621 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004622 /* Offsets from q for retrieving bytes in the right order. */
4623#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4624 int iorder[] = {0, 1, 2, 3};
4625#else
4626 int iorder[] = {3, 2, 1, 0};
4627#endif
4628 PyObject *errorHandler = NULL;
4629 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004630
Walter Dörwald41980ca2007-08-16 21:55:45 +00004631 q = (unsigned char *)s;
4632 e = q + size;
4633
4634 if (byteorder)
4635 bo = *byteorder;
4636
4637 /* Check for BOM marks (U+FEFF) in the input and adjust current
4638 byte order setting accordingly. In native mode, the leading BOM
4639 mark is skipped, in all other modes, it is copied to the output
4640 stream as-is (giving a ZWNBSP character). */
4641 if (bo == 0) {
4642 if (size >= 4) {
4643 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004644 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004645#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004646 if (bom == 0x0000FEFF) {
4647 q += 4;
4648 bo = -1;
4649 }
4650 else if (bom == 0xFFFE0000) {
4651 q += 4;
4652 bo = 1;
4653 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004654#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004655 if (bom == 0x0000FEFF) {
4656 q += 4;
4657 bo = 1;
4658 }
4659 else if (bom == 0xFFFE0000) {
4660 q += 4;
4661 bo = -1;
4662 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004663#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004664 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004665 }
4666
4667 if (bo == -1) {
4668 /* force LE */
4669 iorder[0] = 0;
4670 iorder[1] = 1;
4671 iorder[2] = 2;
4672 iorder[3] = 3;
4673 }
4674 else if (bo == 1) {
4675 /* force BE */
4676 iorder[0] = 3;
4677 iorder[1] = 2;
4678 iorder[2] = 1;
4679 iorder[3] = 0;
4680 }
4681
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004682 /* On narrow builds we split characters outside the BMP into two
4683 codepoints => count how much extra space we need. */
4684#ifndef Py_UNICODE_WIDE
4685 for (qq = q; qq < e; qq += 4)
4686 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
4687 pairs++;
4688#endif
4689
4690 /* This might be one to much, because of a BOM */
4691 unicode = _PyUnicode_New((size+3)/4+pairs);
4692 if (!unicode)
4693 return NULL;
4694 if (size == 0)
4695 return (PyObject *)unicode;
4696
4697 /* Unpack UTF-32 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004698 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004699
Walter Dörwald41980ca2007-08-16 21:55:45 +00004700 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004701 Py_UCS4 ch;
4702 /* remaining bytes at the end? (size should be divisible by 4) */
4703 if (e-q<4) {
4704 if (consumed)
4705 break;
4706 errmsg = "truncated data";
4707 startinpos = ((const char *)q)-starts;
4708 endinpos = ((const char *)e)-starts;
4709 goto utf32Error;
4710 /* The remaining input chars are ignored if the callback
4711 chooses to skip the input */
4712 }
4713 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4714 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004715
Benjamin Peterson29060642009-01-31 22:14:21 +00004716 if (ch >= 0x110000)
4717 {
4718 errmsg = "codepoint not in range(0x110000)";
4719 startinpos = ((const char *)q)-starts;
4720 endinpos = startinpos+4;
4721 goto utf32Error;
4722 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004723#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004724 if (ch >= 0x10000)
4725 {
4726 *p++ = 0xD800 | ((ch-0x10000) >> 10);
4727 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
4728 }
4729 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00004730#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004731 *p++ = ch;
4732 q += 4;
4733 continue;
4734 utf32Error:
4735 outpos = p-PyUnicode_AS_UNICODE(unicode);
4736 if (unicode_decode_call_errorhandler(
4737 errors, &errorHandler,
4738 "utf32", errmsg,
4739 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
4740 &unicode, &outpos, &p))
4741 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004742 }
4743
4744 if (byteorder)
4745 *byteorder = bo;
4746
4747 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004748 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004749
4750 /* Adjust length */
Victor Stinnerfe226c02011-10-03 03:52:20 +02004751 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004752 goto onError;
4753
4754 Py_XDECREF(errorHandler);
4755 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02004756#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004757 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004758 Py_DECREF(unicode);
4759 return NULL;
4760 }
Victor Stinner17efeed2011-10-04 20:05:46 +02004761#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00004762 return (PyObject *)unicode;
4763
Benjamin Peterson29060642009-01-31 22:14:21 +00004764 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00004765 Py_DECREF(unicode);
4766 Py_XDECREF(errorHandler);
4767 Py_XDECREF(exc);
4768 return NULL;
4769}
4770
4771PyObject *
4772PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004773 Py_ssize_t size,
4774 const char *errors,
4775 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004776{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004777 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004778 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004779 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004780#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004781 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004782#else
4783 const int pairs = 0;
4784#endif
4785 /* Offsets from p for storing byte pairs in the right order. */
4786#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4787 int iorder[] = {0, 1, 2, 3};
4788#else
4789 int iorder[] = {3, 2, 1, 0};
4790#endif
4791
Benjamin Peterson29060642009-01-31 22:14:21 +00004792#define STORECHAR(CH) \
4793 do { \
4794 p[iorder[3]] = ((CH) >> 24) & 0xff; \
4795 p[iorder[2]] = ((CH) >> 16) & 0xff; \
4796 p[iorder[1]] = ((CH) >> 8) & 0xff; \
4797 p[iorder[0]] = (CH) & 0xff; \
4798 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00004799 } while(0)
4800
4801 /* In narrow builds we can output surrogate pairs as one codepoint,
4802 so we need less space. */
4803#ifndef Py_UNICODE_WIDE
4804 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00004805 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
4806 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
4807 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004808#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004809 nsize = (size - pairs + (byteorder == 0));
4810 bytesize = nsize * 4;
4811 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004812 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004813 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004814 if (v == NULL)
4815 return NULL;
4816
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004817 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004818 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004819 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004820 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004821 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004822
4823 if (byteorder == -1) {
4824 /* force LE */
4825 iorder[0] = 0;
4826 iorder[1] = 1;
4827 iorder[2] = 2;
4828 iorder[3] = 3;
4829 }
4830 else if (byteorder == 1) {
4831 /* force BE */
4832 iorder[0] = 3;
4833 iorder[1] = 2;
4834 iorder[2] = 1;
4835 iorder[3] = 0;
4836 }
4837
4838 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004839 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004840#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004841 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
4842 Py_UCS4 ch2 = *s;
4843 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
4844 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
4845 s++;
4846 size--;
4847 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004848 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004849#endif
4850 STORECHAR(ch);
4851 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004852
4853 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004854 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004855#undef STORECHAR
4856}
4857
Alexander Belopolsky40018472011-02-26 01:02:56 +00004858PyObject *
4859PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004860{
4861 if (!PyUnicode_Check(unicode)) {
4862 PyErr_BadArgument();
4863 return NULL;
4864 }
4865 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004866 PyUnicode_GET_SIZE(unicode),
4867 NULL,
4868 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004869}
4870
Guido van Rossumd57fd912000-03-10 22:53:23 +00004871/* --- UTF-16 Codec ------------------------------------------------------- */
4872
Tim Peters772747b2001-08-09 22:21:55 +00004873PyObject *
4874PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004875 Py_ssize_t size,
4876 const char *errors,
4877 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004878{
Walter Dörwald69652032004-09-07 20:24:22 +00004879 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
4880}
4881
Antoine Pitrouab868312009-01-10 15:40:25 +00004882/* Two masks for fast checking of whether a C 'long' may contain
 UTF-16 encoded surrogate characters. Each mask selects the top bit of
 every 16-bit code unit packed into the long: a unit with that bit
 clear is below 0x8000 and therefore cannot be a surrogate
 (0xD800-0xDFFF). This is an efficient heuristic, assuming that
 non-surrogate characters with a code point >= 0x8000 are rare in most
 input (they fail the mask test as well).
 FAST_CHAR_MASK is used when the input is in native byte ordering,
 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering, in
 which case the top bit of each unit lives in the other byte of the pair.
Benjamin Peterson29060642009-01-31 22:14:21 +00004888*/
/* One mask pair per supported width of 'long'; any other width is
   rejected at compile time. */
Antoine Pitrouab868312009-01-10 15:40:25 +00004889#if (SIZEOF_LONG == 8)
4890# define FAST_CHAR_MASK 0x8000800080008000L
4891# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
4892#elif (SIZEOF_LONG == 4)
4893# define FAST_CHAR_MASK 0x80008000L
4894# define SWAPPED_FAST_CHAR_MASK 0x00800080L
4895#else
4896# error C 'long' size should be either 4 or 8!
4897#endif
4898
4898
Walter Dörwald69652032004-09-07 20:24:22 +00004899PyObject *
4900PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004901 Py_ssize_t size,
4902 const char *errors,
4903 int *byteorder,
4904 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00004905{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004906 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004907 Py_ssize_t startinpos;
4908 Py_ssize_t endinpos;
4909 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004910 PyUnicodeObject *unicode;
4911 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004912 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00004913 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00004914 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004915 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00004916 /* Offsets from q for retrieving byte pairs in the right order. */
4917#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4918 int ihi = 1, ilo = 0;
4919#else
4920 int ihi = 0, ilo = 1;
4921#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004922 PyObject *errorHandler = NULL;
4923 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004924
4925 /* Note: size will always be longer than the resulting Unicode
4926 character count */
4927 unicode = _PyUnicode_New(size);
4928 if (!unicode)
4929 return NULL;
4930 if (size == 0)
4931 return (PyObject *)unicode;
4932
4933 /* Unpack UTF-16 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004934 p = PyUnicode_AS_UNICODE(unicode);
Tim Peters772747b2001-08-09 22:21:55 +00004935 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00004936 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004937
4938 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00004939 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004940
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004941 /* Check for BOM marks (U+FEFF) in the input and adjust current
4942 byte order setting accordingly. In native mode, the leading BOM
4943 mark is skipped, in all other modes, it is copied to the output
4944 stream as-is (giving a ZWNBSP character). */
4945 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00004946 if (size >= 2) {
4947 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004948#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004949 if (bom == 0xFEFF) {
4950 q += 2;
4951 bo = -1;
4952 }
4953 else if (bom == 0xFFFE) {
4954 q += 2;
4955 bo = 1;
4956 }
Tim Petersced69f82003-09-16 20:30:58 +00004957#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004958 if (bom == 0xFEFF) {
4959 q += 2;
4960 bo = 1;
4961 }
4962 else if (bom == 0xFFFE) {
4963 q += 2;
4964 bo = -1;
4965 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004966#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004967 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004968 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004969
Tim Peters772747b2001-08-09 22:21:55 +00004970 if (bo == -1) {
4971 /* force LE */
4972 ihi = 1;
4973 ilo = 0;
4974 }
4975 else if (bo == 1) {
4976 /* force BE */
4977 ihi = 0;
4978 ilo = 1;
4979 }
Antoine Pitrouab868312009-01-10 15:40:25 +00004980#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4981 native_ordering = ilo < ihi;
4982#else
4983 native_ordering = ilo > ihi;
4984#endif
Tim Peters772747b2001-08-09 22:21:55 +00004985
Antoine Pitrouab868312009-01-10 15:40:25 +00004986 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00004987 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004988 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00004989 /* First check for possible aligned read of a C 'long'. Unaligned
4990 reads are more expensive, better to defer to another iteration. */
4991 if (!((size_t) q & LONG_PTR_MASK)) {
4992 /* Fast path for runs of non-surrogate chars. */
4993 register const unsigned char *_q = q;
4994 Py_UNICODE *_p = p;
4995 if (native_ordering) {
4996 /* Native ordering is simple: as long as the input cannot
4997 possibly contain a surrogate char, do an unrolled copy
4998 of several 16-bit code points to the target object.
4999 The non-surrogate check is done on several input bytes
5000 at a time (as many as a C 'long' can contain). */
5001 while (_q < aligned_end) {
5002 unsigned long data = * (unsigned long *) _q;
5003 if (data & FAST_CHAR_MASK)
5004 break;
5005 _p[0] = ((unsigned short *) _q)[0];
5006 _p[1] = ((unsigned short *) _q)[1];
5007#if (SIZEOF_LONG == 8)
5008 _p[2] = ((unsigned short *) _q)[2];
5009 _p[3] = ((unsigned short *) _q)[3];
5010#endif
5011 _q += SIZEOF_LONG;
5012 _p += SIZEOF_LONG / 2;
5013 }
5014 }
5015 else {
5016 /* Byteswapped ordering is similar, but we must decompose
5017 the copy bytewise, and take care of zero'ing out the
5018 upper bytes if the target object is in 32-bit units
5019 (that is, in UCS-4 builds). */
5020 while (_q < aligned_end) {
5021 unsigned long data = * (unsigned long *) _q;
5022 if (data & SWAPPED_FAST_CHAR_MASK)
5023 break;
5024 /* Zero upper bytes in UCS-4 builds */
5025#if (Py_UNICODE_SIZE > 2)
5026 _p[0] = 0;
5027 _p[1] = 0;
5028#if (SIZEOF_LONG == 8)
5029 _p[2] = 0;
5030 _p[3] = 0;
5031#endif
5032#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00005033 /* Issue #4916; UCS-4 builds on big endian machines must
5034 fill the two last bytes of each 4-byte unit. */
5035#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
5036# define OFF 2
5037#else
5038# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00005039#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00005040 ((unsigned char *) _p)[OFF + 1] = _q[0];
5041 ((unsigned char *) _p)[OFF + 0] = _q[1];
5042 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
5043 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
5044#if (SIZEOF_LONG == 8)
5045 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
5046 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
5047 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
5048 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
5049#endif
5050#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00005051 _q += SIZEOF_LONG;
5052 _p += SIZEOF_LONG / 2;
5053 }
5054 }
5055 p = _p;
5056 q = _q;
5057 if (q >= e)
5058 break;
5059 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005060 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005061
Benjamin Peterson14339b62009-01-31 16:36:08 +00005062 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00005063
5064 if (ch < 0xD800 || ch > 0xDFFF) {
5065 *p++ = ch;
5066 continue;
5067 }
5068
5069 /* UTF-16 code pair: */
5070 if (q > e) {
5071 errmsg = "unexpected end of data";
5072 startinpos = (((const char *)q) - 2) - starts;
5073 endinpos = ((const char *)e) + 1 - starts;
5074 goto utf16Error;
5075 }
5076 if (0xD800 <= ch && ch <= 0xDBFF) {
5077 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
5078 q += 2;
5079 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00005080#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005081 *p++ = ch;
5082 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005083#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005084 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005085#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005086 continue;
5087 }
5088 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005089 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00005090 startinpos = (((const char *)q)-4)-starts;
5091 endinpos = startinpos+2;
5092 goto utf16Error;
5093 }
5094
Benjamin Peterson14339b62009-01-31 16:36:08 +00005095 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005096 errmsg = "illegal encoding";
5097 startinpos = (((const char *)q)-2)-starts;
5098 endinpos = startinpos+2;
5099 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005100
Benjamin Peterson29060642009-01-31 22:14:21 +00005101 utf16Error:
5102 outpos = p - PyUnicode_AS_UNICODE(unicode);
5103 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005104 errors,
5105 &errorHandler,
5106 "utf16", errmsg,
5107 &starts,
5108 (const char **)&e,
5109 &startinpos,
5110 &endinpos,
5111 &exc,
5112 (const char **)&q,
5113 &unicode,
5114 &outpos,
5115 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00005116 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005117 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005118 /* remaining byte at the end? (size should be even) */
5119 if (e == q) {
5120 if (!consumed) {
5121 errmsg = "truncated data";
5122 startinpos = ((const char *)q) - starts;
5123 endinpos = ((const char *)e) + 1 - starts;
5124 outpos = p - PyUnicode_AS_UNICODE(unicode);
5125 if (unicode_decode_call_errorhandler(
5126 errors,
5127 &errorHandler,
5128 "utf16", errmsg,
5129 &starts,
5130 (const char **)&e,
5131 &startinpos,
5132 &endinpos,
5133 &exc,
5134 (const char **)&q,
5135 &unicode,
5136 &outpos,
5137 &p))
5138 goto onError;
5139 /* The remaining input chars are ignored if the callback
5140 chooses to skip the input */
5141 }
5142 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005143
5144 if (byteorder)
5145 *byteorder = bo;
5146
Walter Dörwald69652032004-09-07 20:24:22 +00005147 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005148 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005149
Guido van Rossumd57fd912000-03-10 22:53:23 +00005150 /* Adjust length */
Victor Stinnerfe226c02011-10-03 03:52:20 +02005151 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005152 goto onError;
5153
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005154 Py_XDECREF(errorHandler);
5155 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02005156#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005157 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005158 Py_DECREF(unicode);
5159 return NULL;
5160 }
Victor Stinner17efeed2011-10-04 20:05:46 +02005161#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00005162 return (PyObject *)unicode;
5163
Benjamin Peterson29060642009-01-31 22:14:21 +00005164 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005165 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005166 Py_XDECREF(errorHandler);
5167 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005168 return NULL;
5169}
5170
Antoine Pitrouab868312009-01-10 15:40:25 +00005171#undef FAST_CHAR_MASK
5172#undef SWAPPED_FAST_CHAR_MASK
5173
Tim Peters772747b2001-08-09 22:21:55 +00005174PyObject *
5175PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005176 Py_ssize_t size,
5177 const char *errors,
5178 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005179{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005180 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00005181 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005182 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005183#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005184 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005185#else
5186 const int pairs = 0;
5187#endif
Tim Peters772747b2001-08-09 22:21:55 +00005188 /* Offsets from p for storing byte pairs in the right order. */
5189#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5190 int ihi = 1, ilo = 0;
5191#else
5192 int ihi = 0, ilo = 1;
5193#endif
5194
Benjamin Peterson29060642009-01-31 22:14:21 +00005195#define STORECHAR(CH) \
5196 do { \
5197 p[ihi] = ((CH) >> 8) & 0xff; \
5198 p[ilo] = (CH) & 0xff; \
5199 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00005200 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005201
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005202#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005203 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00005204 if (s[i] >= 0x10000)
5205 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005206#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005207 /* 2 * (size + pairs + (byteorder == 0)) */
5208 if (size > PY_SSIZE_T_MAX ||
5209 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005210 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005211 nsize = size + pairs + (byteorder == 0);
5212 bytesize = nsize * 2;
5213 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005214 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005215 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005216 if (v == NULL)
5217 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005218
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005219 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005220 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005221 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005222 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005223 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005224
5225 if (byteorder == -1) {
5226 /* force LE */
5227 ihi = 1;
5228 ilo = 0;
5229 }
5230 else if (byteorder == 1) {
5231 /* force BE */
5232 ihi = 0;
5233 ilo = 1;
5234 }
5235
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005236 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005237 Py_UNICODE ch = *s++;
5238 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005239#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005240 if (ch >= 0x10000) {
5241 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
5242 ch = 0xD800 | ((ch-0x10000) >> 10);
5243 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005244#endif
Tim Peters772747b2001-08-09 22:21:55 +00005245 STORECHAR(ch);
5246 if (ch2)
5247 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005248 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005249
5250 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005251 return v;
Tim Peters772747b2001-08-09 22:21:55 +00005252#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005253}
5254
Alexander Belopolsky40018472011-02-26 01:02:56 +00005255PyObject *
5256PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005257{
5258 if (!PyUnicode_Check(unicode)) {
5259 PyErr_BadArgument();
5260 return NULL;
5261 }
5262 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005263 PyUnicode_GET_SIZE(unicode),
5264 NULL,
5265 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005266}
5267
5268/* --- Unicode Escape Codec ----------------------------------------------- */
5269
/* Helper function for PyUnicode_DecodeUnicodeEscape: pre-scans an escaped
   byte string to find out whether decoding it yields a pure ASCII string.

   s    -- input bytes (not required to be NUL-terminated; only the first
           `size` bytes are examined)
   size -- number of bytes in s

   Returns the exact number of characters the decoded string will have
   when the input consists solely of ASCII bytes and of escapes that
   produce ASCII characters (\\, \', \", \b, \f, \t, \n, \r, \v, \a,
   backslash-newline, and unrecognised escapes, which are kept verbatim
   as two characters).

   Returns -1 when that cannot be guaranteed: size is negative, a byte is
   above 127, the input ends right after a backslash, or an octal, \x, \u,
   \U or \N escape is present.
 */
Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *p;
    const unsigned char *end;
    Py_ssize_t length = 0;

    /* Reject a negative size before forming `s + size`; computing a
       pointer in front of the buffer is undefined behaviour. */
    if (size < 0)
        return -1;

    p = (const unsigned char *)s;
    end = p + size;

    for (; p < end; ++p) {
        if (*p > 127) {
            /* Non-ASCII */
            return -1;
        }
        else if (*p != '\\') {
            /* Normal character */
            ++length;
        }
        else {
            /* Backslash-escape, check next char */
            ++p;
            /* Escape sequence reaches till end of string or
               non-ASCII follow-up. */
            if (p >= end || *p > 127)
                return -1;
            switch (*p) {
            case '\n':
                /* backslash + \n result in zero characters */
                break;
            case '\\': case '\'': case '\"':
            case 'b': case 'f': case 't':
            case 'n': case 'r': case 'v': case 'a':
                ++length;
                break;
            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7':
            case 'x': case 'u': case 'U': case 'N':
                /* these do not guarantee ASCII characters */
                return -1;
            default:
                /* count the backslash + the other character */
                length += 2;
                break;
            }
        }
    }
    return length;
}
5324
/* Similar to PyUnicode_WRITE but either write into wstr field
   or treat string as ASCII.

   When `kind` is PyUnicode_WCHAR_KIND the value is stored as a Py_UNICODE
   element of `buf`; for every other kind it is stored as a single
   unsigned char.  `index` and `value` are evaluated exactly once. */
#define WRITE_ASCII_OR_WSTR(kind, buf, index, value) \
    do { \
        if ((kind) != PyUnicode_WCHAR_KIND) \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
        else \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \
    } while (0)

/* Store `value` as a Py_UNICODE element of `buf`.

   NOTE: this macro reads a variable named `kind` from the enclosing scope
   (it is not a macro argument) and asserts that it is
   PyUnicode_WCHAR_KIND.  It is wrapped in do { } while (0) so that it
   behaves as a single statement, like WRITE_ASCII_OR_WSTR. */
#define WRITE_WSTR(buf, index, value) \
    do { \
        assert(kind == PyUnicode_WCHAR_KIND); \
        ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \
    } while (0)
5338
5339
Fredrik Lundh06d12682001-01-24 07:59:11 +00005340static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005341
Alexander Belopolsky40018472011-02-26 01:02:56 +00005342PyObject *
5343PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005344 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005345 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005346{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005347 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005348 Py_ssize_t startinpos;
5349 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005350 int j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005351 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005352 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005353 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005354 char* message;
5355 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005356 PyObject *errorHandler = NULL;
5357 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005358 Py_ssize_t ascii_length;
5359 Py_ssize_t i;
5360 int kind;
5361 void *data;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005362
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005363 ascii_length = length_of_escaped_ascii_string(s, size);
5364
5365 /* After length_of_escaped_ascii_string() there are two alternatives,
5366 either the string is pure ASCII with named escapes like \n, etc.
5367 and we determined it's exact size (common case)
5368 or it contains \x, \u, ... escape sequences. then we create a
5369 legacy wchar string and resize it at the end of this function. */
5370 if (ascii_length >= 0) {
5371 v = (PyUnicodeObject *)PyUnicode_New(ascii_length, 127);
5372 if (!v)
5373 goto onError;
5374 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
5375 kind = PyUnicode_1BYTE_KIND;
5376 data = PyUnicode_DATA(v);
5377 }
5378 else {
5379 /* Escaped strings will always be longer than the resulting
5380 Unicode string, so we start with size here and then reduce the
5381 length after conversion to the true value.
5382 (but if the error callback returns a long replacement string
5383 we'll have to allocate more space) */
5384 v = _PyUnicode_New(size);
5385 if (!v)
5386 goto onError;
5387 kind = PyUnicode_WCHAR_KIND;
5388 data = PyUnicode_AS_UNICODE(v);
5389 }
5390
Guido van Rossumd57fd912000-03-10 22:53:23 +00005391 if (size == 0)
5392 return (PyObject *)v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005393 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005394 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005395
Guido van Rossumd57fd912000-03-10 22:53:23 +00005396 while (s < end) {
5397 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005398 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005399 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005400
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005401 if (kind == PyUnicode_WCHAR_KIND) {
5402 assert(i < _PyUnicode_WSTR_LENGTH(v));
5403 }
5404 else {
5405 /* The only case in which i == ascii_length is a backslash
5406 followed by a newline. */
5407 assert(i <= ascii_length);
5408 }
5409
Guido van Rossumd57fd912000-03-10 22:53:23 +00005410 /* Non-escape characters are interpreted as Unicode ordinals */
5411 if (*s != '\\') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005412 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char) *s++);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005413 continue;
5414 }
5415
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005416 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005417 /* \ - Escapes */
5418 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005419 c = *s++;
5420 if (s > end)
5421 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005422
5423 if (kind == PyUnicode_WCHAR_KIND) {
5424 assert(i < _PyUnicode_WSTR_LENGTH(v));
5425 }
5426 else {
5427 /* The only case in which i == ascii_length is a backslash
5428 followed by a newline. */
5429 assert(i < ascii_length || (i == ascii_length && c == '\n'));
5430 }
5431
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005432 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005433
Benjamin Peterson29060642009-01-31 22:14:21 +00005434 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005435 case '\n': break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005436 case '\\': WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); break;
5437 case '\'': WRITE_ASCII_OR_WSTR(kind, data, i++, '\''); break;
5438 case '\"': WRITE_ASCII_OR_WSTR(kind, data, i++, '\"'); break;
5439 case 'b': WRITE_ASCII_OR_WSTR(kind, data, i++, '\b'); break;
5440 /* FF */
5441 case 'f': WRITE_ASCII_OR_WSTR(kind, data, i++, '\014'); break;
5442 case 't': WRITE_ASCII_OR_WSTR(kind, data, i++, '\t'); break;
5443 case 'n': WRITE_ASCII_OR_WSTR(kind, data, i++, '\n'); break;
5444 case 'r': WRITE_ASCII_OR_WSTR(kind, data, i++, '\r'); break;
5445 /* VT */
5446 case 'v': WRITE_ASCII_OR_WSTR(kind, data, i++, '\013'); break;
5447 /* BEL, not classic C */
5448 case 'a': WRITE_ASCII_OR_WSTR(kind, data, i++, '\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005449
Benjamin Peterson29060642009-01-31 22:14:21 +00005450 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005451 case '0': case '1': case '2': case '3':
5452 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005453 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005454 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005455 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005456 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005457 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005458 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005459 WRITE_WSTR(data, i++, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005460 break;
5461
Benjamin Peterson29060642009-01-31 22:14:21 +00005462 /* hex escapes */
5463 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005464 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005465 digits = 2;
5466 message = "truncated \\xXX escape";
5467 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005468
Benjamin Peterson29060642009-01-31 22:14:21 +00005469 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005470 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005471 digits = 4;
5472 message = "truncated \\uXXXX escape";
5473 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005474
Benjamin Peterson29060642009-01-31 22:14:21 +00005475 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005476 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005477 digits = 8;
5478 message = "truncated \\UXXXXXXXX escape";
5479 hexescape:
5480 chr = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005481 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005482 if (s+digits>end) {
5483 endinpos = size;
5484 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005485 errors, &errorHandler,
5486 "unicodeescape", "end of string in escape sequence",
5487 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005488 &v, &i, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005489 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005490 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005491 goto nextByte;
5492 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005493 for (j = 0; j < digits; ++j) {
5494 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005495 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005496 endinpos = (s+j+1)-starts;
5497 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005498 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005499 errors, &errorHandler,
5500 "unicodeescape", message,
5501 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005502 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005503 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005504 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005505 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005506 }
5507 chr = (chr<<4) & ~0xF;
5508 if (c >= '0' && c <= '9')
5509 chr += c - '0';
5510 else if (c >= 'a' && c <= 'f')
5511 chr += 10 + c - 'a';
5512 else
5513 chr += 10 + c - 'A';
5514 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005515 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005516 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005517 /* _decoding_error will have already written into the
5518 target buffer. */
5519 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005520 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005521 /* when we get here, chr is a 32-bit unicode character */
5522 if (chr <= 0xffff)
5523 /* UCS-2 character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005524 WRITE_WSTR(data, i++, chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005525 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005526 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00005527 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00005528#ifdef Py_UNICODE_WIDE
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005529 WRITE_WSTR(data, i++, chr);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005530#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00005531 chr -= 0x10000L;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005532 WRITE_WSTR(data, i++, 0xD800 + (Py_UNICODE) (chr >> 10));
5533 WRITE_WSTR(data, i++, 0xDC00 + (Py_UNICODE) (chr & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005534#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00005535 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005536 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005537 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005538 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005539 errors, &errorHandler,
5540 "unicodeescape", "illegal Unicode character",
5541 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005542 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005543 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005544 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005545 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005546 break;
5547
Benjamin Peterson29060642009-01-31 22:14:21 +00005548 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005549 case 'N':
5550 message = "malformed \\N character escape";
5551 if (ucnhash_CAPI == NULL) {
5552 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005553 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5554 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005555 if (ucnhash_CAPI == NULL)
5556 goto ucnhashError;
5557 }
5558 if (*s == '{') {
5559 const char *start = s+1;
5560 /* look for the closing brace */
5561 while (*s != '}' && s < end)
5562 s++;
5563 if (s > start && s < end && *s == '}') {
5564 /* found a name. look it up in the unicode database */
5565 message = "unknown Unicode character name";
5566 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005567 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
5568 &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005569 goto store;
5570 }
5571 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005572 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005573 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005574 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005575 errors, &errorHandler,
5576 "unicodeescape", message,
5577 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005578 &v, &i, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005579 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005580 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005581 break;
5582
5583 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005584 if (s > end) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005585 assert(kind == PyUnicode_WCHAR_KIND);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005586 message = "\\ at end of string";
5587 s--;
5588 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005589 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005590 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005591 errors, &errorHandler,
5592 "unicodeescape", message,
5593 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005594 &v, &i, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00005595 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005596 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005597 }
5598 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005599 WRITE_ASCII_OR_WSTR(kind, data, i++, '\\');
5600 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005601 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005602 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005603 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005604 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005605 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005606 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005607 /* Ensure the length prediction worked in case of ASCII strings */
5608 assert(kind == PyUnicode_WCHAR_KIND || i == ascii_length);
5609
Victor Stinnerfe226c02011-10-03 03:52:20 +02005610 if (kind == PyUnicode_WCHAR_KIND)
5611 {
5612 if (PyUnicode_Resize((PyObject**)&v, i) < 0)
5613 goto onError;
Victor Stinnerfe226c02011-10-03 03:52:20 +02005614 }
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005615 Py_XDECREF(errorHandler);
5616 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02005617#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005618 if (_PyUnicode_READY_REPLACE(&v)) {
5619 Py_DECREF(v);
5620 return NULL;
5621 }
Victor Stinner17efeed2011-10-04 20:05:46 +02005622#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00005623 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00005624
Benjamin Peterson29060642009-01-31 22:14:21 +00005625 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005626 PyErr_SetString(
5627 PyExc_UnicodeError,
5628 "\\N escapes not supported (can't load unicodedata module)"
5629 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005630 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005631 Py_XDECREF(errorHandler);
5632 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005633 return NULL;
5634
Benjamin Peterson29060642009-01-31 22:14:21 +00005635 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005636 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005637 Py_XDECREF(errorHandler);
5638 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005639 return NULL;
5640}
5641
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005642#undef WRITE_ASCII_OR_WSTR
5643#undef WRITE_WSTR
5644
/* Return a Unicode-Escape string version of the Unicode object.

   The result is the bare escaped text; no enclosing quotes are added
   (the encoder below takes no "quotes" argument).

*/
5651
Walter Dörwald79e913e2007-05-12 11:08:06 +00005652static const char *hexdigits = "0123456789abcdef";
5653
Alexander Belopolsky40018472011-02-26 01:02:56 +00005654PyObject *
5655PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005656 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005657{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005658 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005659 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005660
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005661#ifdef Py_UNICODE_WIDE
5662 const Py_ssize_t expandsize = 10;
5663#else
5664 const Py_ssize_t expandsize = 6;
5665#endif
5666
Thomas Wouters89f507f2006-12-13 04:49:30 +00005667 /* XXX(nnorwitz): rather than over-allocating, it would be
5668 better to choose a different scheme. Perhaps scan the
5669 first N-chars of the string and allocate based on that size.
5670 */
5671 /* Initial allocation is based on the longest-possible unichr
5672 escape.
5673
5674 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5675 unichr, so in this case it's the longest unichr escape. In
5676 narrow (UTF-16) builds this is five chars per source unichr
5677 since there are two unichrs in the surrogate pair, so in narrow
5678 (UTF-16) builds it's not the longest unichr escape.
5679
5680 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5681 so in the narrow (UTF-16) build case it's the longest unichr
5682 escape.
5683 */
5684
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005685 if (size == 0)
5686 return PyBytes_FromStringAndSize(NULL, 0);
5687
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005688 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005689 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005690
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005691 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005692 2
5693 + expandsize*size
5694 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005695 if (repr == NULL)
5696 return NULL;
5697
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005698 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005699
Guido van Rossumd57fd912000-03-10 22:53:23 +00005700 while (size-- > 0) {
5701 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005702
Walter Dörwald79e913e2007-05-12 11:08:06 +00005703 /* Escape backslashes */
5704 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005705 *p++ = '\\';
5706 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005707 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005708 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005709
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00005710#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005711 /* Map 21-bit characters to '\U00xxxxxx' */
5712 else if (ch >= 0x10000) {
5713 *p++ = '\\';
5714 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005715 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
5716 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
5717 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
5718 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
5719 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
5720 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
5721 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
5722 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005723 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005724 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005725#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005726 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5727 else if (ch >= 0xD800 && ch < 0xDC00) {
5728 Py_UNICODE ch2;
5729 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00005730
Benjamin Peterson29060642009-01-31 22:14:21 +00005731 ch2 = *s++;
5732 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005733 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005734 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5735 *p++ = '\\';
5736 *p++ = 'U';
5737 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
5738 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
5739 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
5740 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
5741 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
5742 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
5743 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
5744 *p++ = hexdigits[ucs & 0x0000000F];
5745 continue;
5746 }
5747 /* Fall through: isolated surrogates are copied as-is */
5748 s--;
5749 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005750 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005751#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005752
Guido van Rossumd57fd912000-03-10 22:53:23 +00005753 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005754 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005755 *p++ = '\\';
5756 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005757 *p++ = hexdigits[(ch >> 12) & 0x000F];
5758 *p++ = hexdigits[(ch >> 8) & 0x000F];
5759 *p++ = hexdigits[(ch >> 4) & 0x000F];
5760 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005761 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005762
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005763 /* Map special whitespace to '\t', \n', '\r' */
5764 else if (ch == '\t') {
5765 *p++ = '\\';
5766 *p++ = 't';
5767 }
5768 else if (ch == '\n') {
5769 *p++ = '\\';
5770 *p++ = 'n';
5771 }
5772 else if (ch == '\r') {
5773 *p++ = '\\';
5774 *p++ = 'r';
5775 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005776
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005777 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005778 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005779 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005780 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005781 *p++ = hexdigits[(ch >> 4) & 0x000F];
5782 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005783 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005784
Guido van Rossumd57fd912000-03-10 22:53:23 +00005785 /* Copy everything else as-is */
5786 else
5787 *p++ = (char) ch;
5788 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005789
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005790 assert(p - PyBytes_AS_STRING(repr) > 0);
5791 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5792 return NULL;
5793 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005794}
5795
Alexander Belopolsky40018472011-02-26 01:02:56 +00005796PyObject *
5797PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005798{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005799 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005800 if (!PyUnicode_Check(unicode)) {
5801 PyErr_BadArgument();
5802 return NULL;
5803 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00005804 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5805 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005806 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005807}
5808
5809/* --- Raw Unicode Escape Codec ------------------------------------------- */
5810
Alexander Belopolsky40018472011-02-26 01:02:56 +00005811PyObject *
5812PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005813 Py_ssize_t size,
5814 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005815{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005816 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005817 Py_ssize_t startinpos;
5818 Py_ssize_t endinpos;
5819 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005820 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005821 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005822 const char *end;
5823 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005824 PyObject *errorHandler = NULL;
5825 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005826
Guido van Rossumd57fd912000-03-10 22:53:23 +00005827 /* Escaped strings will always be longer than the resulting
5828 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005829 length after conversion to the true value. (But decoding error
5830 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005831 v = _PyUnicode_New(size);
5832 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005833 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005834 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005835 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005836 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005837 end = s + size;
5838 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005839 unsigned char c;
5840 Py_UCS4 x;
5841 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005842 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005843
Benjamin Peterson29060642009-01-31 22:14:21 +00005844 /* Non-escape characters are interpreted as Unicode ordinals */
5845 if (*s != '\\') {
5846 *p++ = (unsigned char)*s++;
5847 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005848 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005849 startinpos = s-starts;
5850
5851 /* \u-escapes are only interpreted iff the number of leading
5852 backslashes if odd */
5853 bs = s;
5854 for (;s < end;) {
5855 if (*s != '\\')
5856 break;
5857 *p++ = (unsigned char)*s++;
5858 }
5859 if (((s - bs) & 1) == 0 ||
5860 s >= end ||
5861 (*s != 'u' && *s != 'U')) {
5862 continue;
5863 }
5864 p--;
5865 count = *s=='u' ? 4 : 8;
5866 s++;
5867
5868 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
5869 outpos = p-PyUnicode_AS_UNICODE(v);
5870 for (x = 0, i = 0; i < count; ++i, ++s) {
5871 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00005872 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005873 endinpos = s-starts;
5874 if (unicode_decode_call_errorhandler(
5875 errors, &errorHandler,
5876 "rawunicodeescape", "truncated \\uXXXX",
5877 &starts, &end, &startinpos, &endinpos, &exc, &s,
5878 &v, &outpos, &p))
5879 goto onError;
5880 goto nextByte;
5881 }
5882 x = (x<<4) & ~0xF;
5883 if (c >= '0' && c <= '9')
5884 x += c - '0';
5885 else if (c >= 'a' && c <= 'f')
5886 x += 10 + c - 'a';
5887 else
5888 x += 10 + c - 'A';
5889 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00005890 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00005891 /* UCS-2 character */
5892 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005893 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005894 /* UCS-4 character. Either store directly, or as
5895 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00005896#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005897 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005898#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005899 x -= 0x10000L;
5900 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
5901 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00005902#endif
5903 } else {
5904 endinpos = s-starts;
5905 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005906 if (unicode_decode_call_errorhandler(
5907 errors, &errorHandler,
5908 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00005909 &starts, &end, &startinpos, &endinpos, &exc, &s,
5910 &v, &outpos, &p))
5911 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005912 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005913 nextByte:
5914 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005915 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02005916 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005917 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005918 Py_XDECREF(errorHandler);
5919 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02005920#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005921 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005922 Py_DECREF(v);
5923 return NULL;
5924 }
Victor Stinner17efeed2011-10-04 20:05:46 +02005925#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00005926 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00005927
Benjamin Peterson29060642009-01-31 22:14:21 +00005928 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005929 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005930 Py_XDECREF(errorHandler);
5931 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005932 return NULL;
5933}
5934
Alexander Belopolsky40018472011-02-26 01:02:56 +00005935PyObject *
5936PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005937 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005938{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005939 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005940 char *p;
5941 char *q;
5942
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005943#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005944 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005945#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005946 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005947#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00005948
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005949 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005950 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00005951
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005952 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005953 if (repr == NULL)
5954 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005955 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005956 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005957
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005958 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005959 while (size-- > 0) {
5960 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005961#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005962 /* Map 32-bit characters to '\Uxxxxxxxx' */
5963 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005964 *p++ = '\\';
5965 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00005966 *p++ = hexdigits[(ch >> 28) & 0xf];
5967 *p++ = hexdigits[(ch >> 24) & 0xf];
5968 *p++ = hexdigits[(ch >> 20) & 0xf];
5969 *p++ = hexdigits[(ch >> 16) & 0xf];
5970 *p++ = hexdigits[(ch >> 12) & 0xf];
5971 *p++ = hexdigits[(ch >> 8) & 0xf];
5972 *p++ = hexdigits[(ch >> 4) & 0xf];
5973 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00005974 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005975 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00005976#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005977 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5978 if (ch >= 0xD800 && ch < 0xDC00) {
5979 Py_UNICODE ch2;
5980 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005981
Benjamin Peterson29060642009-01-31 22:14:21 +00005982 ch2 = *s++;
5983 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005984 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005985 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5986 *p++ = '\\';
5987 *p++ = 'U';
5988 *p++ = hexdigits[(ucs >> 28) & 0xf];
5989 *p++ = hexdigits[(ucs >> 24) & 0xf];
5990 *p++ = hexdigits[(ucs >> 20) & 0xf];
5991 *p++ = hexdigits[(ucs >> 16) & 0xf];
5992 *p++ = hexdigits[(ucs >> 12) & 0xf];
5993 *p++ = hexdigits[(ucs >> 8) & 0xf];
5994 *p++ = hexdigits[(ucs >> 4) & 0xf];
5995 *p++ = hexdigits[ucs & 0xf];
5996 continue;
5997 }
5998 /* Fall through: isolated surrogates are copied as-is */
5999 s--;
6000 size++;
6001 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006002#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00006003 /* Map 16-bit characters to '\uxxxx' */
6004 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006005 *p++ = '\\';
6006 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00006007 *p++ = hexdigits[(ch >> 12) & 0xf];
6008 *p++ = hexdigits[(ch >> 8) & 0xf];
6009 *p++ = hexdigits[(ch >> 4) & 0xf];
6010 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006011 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006012 /* Copy everything else as-is */
6013 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006014 *p++ = (char) ch;
6015 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006016 size = p - q;
6017
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006018 assert(size > 0);
6019 if (_PyBytes_Resize(&repr, size) < 0)
6020 return NULL;
6021 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006022}
6023
Alexander Belopolsky40018472011-02-26 01:02:56 +00006024PyObject *
6025PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006026{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006027 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006028 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00006029 PyErr_BadArgument();
6030 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006031 }
Walter Dörwald711005d2007-05-12 12:03:26 +00006032 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
6033 PyUnicode_GET_SIZE(unicode));
6034
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006035 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006036}
6037
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006038/* --- Unicode Internal Codec ------------------------------------------- */
6039
Alexander Belopolsky40018472011-02-26 01:02:56 +00006040PyObject *
6041_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006042 Py_ssize_t size,
6043 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006044{
6045 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006046 Py_ssize_t startinpos;
6047 Py_ssize_t endinpos;
6048 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006049 PyUnicodeObject *v;
6050 Py_UNICODE *p;
6051 const char *end;
6052 const char *reason;
6053 PyObject *errorHandler = NULL;
6054 PyObject *exc = NULL;
6055
Neal Norwitzd43069c2006-01-08 01:12:10 +00006056#ifdef Py_UNICODE_WIDE
6057 Py_UNICODE unimax = PyUnicode_GetMax();
6058#endif
6059
Thomas Wouters89f507f2006-12-13 04:49:30 +00006060 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006061 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
6062 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006063 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006064 /* Intentionally PyUnicode_GET_SIZE instead of PyUnicode_GET_LENGTH
6065 as string was created with the old API. */
6066 if (PyUnicode_GET_SIZE(v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006067 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006068 p = PyUnicode_AS_UNICODE(v);
6069 end = s + size;
6070
6071 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006072 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006073 /* We have to sanity check the raw data, otherwise doom looms for
6074 some malformed UCS-4 data. */
6075 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006076#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006077 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006078#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006079 end-s < Py_UNICODE_SIZE
6080 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006081 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006082 startinpos = s - starts;
6083 if (end-s < Py_UNICODE_SIZE) {
6084 endinpos = end-starts;
6085 reason = "truncated input";
6086 }
6087 else {
6088 endinpos = s - starts + Py_UNICODE_SIZE;
6089 reason = "illegal code point (> 0x10FFFF)";
6090 }
6091 outpos = p - PyUnicode_AS_UNICODE(v);
6092 if (unicode_decode_call_errorhandler(
6093 errors, &errorHandler,
6094 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006095 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00006096 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006097 goto onError;
6098 }
6099 }
6100 else {
6101 p++;
6102 s += Py_UNICODE_SIZE;
6103 }
6104 }
6105
Victor Stinnerfe226c02011-10-03 03:52:20 +02006106 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006107 goto onError;
6108 Py_XDECREF(errorHandler);
6109 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02006110#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006111 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006112 Py_DECREF(v);
6113 return NULL;
6114 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006115#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006116 return (PyObject *)v;
6117
Benjamin Peterson29060642009-01-31 22:14:21 +00006118 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006119 Py_XDECREF(v);
6120 Py_XDECREF(errorHandler);
6121 Py_XDECREF(exc);
6122 return NULL;
6123}
6124
Guido van Rossumd57fd912000-03-10 22:53:23 +00006125/* --- Latin-1 Codec ------------------------------------------------------ */
6126
Alexander Belopolsky40018472011-02-26 01:02:56 +00006127PyObject *
6128PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006129 Py_ssize_t size,
6130 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006131{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006132 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006133 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006134}
6135
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006136/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006137static void
6138make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006139 const char *encoding,
6140 const Py_UNICODE *unicode, Py_ssize_t size,
6141 Py_ssize_t startpos, Py_ssize_t endpos,
6142 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006143{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006144 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006145 *exceptionObject = PyUnicodeEncodeError_Create(
6146 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006147 }
6148 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006149 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6150 goto onError;
6151 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6152 goto onError;
6153 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6154 goto onError;
6155 return;
6156 onError:
6157 Py_DECREF(*exceptionObject);
6158 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006159 }
6160}
6161
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006162/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006163static void
6164raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006165 const char *encoding,
6166 const Py_UNICODE *unicode, Py_ssize_t size,
6167 Py_ssize_t startpos, Py_ssize_t endpos,
6168 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006169{
6170 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006171 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006172 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006173 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006174}
6175
6176/* error handling callback helper:
6177 build arguments, call the callback and check the arguments,
6178 put the result into newpos and return the replacement string, which
6179 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006180static PyObject *
6181unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006182 PyObject **errorHandler,
6183 const char *encoding, const char *reason,
6184 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
6185 Py_ssize_t startpos, Py_ssize_t endpos,
6186 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006187{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006188 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006189
6190 PyObject *restuple;
6191 PyObject *resunicode;
6192
6193 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006194 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006195 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006196 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006197 }
6198
6199 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006200 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006201 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006202 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006203
6204 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006205 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006206 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006207 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006208 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006209 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006210 Py_DECREF(restuple);
6211 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006212 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006213 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006214 &resunicode, newpos)) {
6215 Py_DECREF(restuple);
6216 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006217 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006218 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6219 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6220 Py_DECREF(restuple);
6221 return NULL;
6222 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006223 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006224 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006225 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006226 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6227 Py_DECREF(restuple);
6228 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006229 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006230 Py_INCREF(resunicode);
6231 Py_DECREF(restuple);
6232 return resunicode;
6233}
6234
Alexander Belopolsky40018472011-02-26 01:02:56 +00006235static PyObject *
6236unicode_encode_ucs1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006237 Py_ssize_t size,
6238 const char *errors,
6239 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006240{
6241 /* output object */
6242 PyObject *res;
6243 /* pointers to the beginning and end+1 of input */
6244 const Py_UNICODE *startp = p;
6245 const Py_UNICODE *endp = p + size;
6246 /* pointer to the beginning of the unencodable characters */
6247 /* const Py_UNICODE *badp = NULL; */
6248 /* pointer into the output */
6249 char *str;
6250 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006251 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006252 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6253 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006254 PyObject *errorHandler = NULL;
6255 PyObject *exc = NULL;
6256 /* the following variable is used for caching string comparisons
6257 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6258 int known_errorHandler = -1;
6259
6260 /* allocate enough for a simple encoding without
6261 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006262 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006263 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006264 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006265 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006266 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006267 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006268 ressize = size;
6269
6270 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006271 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006272
Benjamin Peterson29060642009-01-31 22:14:21 +00006273 /* can we encode this? */
6274 if (c<limit) {
6275 /* no overflow check, because we know that the space is enough */
6276 *str++ = (char)c;
6277 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006278 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006279 else {
6280 Py_ssize_t unicodepos = p-startp;
6281 Py_ssize_t requiredsize;
6282 PyObject *repunicode;
6283 Py_ssize_t repsize;
6284 Py_ssize_t newpos;
6285 Py_ssize_t respos;
6286 Py_UNICODE *uni2;
6287 /* startpos for collecting unencodable chars */
6288 const Py_UNICODE *collstart = p;
6289 const Py_UNICODE *collend = p;
6290 /* find all unecodable characters */
6291 while ((collend < endp) && ((*collend)>=limit))
6292 ++collend;
6293 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6294 if (known_errorHandler==-1) {
6295 if ((errors==NULL) || (!strcmp(errors, "strict")))
6296 known_errorHandler = 1;
6297 else if (!strcmp(errors, "replace"))
6298 known_errorHandler = 2;
6299 else if (!strcmp(errors, "ignore"))
6300 known_errorHandler = 3;
6301 else if (!strcmp(errors, "xmlcharrefreplace"))
6302 known_errorHandler = 4;
6303 else
6304 known_errorHandler = 0;
6305 }
6306 switch (known_errorHandler) {
6307 case 1: /* strict */
6308 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
6309 goto onError;
6310 case 2: /* replace */
6311 while (collstart++<collend)
6312 *str++ = '?'; /* fall through */
6313 case 3: /* ignore */
6314 p = collend;
6315 break;
6316 case 4: /* xmlcharrefreplace */
6317 respos = str - PyBytes_AS_STRING(res);
6318 /* determine replacement size (temporarily (mis)uses p) */
6319 for (p = collstart, repsize = 0; p < collend; ++p) {
6320 if (*p<10)
6321 repsize += 2+1+1;
6322 else if (*p<100)
6323 repsize += 2+2+1;
6324 else if (*p<1000)
6325 repsize += 2+3+1;
6326 else if (*p<10000)
6327 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006328#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006329 else
6330 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006331#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006332 else if (*p<100000)
6333 repsize += 2+5+1;
6334 else if (*p<1000000)
6335 repsize += 2+6+1;
6336 else
6337 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00006338#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00006339 }
6340 requiredsize = respos+repsize+(endp-collend);
6341 if (requiredsize > ressize) {
6342 if (requiredsize<2*ressize)
6343 requiredsize = 2*ressize;
6344 if (_PyBytes_Resize(&res, requiredsize))
6345 goto onError;
6346 str = PyBytes_AS_STRING(res) + respos;
6347 ressize = requiredsize;
6348 }
6349 /* generate replacement (temporarily (mis)uses p) */
6350 for (p = collstart; p < collend; ++p) {
6351 str += sprintf(str, "&#%d;", (int)*p);
6352 }
6353 p = collend;
6354 break;
6355 default:
6356 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6357 encoding, reason, startp, size, &exc,
6358 collstart-startp, collend-startp, &newpos);
6359 if (repunicode == NULL)
6360 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006361 if (PyBytes_Check(repunicode)) {
6362 /* Directly copy bytes result to output. */
6363 repsize = PyBytes_Size(repunicode);
6364 if (repsize > 1) {
6365 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006366 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006367 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6368 Py_DECREF(repunicode);
6369 goto onError;
6370 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006371 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006372 ressize += repsize-1;
6373 }
6374 memcpy(str, PyBytes_AsString(repunicode), repsize);
6375 str += repsize;
6376 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006377 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006378 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006379 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006380 /* need more space? (at least enough for what we
6381 have+the replacement+the rest of the string, so
6382 we won't have to check space for encodable characters) */
6383 respos = str - PyBytes_AS_STRING(res);
6384 repsize = PyUnicode_GET_SIZE(repunicode);
6385 requiredsize = respos+repsize+(endp-collend);
6386 if (requiredsize > ressize) {
6387 if (requiredsize<2*ressize)
6388 requiredsize = 2*ressize;
6389 if (_PyBytes_Resize(&res, requiredsize)) {
6390 Py_DECREF(repunicode);
6391 goto onError;
6392 }
6393 str = PyBytes_AS_STRING(res) + respos;
6394 ressize = requiredsize;
6395 }
6396 /* check if there is anything unencodable in the replacement
6397 and copy it to the output */
6398 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
6399 c = *uni2;
6400 if (c >= limit) {
6401 raise_encode_exception(&exc, encoding, startp, size,
6402 unicodepos, unicodepos+1, reason);
6403 Py_DECREF(repunicode);
6404 goto onError;
6405 }
6406 *str = (char)c;
6407 }
6408 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006409 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006410 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006411 }
6412 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006413 /* Resize if we allocated to much */
6414 size = str - PyBytes_AS_STRING(res);
6415 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006416 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006417 if (_PyBytes_Resize(&res, size) < 0)
6418 goto onError;
6419 }
6420
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006421 Py_XDECREF(errorHandler);
6422 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006423 return res;
6424
6425 onError:
6426 Py_XDECREF(res);
6427 Py_XDECREF(errorHandler);
6428 Py_XDECREF(exc);
6429 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006430}
6431
Alexander Belopolsky40018472011-02-26 01:02:56 +00006432PyObject *
6433PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006434 Py_ssize_t size,
6435 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006436{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006437 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006438}
6439
Alexander Belopolsky40018472011-02-26 01:02:56 +00006440PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006441_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006442{
6443 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006444 PyErr_BadArgument();
6445 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006446 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006447 if (PyUnicode_READY(unicode) == -1)
6448 return NULL;
6449 /* Fast path: if it is a one-byte string, construct
6450 bytes object directly. */
6451 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6452 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6453 PyUnicode_GET_LENGTH(unicode));
6454 /* Non-Latin-1 characters present. Defer to above function to
6455 raise the exception. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006456 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006457 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006458 errors);
6459}
6460
6461PyObject*
6462PyUnicode_AsLatin1String(PyObject *unicode)
6463{
6464 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006465}
6466
6467/* --- 7-bit ASCII Codec -------------------------------------------------- */
6468
Alexander Belopolsky40018472011-02-26 01:02:56 +00006469PyObject *
6470PyUnicode_DecodeASCII(const char *s,
6471 Py_ssize_t size,
6472 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006473{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006474 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006475 PyUnicodeObject *v;
6476 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006477 Py_ssize_t startinpos;
6478 Py_ssize_t endinpos;
6479 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006480 const char *e;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006481 unsigned char* d;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006482 PyObject *errorHandler = NULL;
6483 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006484 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00006485
Guido van Rossumd57fd912000-03-10 22:53:23 +00006486 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006487 if (size == 1 && *(unsigned char*)s < 128)
6488 return PyUnicode_FromOrdinal(*(unsigned char*)s);
6489
6490 /* Fast path. Assume the input actually *is* ASCII, and allocate
6491 a single-block Unicode object with that assumption. If there is
6492 an error, drop the object and start over. */
6493 v = (PyUnicodeObject*)PyUnicode_New(size, 127);
6494 if (v == NULL)
6495 goto onError;
6496 d = PyUnicode_1BYTE_DATA(v);
6497 for (i = 0; i < size; i++) {
6498 unsigned char ch = ((unsigned char*)s)[i];
6499 if (ch < 128)
6500 d[i] = ch;
6501 else
6502 break;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006503 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006504 if (i == size)
6505 return (PyObject*)v;
6506 Py_DECREF(v); /* start over */
Tim Petersced69f82003-09-16 20:30:58 +00006507
Guido van Rossumd57fd912000-03-10 22:53:23 +00006508 v = _PyUnicode_New(size);
6509 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006510 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006511 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006512 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006513 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006514 e = s + size;
6515 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006516 register unsigned char c = (unsigned char)*s;
6517 if (c < 128) {
6518 *p++ = c;
6519 ++s;
6520 }
6521 else {
6522 startinpos = s-starts;
6523 endinpos = startinpos + 1;
6524 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
6525 if (unicode_decode_call_errorhandler(
6526 errors, &errorHandler,
6527 "ascii", "ordinal not in range(128)",
6528 &starts, &e, &startinpos, &endinpos, &exc, &s,
6529 &v, &outpos, &p))
6530 goto onError;
6531 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006532 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00006533 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Victor Stinnerfe226c02011-10-03 03:52:20 +02006534 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006535 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006536 Py_XDECREF(errorHandler);
6537 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02006538#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006539 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006540 Py_DECREF(v);
6541 return NULL;
6542 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006543#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00006544 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006545
Benjamin Peterson29060642009-01-31 22:14:21 +00006546 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006547 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006548 Py_XDECREF(errorHandler);
6549 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006550 return NULL;
6551}
6552
Alexander Belopolsky40018472011-02-26 01:02:56 +00006553PyObject *
6554PyUnicode_EncodeASCII(const Py_UNICODE *p,
6555 Py_ssize_t size,
6556 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006557{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006558 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006559}
6560
Alexander Belopolsky40018472011-02-26 01:02:56 +00006561PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006562_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006563{
6564 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006565 PyErr_BadArgument();
6566 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006567 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006568 if (PyUnicode_READY(unicode) == -1)
6569 return NULL;
6570 /* Fast path: if it is an ASCII-only string, construct bytes object
6571 directly. Else defer to above function to raise the exception. */
6572 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6573 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6574 PyUnicode_GET_LENGTH(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006575 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006576 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006577 errors);
6578}
6579
/* Public entry point: same as _PyUnicode_AsASCIIString() with `errors`
   set to NULL. */
PyObject *
PyUnicode_AsASCIIString(PyObject *unicode)
{
    return _PyUnicode_AsASCIIString(unicode, NULL);
}
6585
Victor Stinner99b95382011-07-04 14:23:54 +02006586#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006587
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006588/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006589
/* NEED_RETRY is defined when int is narrower than size_t.  The code
   guarded by it in PyUnicode_DecodeMBCSStateful() and
   PyUnicode_EncodeMBCS() then processes inputs longer than INT_MAX in
   chunks of at most INT_MAX units, looping via a "retry" label. */
#if SIZEOF_INT < SIZEOF_SIZE_T
#define NEED_RETRY
#endif
6593
6594/* XXX This code is limited to "true" double-byte encodings, as
6595 a) it assumes an incomplete character consists of a single byte, and
6596 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00006597 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006598
/* Return 1 if the byte at s[offset] should be treated as a DBCS lead
   byte, 0 otherwise.

   The byte must satisfy IsDBCSLeadByte().  In addition, the character
   preceding it (as located by CharPrev()) must either not exist, not
   start with a lead byte, or be exactly two bytes long. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *pos = s + offset;
    const char *before;

    if (!IsDBCSLeadByte(*pos))
        return 0;

    before = CharPrev(s, pos);
    if (before == pos)
        return 1;
    if (!IsDBCSLeadByte(*before))
        return 1;
    return pos - before == 2;
}
6610
6611/*
6612 * Decode MBCS string into unicode object. If 'final' is set, converts
6613 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
6614 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006615static int
6616decode_mbcs(PyUnicodeObject **v,
6617 const char *s, /* MBCS string */
6618 int size, /* sizeof MBCS string */
6619 int final,
6620 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006621{
6622 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00006623 Py_ssize_t n;
6624 DWORD usize;
6625 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006626
6627 assert(size >= 0);
6628
Victor Stinner554f3f02010-06-16 23:33:54 +00006629 /* check and handle 'errors' arg */
6630 if (errors==NULL || strcmp(errors, "strict")==0)
6631 flags = MB_ERR_INVALID_CHARS;
6632 else if (strcmp(errors, "ignore")==0)
6633 flags = 0;
6634 else {
6635 PyErr_Format(PyExc_ValueError,
6636 "mbcs encoding does not support errors='%s'",
6637 errors);
6638 return -1;
6639 }
6640
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006641 /* Skip trailing lead-byte unless 'final' is set */
6642 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006643 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006644
6645 /* First get the size of the result */
6646 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006647 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
6648 if (usize==0)
6649 goto mbcs_decode_error;
6650 } else
6651 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006652
6653 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006654 /* Create unicode object */
6655 *v = _PyUnicode_New(usize);
6656 if (*v == NULL)
6657 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006658 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006659 }
6660 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006661 /* Extend unicode object */
6662 n = PyUnicode_GET_SIZE(*v);
Victor Stinner2fd82272011-10-03 04:06:05 +02006663 if (PyUnicode_Resize((PyObject**)v, n + usize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006664 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006665 }
6666
6667 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00006668 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006669 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006670 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
6671 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00006672 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006673 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006674 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00006675
6676mbcs_decode_error:
6677 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
6678 we raise a UnicodeDecodeError - else it is a 'generic'
6679 windows error
6680 */
6681 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
6682 /* Ideally, we should get reason from FormatMessage - this
6683 is the Windows 2000 English version of the message
6684 */
6685 PyObject *exc = NULL;
6686 const char *reason = "No mapping for the Unicode character exists "
6687 "in the target multi-byte code page.";
6688 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
6689 if (exc != NULL) {
6690 PyCodec_StrictErrors(exc);
6691 Py_DECREF(exc);
6692 }
6693 } else {
6694 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6695 }
6696 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006697}
6698
Alexander Belopolsky40018472011-02-26 01:02:56 +00006699PyObject *
6700PyUnicode_DecodeMBCSStateful(const char *s,
6701 Py_ssize_t size,
6702 const char *errors,
6703 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006704{
6705 PyUnicodeObject *v = NULL;
6706 int done;
6707
6708 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006709 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006710
6711#ifdef NEED_RETRY
6712 retry:
6713 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006714 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006715 else
6716#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006717 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006718
6719 if (done < 0) {
6720 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006721 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006722 }
6723
6724 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006725 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006726
6727#ifdef NEED_RETRY
6728 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006729 s += done;
6730 size -= done;
6731 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006732 }
6733#endif
Victor Stinner17efeed2011-10-04 20:05:46 +02006734#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006735 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006736 Py_DECREF(v);
6737 return NULL;
6738 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006739#endif
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006740 return (PyObject *)v;
6741}
6742
/* Decode `size` bytes of MBCS data at `s`.  Equivalent to
   PyUnicode_DecodeMBCSStateful() with `consumed` set to NULL. */
PyObject *
PyUnicode_DecodeMBCS(const char *s,
                     Py_ssize_t size,
                     const char *errors)
{
    return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
}
6750
6751/*
6752 * Convert unicode into string object (MBCS).
6753 * Returns 0 if succeed, -1 otherwise.
6754 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006755static int
6756encode_mbcs(PyObject **repr,
6757 const Py_UNICODE *p, /* unicode */
6758 int size, /* size of unicode */
6759 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006760{
Victor Stinner554f3f02010-06-16 23:33:54 +00006761 BOOL usedDefaultChar = FALSE;
6762 BOOL *pusedDefaultChar;
6763 int mbcssize;
6764 Py_ssize_t n;
6765 PyObject *exc = NULL;
6766 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006767
6768 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006769
Victor Stinner554f3f02010-06-16 23:33:54 +00006770 /* check and handle 'errors' arg */
6771 if (errors==NULL || strcmp(errors, "strict")==0) {
6772 flags = WC_NO_BEST_FIT_CHARS;
6773 pusedDefaultChar = &usedDefaultChar;
6774 } else if (strcmp(errors, "replace")==0) {
6775 flags = 0;
6776 pusedDefaultChar = NULL;
6777 } else {
6778 PyErr_Format(PyExc_ValueError,
6779 "mbcs encoding does not support errors='%s'",
6780 errors);
6781 return -1;
6782 }
6783
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006784 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006785 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006786 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
6787 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00006788 if (mbcssize == 0) {
6789 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6790 return -1;
6791 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006792 /* If we used a default char, then we failed! */
6793 if (pusedDefaultChar && *pusedDefaultChar)
6794 goto mbcs_encode_error;
6795 } else {
6796 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006797 }
6798
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006799 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006800 /* Create string object */
6801 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
6802 if (*repr == NULL)
6803 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006804 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006805 }
6806 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006807 /* Extend string object */
6808 n = PyBytes_Size(*repr);
6809 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
6810 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006811 }
6812
6813 /* Do the conversion */
6814 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006815 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006816 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
6817 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006818 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6819 return -1;
6820 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006821 if (pusedDefaultChar && *pusedDefaultChar)
6822 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006823 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006824 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00006825
6826mbcs_encode_error:
6827 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
6828 Py_XDECREF(exc);
6829 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006830}
6831
Alexander Belopolsky40018472011-02-26 01:02:56 +00006832PyObject *
6833PyUnicode_EncodeMBCS(const Py_UNICODE *p,
6834 Py_ssize_t size,
6835 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006836{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006837 PyObject *repr = NULL;
6838 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00006839
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006840#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00006841 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006842 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006843 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006844 else
6845#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006846 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006847
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006848 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006849 Py_XDECREF(repr);
6850 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006851 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006852
6853#ifdef NEED_RETRY
6854 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006855 p += INT_MAX;
6856 size -= INT_MAX;
6857 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006858 }
6859#endif
6860
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006861 return repr;
6862}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006863
Alexander Belopolsky40018472011-02-26 01:02:56 +00006864PyObject *
6865PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006866{
6867 if (!PyUnicode_Check(unicode)) {
6868 PyErr_BadArgument();
6869 return NULL;
6870 }
6871 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006872 PyUnicode_GET_SIZE(unicode),
6873 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006874}
6875
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006876#undef NEED_RETRY
6877
Victor Stinner99b95382011-07-04 14:23:54 +02006878#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006879
Guido van Rossumd57fd912000-03-10 22:53:23 +00006880/* --- Character Mapping Codec -------------------------------------------- */
6881
Alexander Belopolsky40018472011-02-26 01:02:56 +00006882PyObject *
6883PyUnicode_DecodeCharmap(const char *s,
6884 Py_ssize_t size,
6885 PyObject *mapping,
6886 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006887{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006888 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006889 Py_ssize_t startinpos;
6890 Py_ssize_t endinpos;
6891 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006892 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006893 PyUnicodeObject *v;
6894 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006895 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006896 PyObject *errorHandler = NULL;
6897 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006898 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006899 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006900
Guido van Rossumd57fd912000-03-10 22:53:23 +00006901 /* Default to Latin-1 */
6902 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006903 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006904
6905 v = _PyUnicode_New(size);
6906 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006907 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006908 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006909 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006910 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006911 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006912 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006913 mapstring = PyUnicode_AS_UNICODE(mapping);
6914 maplen = PyUnicode_GET_SIZE(mapping);
6915 while (s < e) {
6916 unsigned char ch = *s;
6917 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006918
Benjamin Peterson29060642009-01-31 22:14:21 +00006919 if (ch < maplen)
6920 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006921
Benjamin Peterson29060642009-01-31 22:14:21 +00006922 if (x == 0xfffe) {
6923 /* undefined mapping */
6924 outpos = p-PyUnicode_AS_UNICODE(v);
6925 startinpos = s-starts;
6926 endinpos = startinpos+1;
6927 if (unicode_decode_call_errorhandler(
6928 errors, &errorHandler,
6929 "charmap", "character maps to <undefined>",
6930 &starts, &e, &startinpos, &endinpos, &exc, &s,
6931 &v, &outpos, &p)) {
6932 goto onError;
6933 }
6934 continue;
6935 }
6936 *p++ = x;
6937 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006938 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006939 }
6940 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006941 while (s < e) {
6942 unsigned char ch = *s;
6943 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006944
Benjamin Peterson29060642009-01-31 22:14:21 +00006945 /* Get mapping (char ordinal -> integer, Unicode char or None) */
6946 w = PyLong_FromLong((long)ch);
6947 if (w == NULL)
6948 goto onError;
6949 x = PyObject_GetItem(mapping, w);
6950 Py_DECREF(w);
6951 if (x == NULL) {
6952 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
6953 /* No mapping found means: mapping is undefined. */
6954 PyErr_Clear();
6955 x = Py_None;
6956 Py_INCREF(x);
6957 } else
6958 goto onError;
6959 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006960
Benjamin Peterson29060642009-01-31 22:14:21 +00006961 /* Apply mapping */
6962 if (PyLong_Check(x)) {
6963 long value = PyLong_AS_LONG(x);
6964 if (value < 0 || value > 65535) {
6965 PyErr_SetString(PyExc_TypeError,
6966 "character mapping must be in range(65536)");
6967 Py_DECREF(x);
6968 goto onError;
6969 }
6970 *p++ = (Py_UNICODE)value;
6971 }
6972 else if (x == Py_None) {
6973 /* undefined mapping */
6974 outpos = p-PyUnicode_AS_UNICODE(v);
6975 startinpos = s-starts;
6976 endinpos = startinpos+1;
6977 if (unicode_decode_call_errorhandler(
6978 errors, &errorHandler,
6979 "charmap", "character maps to <undefined>",
6980 &starts, &e, &startinpos, &endinpos, &exc, &s,
6981 &v, &outpos, &p)) {
6982 Py_DECREF(x);
6983 goto onError;
6984 }
6985 Py_DECREF(x);
6986 continue;
6987 }
6988 else if (PyUnicode_Check(x)) {
6989 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006990
Benjamin Peterson29060642009-01-31 22:14:21 +00006991 if (targetsize == 1)
6992 /* 1-1 mapping */
6993 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006994
Benjamin Peterson29060642009-01-31 22:14:21 +00006995 else if (targetsize > 1) {
6996 /* 1-n mapping */
6997 if (targetsize > extrachars) {
6998 /* resize first */
6999 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
7000 Py_ssize_t needed = (targetsize - extrachars) + \
7001 (targetsize << 2);
7002 extrachars += needed;
7003 /* XXX overflow detection missing */
Victor Stinnerfe226c02011-10-03 03:52:20 +02007004 if (PyUnicode_Resize((PyObject**)&v,
Benjamin Peterson29060642009-01-31 22:14:21 +00007005 PyUnicode_GET_SIZE(v) + needed) < 0) {
7006 Py_DECREF(x);
7007 goto onError;
7008 }
7009 p = PyUnicode_AS_UNICODE(v) + oldpos;
7010 }
7011 Py_UNICODE_COPY(p,
7012 PyUnicode_AS_UNICODE(x),
7013 targetsize);
7014 p += targetsize;
7015 extrachars -= targetsize;
7016 }
7017 /* 1-0 mapping: skip the character */
7018 }
7019 else {
7020 /* wrong return value */
7021 PyErr_SetString(PyExc_TypeError,
7022 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007023 Py_DECREF(x);
7024 goto onError;
7025 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007026 Py_DECREF(x);
7027 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007028 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007029 }
7030 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Victor Stinnerfe226c02011-10-03 03:52:20 +02007031 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007032 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007033 Py_XDECREF(errorHandler);
7034 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02007035#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02007036 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007037 Py_DECREF(v);
7038 return NULL;
7039 }
Victor Stinner17efeed2011-10-04 20:05:46 +02007040#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00007041 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00007042
Benjamin Peterson29060642009-01-31 22:14:21 +00007043 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007044 Py_XDECREF(errorHandler);
7045 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007046 Py_XDECREF(v);
7047 return NULL;
7048}
7049
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007050/* Charmap encoding: the lookup table */
7051
/* Three-level lookup table built by PyUnicode_BuildEncodingMap() and
   queried by encoding_map_lookup().  Instances are allocated with extra
   trailing space so that level23 can hold more than one byte. */
struct encoding_map {
    PyObject_HEAD
    /* Indexed by (c >> 11); 0xFF marks an unused entry, any other value
       selects a 16-byte block of level 2. */
    unsigned char level1[32];
    /* Number of level-2 blocks (16 bytes each) and of level-3 blocks
       (128 bytes each) stored in level23. */
    int count2, count3;
    /* 16*count2 bytes of level 2 followed by 128*count3 bytes of
       level 3; declared with one element, the real size is set at
       allocation time. */
    unsigned char level23[1];
};
7058
7059static PyObject*
7060encoding_map_size(PyObject *obj, PyObject* args)
7061{
7062 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007063 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007064 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007065}
7066
/* Method table referenced from EncodingMapType's tp_methods slot: a
   single no-argument "size" method, followed by the zero sentinel. */
static PyMethodDef encoding_map_methods[] = {
    {"size", encoding_map_size, METH_NOARGS,
     PyDoc_STR("Return the size (in bytes) of this object") },
    { 0 }
};
7072
/* tp_dealloc for EncodingMapType: release the object's memory with
   PyObject_FREE(), matching the PyObject_MALLOC() call in
   PyUnicode_BuildEncodingMap(). */
static void
encoding_map_dealloc(PyObject* o)
{
    PyObject_FREE(o);
}
7078
/* Type object for struct encoding_map instances.  Only tp_name,
   tp_basicsize, tp_dealloc, tp_flags and tp_methods are filled in; all
   other slots are left at 0.  tp_new is 0, and the only place in this
   section that creates instances is PyUnicode_BuildEncodingMap(), which
   allocates them directly. */
static PyTypeObject EncodingMapType = {
    PyVarObject_HEAD_INIT(NULL, 0)
    "EncodingMap",          /*tp_name*/
    sizeof(struct encoding_map),   /*tp_basicsize*/
    0,                      /*tp_itemsize*/
    /* methods */
    encoding_map_dealloc,   /*tp_dealloc*/
    0,                      /*tp_print*/
    0,                      /*tp_getattr*/
    0,                      /*tp_setattr*/
    0,                      /*tp_reserved*/
    0,                      /*tp_repr*/
    0,                      /*tp_as_number*/
    0,                      /*tp_as_sequence*/
    0,                      /*tp_as_mapping*/
    0,                      /*tp_hash*/
    0,                      /*tp_call*/
    0,                      /*tp_str*/
    0,                      /*tp_getattro*/
    0,                      /*tp_setattro*/
    0,                      /*tp_as_buffer*/
    Py_TPFLAGS_DEFAULT,     /*tp_flags*/
    0,                      /*tp_doc*/
    0,                      /*tp_traverse*/
    0,                      /*tp_clear*/
    0,                      /*tp_richcompare*/
    0,                      /*tp_weaklistoffset*/
    0,                      /*tp_iter*/
    0,                      /*tp_iternext*/
    encoding_map_methods,   /*tp_methods*/
    0,                      /*tp_members*/
    0,                      /*tp_getset*/
    0,                      /*tp_base*/
    0,                      /*tp_dict*/
    0,                      /*tp_descr_get*/
    0,                      /*tp_descr_set*/
    0,                      /*tp_dictoffset*/
    0,                      /*tp_init*/
    0,                      /*tp_alloc*/
    0,                      /*tp_new*/
    0,                      /*tp_free*/
    0,                      /*tp_is_gc*/
};
7122
7123PyObject*
7124PyUnicode_BuildEncodingMap(PyObject* string)
7125{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007126 PyObject *result;
7127 struct encoding_map *mresult;
7128 int i;
7129 int need_dict = 0;
7130 unsigned char level1[32];
7131 unsigned char level2[512];
7132 unsigned char *mlevel1, *mlevel2, *mlevel3;
7133 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007134 int kind;
7135 void *data;
7136 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007137
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007138 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007139 PyErr_BadArgument();
7140 return NULL;
7141 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007142 kind = PyUnicode_KIND(string);
7143 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007144 memset(level1, 0xFF, sizeof level1);
7145 memset(level2, 0xFF, sizeof level2);
7146
7147 /* If there isn't a one-to-one mapping of NULL to \0,
7148 or if there are non-BMP characters, we need to use
7149 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007150 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007151 need_dict = 1;
7152 for (i = 1; i < 256; i++) {
7153 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007154 ch = PyUnicode_READ(kind, data, i);
7155 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007156 need_dict = 1;
7157 break;
7158 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007159 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007160 /* unmapped character */
7161 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007162 l1 = ch >> 11;
7163 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007164 if (level1[l1] == 0xFF)
7165 level1[l1] = count2++;
7166 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007167 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007168 }
7169
7170 if (count2 >= 0xFF || count3 >= 0xFF)
7171 need_dict = 1;
7172
7173 if (need_dict) {
7174 PyObject *result = PyDict_New();
7175 PyObject *key, *value;
7176 if (!result)
7177 return NULL;
7178 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007179 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007180 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007181 if (!key || !value)
7182 goto failed1;
7183 if (PyDict_SetItem(result, key, value) == -1)
7184 goto failed1;
7185 Py_DECREF(key);
7186 Py_DECREF(value);
7187 }
7188 return result;
7189 failed1:
7190 Py_XDECREF(key);
7191 Py_XDECREF(value);
7192 Py_DECREF(result);
7193 return NULL;
7194 }
7195
7196 /* Create a three-level trie */
7197 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7198 16*count2 + 128*count3 - 1);
7199 if (!result)
7200 return PyErr_NoMemory();
7201 PyObject_Init(result, &EncodingMapType);
7202 mresult = (struct encoding_map*)result;
7203 mresult->count2 = count2;
7204 mresult->count3 = count3;
7205 mlevel1 = mresult->level1;
7206 mlevel2 = mresult->level23;
7207 mlevel3 = mresult->level23 + 16*count2;
7208 memcpy(mlevel1, level1, 32);
7209 memset(mlevel2, 0xFF, 16*count2);
7210 memset(mlevel3, 0, 128*count3);
7211 count3 = 0;
7212 for (i = 1; i < 256; i++) {
7213 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007214 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007215 /* unmapped character */
7216 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007217 o1 = PyUnicode_READ(kind, data, i)>>11;
7218 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007219 i2 = 16*mlevel1[o1] + o2;
7220 if (mlevel2[i2] == 0xFF)
7221 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007222 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007223 i3 = 128*mlevel2[i2] + o3;
7224 mlevel3[i3] = i;
7225 }
7226 return result;
7227}
7228
7229static int
7230encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
7231{
7232 struct encoding_map *map = (struct encoding_map*)mapping;
7233 int l1 = c>>11;
7234 int l2 = (c>>7) & 0xF;
7235 int l3 = c & 0x7F;
7236 int i;
7237
7238#ifdef Py_UNICODE_WIDE
7239 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007240 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007241 }
7242#endif
7243 if (c == 0)
7244 return 0;
7245 /* level 1*/
7246 i = map->level1[l1];
7247 if (i == 0xFF) {
7248 return -1;
7249 }
7250 /* level 2*/
7251 i = map->level23[16*i+l2];
7252 if (i == 0xFF) {
7253 return -1;
7254 }
7255 /* level 3 */
7256 i = map->level23[16*map->count2 + 128*i + l3];
7257 if (i == 0) {
7258 return -1;
7259 }
7260 return i;
7261}
7262
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007263/* Lookup the character ch in the mapping. If the character
7264 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007265 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007266static PyObject *
7267charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007268{
Christian Heimes217cfd12007-12-02 14:31:20 +00007269 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007270 PyObject *x;
7271
7272 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007273 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007274 x = PyObject_GetItem(mapping, w);
7275 Py_DECREF(w);
7276 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007277 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7278 /* No mapping found means: mapping is undefined. */
7279 PyErr_Clear();
7280 x = Py_None;
7281 Py_INCREF(x);
7282 return x;
7283 } else
7284 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007285 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007286 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007287 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007288 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007289 long value = PyLong_AS_LONG(x);
7290 if (value < 0 || value > 255) {
7291 PyErr_SetString(PyExc_TypeError,
7292 "character mapping must be in range(256)");
7293 Py_DECREF(x);
7294 return NULL;
7295 }
7296 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007297 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007298 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007299 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007300 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007301 /* wrong return value */
7302 PyErr_Format(PyExc_TypeError,
7303 "character mapping must return integer, bytes or None, not %.400s",
7304 x->ob_type->tp_name);
7305 Py_DECREF(x);
7306 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007307 }
7308}
7309
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007310static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00007311charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007312{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007313 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7314 /* exponentially overallocate to minimize reallocations */
7315 if (requiredsize < 2*outsize)
7316 requiredsize = 2*outsize;
7317 if (_PyBytes_Resize(outobj, requiredsize))
7318 return -1;
7319 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007320}
7321
/* Outcome of charmapencode_output():
     enc_SUCCESS   - the character was mapped and written to the output
     enc_FAILED    - the mapping has no entry for the character
     enc_EXCEPTION - the lookup or the output resize failed */
typedef enum charmapencode_result {
    enc_SUCCESS, enc_FAILED, enc_EXCEPTION
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007325/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00007326 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007327 space is available. Return a new reference to the object that
7328 was put in the output buffer, or Py_None, if the mapping was undefined
7329 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00007330 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007331static charmapencode_result
7332charmapencode_output(Py_UNICODE c, PyObject *mapping,
7333 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007334{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007335 PyObject *rep;
7336 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00007337 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007338
Christian Heimes90aa7642007-12-19 02:45:37 +00007339 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007340 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007341 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007342 if (res == -1)
7343 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00007344 if (outsize<requiredsize)
7345 if (charmapencode_resize(outobj, outpos, requiredsize))
7346 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00007347 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007348 outstart[(*outpos)++] = (char)res;
7349 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007350 }
7351
7352 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007353 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007354 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007355 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007356 Py_DECREF(rep);
7357 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007358 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007359 if (PyLong_Check(rep)) {
7360 Py_ssize_t requiredsize = *outpos+1;
7361 if (outsize<requiredsize)
7362 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7363 Py_DECREF(rep);
7364 return enc_EXCEPTION;
7365 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007366 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007367 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007368 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007369 else {
7370 const char *repchars = PyBytes_AS_STRING(rep);
7371 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
7372 Py_ssize_t requiredsize = *outpos+repsize;
7373 if (outsize<requiredsize)
7374 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7375 Py_DECREF(rep);
7376 return enc_EXCEPTION;
7377 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007378 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007379 memcpy(outstart + *outpos, repchars, repsize);
7380 *outpos += repsize;
7381 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007382 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007383 Py_DECREF(rep);
7384 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007385}
7386
7387/* handle an error in PyUnicode_EncodeCharmap
7388 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007389static int
7390charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00007391 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007392 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00007393 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00007394 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007395{
7396 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007397 Py_ssize_t repsize;
7398 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007399 Py_UNICODE *uni2;
7400 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007401 Py_ssize_t collstartpos = *inpos;
7402 Py_ssize_t collendpos = *inpos+1;
7403 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007404 char *encoding = "charmap";
7405 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007406 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007407
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007408 /* find all unencodable characters */
7409 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007410 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00007411 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007412 int res = encoding_map_lookup(p[collendpos], mapping);
7413 if (res != -1)
7414 break;
7415 ++collendpos;
7416 continue;
7417 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007418
Benjamin Peterson29060642009-01-31 22:14:21 +00007419 rep = charmapencode_lookup(p[collendpos], mapping);
7420 if (rep==NULL)
7421 return -1;
7422 else if (rep!=Py_None) {
7423 Py_DECREF(rep);
7424 break;
7425 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007426 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00007427 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007428 }
7429 /* cache callback name lookup
7430 * (if not done yet, i.e. it's the first error) */
7431 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007432 if ((errors==NULL) || (!strcmp(errors, "strict")))
7433 *known_errorHandler = 1;
7434 else if (!strcmp(errors, "replace"))
7435 *known_errorHandler = 2;
7436 else if (!strcmp(errors, "ignore"))
7437 *known_errorHandler = 3;
7438 else if (!strcmp(errors, "xmlcharrefreplace"))
7439 *known_errorHandler = 4;
7440 else
7441 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007442 }
7443 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007444 case 1: /* strict */
7445 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7446 return -1;
7447 case 2: /* replace */
7448 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007449 x = charmapencode_output('?', mapping, res, respos);
7450 if (x==enc_EXCEPTION) {
7451 return -1;
7452 }
7453 else if (x==enc_FAILED) {
7454 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7455 return -1;
7456 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007457 }
7458 /* fall through */
7459 case 3: /* ignore */
7460 *inpos = collendpos;
7461 break;
7462 case 4: /* xmlcharrefreplace */
7463 /* generate replacement (temporarily (mis)uses p) */
7464 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007465 char buffer[2+29+1+1];
7466 char *cp;
7467 sprintf(buffer, "&#%d;", (int)p[collpos]);
7468 for (cp = buffer; *cp; ++cp) {
7469 x = charmapencode_output(*cp, mapping, res, respos);
7470 if (x==enc_EXCEPTION)
7471 return -1;
7472 else if (x==enc_FAILED) {
7473 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7474 return -1;
7475 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007476 }
7477 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007478 *inpos = collendpos;
7479 break;
7480 default:
7481 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00007482 encoding, reason, p, size, exceptionObject,
7483 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007484 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007485 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00007486 if (PyBytes_Check(repunicode)) {
7487 /* Directly copy bytes result to output. */
7488 Py_ssize_t outsize = PyBytes_Size(*res);
7489 Py_ssize_t requiredsize;
7490 repsize = PyBytes_Size(repunicode);
7491 requiredsize = *respos + repsize;
7492 if (requiredsize > outsize)
7493 /* Make room for all additional bytes. */
7494 if (charmapencode_resize(res, respos, requiredsize)) {
7495 Py_DECREF(repunicode);
7496 return -1;
7497 }
7498 memcpy(PyBytes_AsString(*res) + *respos,
7499 PyBytes_AsString(repunicode), repsize);
7500 *respos += repsize;
7501 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007502 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00007503 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007504 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007505 /* generate replacement */
7506 repsize = PyUnicode_GET_SIZE(repunicode);
7507 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007508 x = charmapencode_output(*uni2, mapping, res, respos);
7509 if (x==enc_EXCEPTION) {
7510 return -1;
7511 }
7512 else if (x==enc_FAILED) {
7513 Py_DECREF(repunicode);
7514 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7515 return -1;
7516 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007517 }
7518 *inpos = newpos;
7519 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007520 }
7521 return 0;
7522}
7523
Alexander Belopolsky40018472011-02-26 01:02:56 +00007524PyObject *
7525PyUnicode_EncodeCharmap(const Py_UNICODE *p,
7526 Py_ssize_t size,
7527 PyObject *mapping,
7528 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007529{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007530 /* output object */
7531 PyObject *res = NULL;
7532 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007533 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007534 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007535 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007536 PyObject *errorHandler = NULL;
7537 PyObject *exc = NULL;
7538 /* the following variable is used for caching string comparisons
7539 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7540 * 3=ignore, 4=xmlcharrefreplace */
7541 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007542
7543 /* Default to Latin-1 */
7544 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007545 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007546
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007547 /* allocate enough for a simple encoding without
7548 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00007549 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007550 if (res == NULL)
7551 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00007552 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007553 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007554
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007555 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007556 /* try to encode it */
7557 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
7558 if (x==enc_EXCEPTION) /* error */
7559 goto onError;
7560 if (x==enc_FAILED) { /* unencodable character */
7561 if (charmap_encoding_error(p, size, &inpos, mapping,
7562 &exc,
7563 &known_errorHandler, &errorHandler, errors,
7564 &res, &respos)) {
7565 goto onError;
7566 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007567 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007568 else
7569 /* done with this character => adjust input position */
7570 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007571 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007572
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007573 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00007574 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007575 if (_PyBytes_Resize(&res, respos) < 0)
7576 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00007577
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007578 Py_XDECREF(exc);
7579 Py_XDECREF(errorHandler);
7580 return res;
7581
Benjamin Peterson29060642009-01-31 22:14:21 +00007582 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007583 Py_XDECREF(res);
7584 Py_XDECREF(exc);
7585 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007586 return NULL;
7587}
7588
Alexander Belopolsky40018472011-02-26 01:02:56 +00007589PyObject *
7590PyUnicode_AsCharmapString(PyObject *unicode,
7591 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007592{
7593 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007594 PyErr_BadArgument();
7595 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007596 }
7597 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00007598 PyUnicode_GET_SIZE(unicode),
7599 mapping,
7600 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007601}
7602
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007603/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007604static void
7605make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007606 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007607 Py_ssize_t startpos, Py_ssize_t endpos,
7608 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007609{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007610 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007611 *exceptionObject = _PyUnicodeTranslateError_Create(
7612 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007613 }
7614 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007615 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
7616 goto onError;
7617 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
7618 goto onError;
7619 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
7620 goto onError;
7621 return;
7622 onError:
7623 Py_DECREF(*exceptionObject);
7624 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007625 }
7626}
7627
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007628/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007629static void
7630raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007631 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007632 Py_ssize_t startpos, Py_ssize_t endpos,
7633 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007634{
7635 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007636 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007637 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007638 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007639}
7640
7641/* error handling callback helper:
7642 build arguments, call the callback and check the arguments,
7643 put the result into newpos and return the replacement string, which
7644 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007645static PyObject *
7646unicode_translate_call_errorhandler(const char *errors,
7647 PyObject **errorHandler,
7648 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007649 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007650 Py_ssize_t startpos, Py_ssize_t endpos,
7651 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007652{
Benjamin Peterson142957c2008-07-04 19:55:29 +00007653 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007654
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007655 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007656 PyObject *restuple;
7657 PyObject *resunicode;
7658
7659 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007660 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007661 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007662 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007663 }
7664
7665 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007666 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007667 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007668 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007669
7670 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00007671 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007672 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007673 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007674 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00007675 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00007676 Py_DECREF(restuple);
7677 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007678 }
7679 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00007680 &resunicode, &i_newpos)) {
7681 Py_DECREF(restuple);
7682 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007683 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007684 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007685 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007686 else
7687 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007688 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007689 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7690 Py_DECREF(restuple);
7691 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00007692 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007693 Py_INCREF(resunicode);
7694 Py_DECREF(restuple);
7695 return resunicode;
7696}
7697
7698/* Lookup the character ch in the mapping and put the result in result,
7699 which must be decrefed by the caller.
7700 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007701static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007702charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007703{
Christian Heimes217cfd12007-12-02 14:31:20 +00007704 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007705 PyObject *x;
7706
7707 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007708 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007709 x = PyObject_GetItem(mapping, w);
7710 Py_DECREF(w);
7711 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007712 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7713 /* No mapping found means: use 1:1 mapping. */
7714 PyErr_Clear();
7715 *result = NULL;
7716 return 0;
7717 } else
7718 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007719 }
7720 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007721 *result = x;
7722 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007723 }
Christian Heimes217cfd12007-12-02 14:31:20 +00007724 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007725 long value = PyLong_AS_LONG(x);
7726 long max = PyUnicode_GetMax();
7727 if (value < 0 || value > max) {
7728 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00007729 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00007730 Py_DECREF(x);
7731 return -1;
7732 }
7733 *result = x;
7734 return 0;
7735 }
7736 else if (PyUnicode_Check(x)) {
7737 *result = x;
7738 return 0;
7739 }
7740 else {
7741 /* wrong return value */
7742 PyErr_SetString(PyExc_TypeError,
7743 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007744 Py_DECREF(x);
7745 return -1;
7746 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007747}
7748/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00007749 if not reallocate and adjust various state variables.
7750 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007751static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007752charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00007753 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007754{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007755 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00007756 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007757 /* exponentially overallocate to minimize reallocations */
7758 if (requiredsize < 2 * oldsize)
7759 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007760 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
7761 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007762 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007763 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007764 }
7765 return 0;
7766}
7767/* lookup the character, put the result in the output string and adjust
7768 various state variables. Return a new reference to the object that
7769 was put in the output buffer in *result, or Py_None, if the mapping was
7770 undefined (in which case no character was written).
7771 The called must decref result.
7772 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007773static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007774charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
7775 PyObject *mapping, Py_UCS4 **output,
7776 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007777 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007778{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007779 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
7780 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00007781 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007782 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007783 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007784 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007785 }
7786 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007787 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00007788 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007789 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007790 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007791 }
7792 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007793 Py_ssize_t repsize;
7794 if (PyUnicode_READY(*res) == -1)
7795 return -1;
7796 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00007797 if (repsize==1) {
7798 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007799 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00007800 }
7801 else if (repsize!=0) {
7802 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007803 Py_ssize_t requiredsize = *opos +
7804 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00007805 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007806 Py_ssize_t i;
7807 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007808 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007809 for(i = 0; i < repsize; i++)
7810 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00007811 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007812 }
7813 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007814 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007815 return 0;
7816}
7817
Alexander Belopolsky40018472011-02-26 01:02:56 +00007818PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007819_PyUnicode_TranslateCharmap(PyObject *input,
7820 PyObject *mapping,
7821 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007822{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007823 /* input object */
7824 char *idata;
7825 Py_ssize_t size, i;
7826 int kind;
7827 /* output buffer */
7828 Py_UCS4 *output = NULL;
7829 Py_ssize_t osize;
7830 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007831 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007832 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007833 char *reason = "character maps to <undefined>";
7834 PyObject *errorHandler = NULL;
7835 PyObject *exc = NULL;
7836 /* the following variable is used for caching string comparisons
7837 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7838 * 3=ignore, 4=xmlcharrefreplace */
7839 int known_errorHandler = -1;
7840
Guido van Rossumd57fd912000-03-10 22:53:23 +00007841 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007842 PyErr_BadArgument();
7843 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007844 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007845
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007846 if (PyUnicode_READY(input) == -1)
7847 return NULL;
7848 idata = (char*)PyUnicode_DATA(input);
7849 kind = PyUnicode_KIND(input);
7850 size = PyUnicode_GET_LENGTH(input);
7851 i = 0;
7852
7853 if (size == 0) {
7854 Py_INCREF(input);
7855 return input;
7856 }
7857
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007858 /* allocate enough for a simple 1:1 translation without
7859 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007860 osize = size;
7861 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
7862 opos = 0;
7863 if (output == NULL) {
7864 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00007865 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007866 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007867
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007868 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007869 /* try to encode it */
7870 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007871 if (charmaptranslate_output(input, i, mapping,
7872 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007873 Py_XDECREF(x);
7874 goto onError;
7875 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007876 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00007877 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007878 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00007879 else { /* untranslatable character */
7880 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
7881 Py_ssize_t repsize;
7882 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007883 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00007884 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007885 Py_ssize_t collstart = i;
7886 Py_ssize_t collend = i+1;
7887 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007888
Benjamin Peterson29060642009-01-31 22:14:21 +00007889 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007890 while (collend < size) {
7891 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007892 goto onError;
7893 Py_XDECREF(x);
7894 if (x!=Py_None)
7895 break;
7896 ++collend;
7897 }
7898 /* cache callback name lookup
7899 * (if not done yet, i.e. it's the first error) */
7900 if (known_errorHandler==-1) {
7901 if ((errors==NULL) || (!strcmp(errors, "strict")))
7902 known_errorHandler = 1;
7903 else if (!strcmp(errors, "replace"))
7904 known_errorHandler = 2;
7905 else if (!strcmp(errors, "ignore"))
7906 known_errorHandler = 3;
7907 else if (!strcmp(errors, "xmlcharrefreplace"))
7908 known_errorHandler = 4;
7909 else
7910 known_errorHandler = 0;
7911 }
7912 switch (known_errorHandler) {
7913 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007914 raise_translate_exception(&exc, input, collstart,
7915 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007916 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007917 case 2: /* replace */
7918 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007919 for (coll = collstart; coll<collend; coll++)
7920 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00007921 /* fall through */
7922 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007923 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007924 break;
7925 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007926 /* generate replacement (temporarily (mis)uses i) */
7927 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007928 char buffer[2+29+1+1];
7929 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007930 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
7931 if (charmaptranslate_makespace(&output, &osize,
7932 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00007933 goto onError;
7934 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007935 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00007936 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007937 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007938 break;
7939 default:
7940 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007941 reason, input, &exc,
7942 collstart, collend, &newpos);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02007943 if (repunicode == NULL || _PyUnicode_READY_REPLACE(&repunicode))
Benjamin Peterson29060642009-01-31 22:14:21 +00007944 goto onError;
7945 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007946 repsize = PyUnicode_GET_LENGTH(repunicode);
7947 if (charmaptranslate_makespace(&output, &osize,
7948 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007949 Py_DECREF(repunicode);
7950 goto onError;
7951 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007952 for (uni2 = 0; repsize-->0; ++uni2)
7953 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
7954 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00007955 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007956 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007957 }
7958 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007959 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
7960 if (!res)
7961 goto onError;
7962 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007963 Py_XDECREF(exc);
7964 Py_XDECREF(errorHandler);
7965 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007966
Benjamin Peterson29060642009-01-31 22:14:21 +00007967 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007968 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007969 Py_XDECREF(exc);
7970 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007971 return NULL;
7972}
7973
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007974/* Deprecated. Use PyUnicode_Translate instead. */
7975PyObject *
7976PyUnicode_TranslateCharmap(const Py_UNICODE *p,
7977 Py_ssize_t size,
7978 PyObject *mapping,
7979 const char *errors)
7980{
7981 PyObject *unicode = PyUnicode_FromUnicode(p, size);
7982 if (!unicode)
7983 return NULL;
7984 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
7985}
7986
Alexander Belopolsky40018472011-02-26 01:02:56 +00007987PyObject *
7988PyUnicode_Translate(PyObject *str,
7989 PyObject *mapping,
7990 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007991{
7992 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007993
Guido van Rossumd57fd912000-03-10 22:53:23 +00007994 str = PyUnicode_FromObject(str);
7995 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007996 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007997 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007998 Py_DECREF(str);
7999 return result;
Tim Petersced69f82003-09-16 20:30:58 +00008000
Benjamin Peterson29060642009-01-31 22:14:21 +00008001 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008002 Py_XDECREF(str);
8003 return NULL;
8004}
Tim Petersced69f82003-09-16 20:30:58 +00008005
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008006static Py_UCS4
8007fix_decimal_and_space_to_ascii(PyUnicodeObject *self)
8008{
8009 /* No need to call PyUnicode_READY(self) because this function is only
8010 called as a callback from fixup() which does it already. */
8011 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8012 const int kind = PyUnicode_KIND(self);
8013 void *data = PyUnicode_DATA(self);
8014 Py_UCS4 maxchar = 0, ch, fixed;
8015 Py_ssize_t i;
8016
8017 for (i = 0; i < len; ++i) {
8018 ch = PyUnicode_READ(kind, data, i);
8019 fixed = 0;
8020 if (ch > 127) {
8021 if (Py_UNICODE_ISSPACE(ch))
8022 fixed = ' ';
8023 else {
8024 const int decimal = Py_UNICODE_TODECIMAL(ch);
8025 if (decimal >= 0)
8026 fixed = '0' + decimal;
8027 }
8028 if (fixed != 0) {
8029 if (fixed > maxchar)
8030 maxchar = fixed;
8031 PyUnicode_WRITE(kind, data, i, fixed);
8032 }
8033 else if (ch > maxchar)
8034 maxchar = ch;
8035 }
8036 else if (ch > maxchar)
8037 maxchar = ch;
8038 }
8039
8040 return maxchar;
8041}
8042
8043PyObject *
8044_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8045{
8046 if (!PyUnicode_Check(unicode)) {
8047 PyErr_BadInternalCall();
8048 return NULL;
8049 }
8050 if (PyUnicode_READY(unicode) == -1)
8051 return NULL;
8052 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8053 /* If the string is already ASCII, just return the same string */
8054 Py_INCREF(unicode);
8055 return unicode;
8056 }
8057 return fixup((PyUnicodeObject *)unicode, fix_decimal_and_space_to_ascii);
8058}
8059
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008060PyObject *
8061PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8062 Py_ssize_t length)
8063{
8064 PyObject *result;
8065 Py_UNICODE *p; /* write pointer into result */
8066 Py_ssize_t i;
8067 /* Copy to a new string */
8068 result = (PyObject *)_PyUnicode_New(length);
8069 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
8070 if (result == NULL)
8071 return result;
8072 p = PyUnicode_AS_UNICODE(result);
8073 /* Iterate over code points */
8074 for (i = 0; i < length; i++) {
8075 Py_UNICODE ch =s[i];
8076 if (ch > 127) {
8077 int decimal = Py_UNICODE_TODECIMAL(ch);
8078 if (decimal >= 0)
8079 p[i] = '0' + decimal;
8080 }
8081 }
Victor Stinner17efeed2011-10-04 20:05:46 +02008082#ifndef DONT_MAKE_RESULT_READY
8083 if (_PyUnicode_READY_REPLACE(&result)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008084 Py_DECREF(result);
8085 return NULL;
8086 }
Victor Stinner17efeed2011-10-04 20:05:46 +02008087#endif
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008088 return result;
8089}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008090/* --- Decimal Encoder ---------------------------------------------------- */
8091
Alexander Belopolsky40018472011-02-26 01:02:56 +00008092int
8093PyUnicode_EncodeDecimal(Py_UNICODE *s,
8094 Py_ssize_t length,
8095 char *output,
8096 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008097{
8098 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008099 PyObject *errorHandler = NULL;
8100 PyObject *exc = NULL;
8101 const char *encoding = "decimal";
8102 const char *reason = "invalid decimal Unicode string";
8103 /* the following variable is used for caching string comparisons
8104 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
8105 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008106
8107 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008108 PyErr_BadArgument();
8109 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008110 }
8111
8112 p = s;
8113 end = s + length;
8114 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008115 register Py_UNICODE ch = *p;
8116 int decimal;
8117 PyObject *repunicode;
8118 Py_ssize_t repsize;
8119 Py_ssize_t newpos;
8120 Py_UNICODE *uni2;
8121 Py_UNICODE *collstart;
8122 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00008123
Benjamin Peterson29060642009-01-31 22:14:21 +00008124 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008125 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00008126 ++p;
8127 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008128 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008129 decimal = Py_UNICODE_TODECIMAL(ch);
8130 if (decimal >= 0) {
8131 *output++ = '0' + decimal;
8132 ++p;
8133 continue;
8134 }
8135 if (0 < ch && ch < 256) {
8136 *output++ = (char)ch;
8137 ++p;
8138 continue;
8139 }
8140 /* All other characters are considered unencodable */
8141 collstart = p;
8142 collend = p+1;
8143 while (collend < end) {
8144 if ((0 < *collend && *collend < 256) ||
8145 !Py_UNICODE_ISSPACE(*collend) ||
8146 Py_UNICODE_TODECIMAL(*collend))
8147 break;
8148 }
8149 /* cache callback name lookup
8150 * (if not done yet, i.e. it's the first error) */
8151 if (known_errorHandler==-1) {
8152 if ((errors==NULL) || (!strcmp(errors, "strict")))
8153 known_errorHandler = 1;
8154 else if (!strcmp(errors, "replace"))
8155 known_errorHandler = 2;
8156 else if (!strcmp(errors, "ignore"))
8157 known_errorHandler = 3;
8158 else if (!strcmp(errors, "xmlcharrefreplace"))
8159 known_errorHandler = 4;
8160 else
8161 known_errorHandler = 0;
8162 }
8163 switch (known_errorHandler) {
8164 case 1: /* strict */
8165 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
8166 goto onError;
8167 case 2: /* replace */
8168 for (p = collstart; p < collend; ++p)
8169 *output++ = '?';
8170 /* fall through */
8171 case 3: /* ignore */
8172 p = collend;
8173 break;
8174 case 4: /* xmlcharrefreplace */
8175 /* generate replacement (temporarily (mis)uses p) */
8176 for (p = collstart; p < collend; ++p)
8177 output += sprintf(output, "&#%d;", (int)*p);
8178 p = collend;
8179 break;
8180 default:
8181 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
8182 encoding, reason, s, length, &exc,
8183 collstart-s, collend-s, &newpos);
8184 if (repunicode == NULL)
8185 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008186 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00008187 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008188 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
8189 Py_DECREF(repunicode);
8190 goto onError;
8191 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008192 /* generate replacement */
8193 repsize = PyUnicode_GET_SIZE(repunicode);
8194 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
8195 Py_UNICODE ch = *uni2;
8196 if (Py_UNICODE_ISSPACE(ch))
8197 *output++ = ' ';
8198 else {
8199 decimal = Py_UNICODE_TODECIMAL(ch);
8200 if (decimal >= 0)
8201 *output++ = '0' + decimal;
8202 else if (0 < ch && ch < 256)
8203 *output++ = (char)ch;
8204 else {
8205 Py_DECREF(repunicode);
8206 raise_encode_exception(&exc, encoding,
8207 s, length, collstart-s, collend-s, reason);
8208 goto onError;
8209 }
8210 }
8211 }
8212 p = s + newpos;
8213 Py_DECREF(repunicode);
8214 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00008215 }
8216 /* 0-terminate the output string */
8217 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008218 Py_XDECREF(exc);
8219 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008220 return 0;
8221
Benjamin Peterson29060642009-01-31 22:14:21 +00008222 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008223 Py_XDECREF(exc);
8224 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008225 return -1;
8226}
8227
Guido van Rossumd57fd912000-03-10 22:53:23 +00008228/* --- Helpers ------------------------------------------------------------ */
8229
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008230#include "stringlib/ucs1lib.h"
8231#include "stringlib/fastsearch.h"
8232#include "stringlib/partition.h"
8233#include "stringlib/split.h"
8234#include "stringlib/count.h"
8235#include "stringlib/find.h"
8236#include "stringlib/localeutil.h"
8237#include "stringlib/undef.h"
8238
8239#include "stringlib/ucs2lib.h"
8240#include "stringlib/fastsearch.h"
8241#include "stringlib/partition.h"
8242#include "stringlib/split.h"
8243#include "stringlib/count.h"
8244#include "stringlib/find.h"
8245#include "stringlib/localeutil.h"
8246#include "stringlib/undef.h"
8247
8248#include "stringlib/ucs4lib.h"
8249#include "stringlib/fastsearch.h"
8250#include "stringlib/partition.h"
8251#include "stringlib/split.h"
8252#include "stringlib/count.h"
8253#include "stringlib/find.h"
8254#include "stringlib/localeutil.h"
8255#include "stringlib/undef.h"
8256
8257static Py_ssize_t
8258any_find_slice(Py_ssize_t Py_LOCAL_CALLBACK(ucs1)(const Py_UCS1*, Py_ssize_t,
8259 const Py_UCS1*, Py_ssize_t,
8260 Py_ssize_t, Py_ssize_t),
8261 Py_ssize_t Py_LOCAL_CALLBACK(ucs2)(const Py_UCS2*, Py_ssize_t,
8262 const Py_UCS2*, Py_ssize_t,
8263 Py_ssize_t, Py_ssize_t),
8264 Py_ssize_t Py_LOCAL_CALLBACK(ucs4)(const Py_UCS4*, Py_ssize_t,
8265 const Py_UCS4*, Py_ssize_t,
8266 Py_ssize_t, Py_ssize_t),
8267 PyObject* s1, PyObject* s2,
8268 Py_ssize_t start,
8269 Py_ssize_t end)
8270{
8271 int kind1, kind2, kind;
8272 void *buf1, *buf2;
8273 Py_ssize_t len1, len2, result;
8274
8275 kind1 = PyUnicode_KIND(s1);
8276 kind2 = PyUnicode_KIND(s2);
8277 kind = kind1 > kind2 ? kind1 : kind2;
8278 buf1 = PyUnicode_DATA(s1);
8279 buf2 = PyUnicode_DATA(s2);
8280 if (kind1 != kind)
8281 buf1 = _PyUnicode_AsKind(s1, kind);
8282 if (!buf1)
8283 return -2;
8284 if (kind2 != kind)
8285 buf2 = _PyUnicode_AsKind(s2, kind);
8286 if (!buf2) {
8287 if (kind1 != kind) PyMem_Free(buf1);
8288 return -2;
8289 }
8290 len1 = PyUnicode_GET_LENGTH(s1);
8291 len2 = PyUnicode_GET_LENGTH(s2);
8292
8293 switch(kind) {
8294 case PyUnicode_1BYTE_KIND:
8295 result = ucs1(buf1, len1, buf2, len2, start, end);
8296 break;
8297 case PyUnicode_2BYTE_KIND:
8298 result = ucs2(buf1, len1, buf2, len2, start, end);
8299 break;
8300 case PyUnicode_4BYTE_KIND:
8301 result = ucs4(buf1, len1, buf2, len2, start, end);
8302 break;
8303 default:
8304 assert(0); result = -2;
8305 }
8306
8307 if (kind1 != kind)
8308 PyMem_Free(buf1);
8309 if (kind2 != kind)
8310 PyMem_Free(buf2);
8311
8312 return result;
8313}
8314
8315Py_ssize_t
8316_PyUnicode_InsertThousandsGrouping(int kind, void *data,
8317 Py_ssize_t n_buffer,
8318 void *digits, Py_ssize_t n_digits,
8319 Py_ssize_t min_width,
8320 const char *grouping,
8321 const char *thousands_sep)
8322{
8323 switch(kind) {
8324 case PyUnicode_1BYTE_KIND:
8325 return _PyUnicode_ucs1_InsertThousandsGrouping(
8326 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8327 min_width, grouping, thousands_sep);
8328 case PyUnicode_2BYTE_KIND:
8329 return _PyUnicode_ucs2_InsertThousandsGrouping(
8330 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
8331 min_width, grouping, thousands_sep);
8332 case PyUnicode_4BYTE_KIND:
8333 return _PyUnicode_ucs4_InsertThousandsGrouping(
8334 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
8335 min_width, grouping, thousands_sep);
8336 }
8337 assert(0);
8338 return -1;
8339}
8340
8341
Eric Smith8c663262007-08-25 02:26:07 +00008342#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00008343#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008344
Thomas Wouters477c8d52006-05-27 19:21:47 +00008345#include "stringlib/count.h"
8346#include "stringlib/find.h"
Eric Smith5807c412008-05-11 21:00:57 +00008347
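/* Helper macro to fix up start/end slice values in place: end is clamped
   to len, negative start/end values are taken relative to len and clamped
   to 0.  It expands to bare if statements and evaluates its arguments
   several times, so use it only as a complete statement and only with
   side-effect-free arguments (start and end must be assignable). */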
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008349#define ADJUST_INDICES(start, end, len) \
8350 if (end > len) \
8351 end = len; \
8352 else if (end < 0) { \
8353 end += len; \
8354 if (end < 0) \
8355 end = 0; \
8356 } \
8357 if (start < 0) { \
8358 start += len; \
8359 if (start < 0) \
8360 start = 0; \
8361 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00008362
Alexander Belopolsky40018472011-02-26 01:02:56 +00008363Py_ssize_t
8364PyUnicode_Count(PyObject *str,
8365 PyObject *substr,
8366 Py_ssize_t start,
8367 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008368{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008369 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008370 PyUnicodeObject* str_obj;
8371 PyUnicodeObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008372 int kind1, kind2, kind;
8373 void *buf1 = NULL, *buf2 = NULL;
8374 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00008375
Thomas Wouters477c8d52006-05-27 19:21:47 +00008376 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008377 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008378 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008379 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
Victor Stinnere9a29352011-10-01 02:14:59 +02008380 if (!sub_obj || PyUnicode_READY(sub_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008381 Py_DECREF(str_obj);
8382 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008383 }
Tim Petersced69f82003-09-16 20:30:58 +00008384
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008385 kind1 = PyUnicode_KIND(str_obj);
8386 kind2 = PyUnicode_KIND(sub_obj);
8387 kind = kind1 > kind2 ? kind1 : kind2;
8388 buf1 = PyUnicode_DATA(str_obj);
8389 if (kind1 != kind)
8390 buf1 = _PyUnicode_AsKind((PyObject*)str_obj, kind);
8391 if (!buf1)
8392 goto onError;
8393 buf2 = PyUnicode_DATA(sub_obj);
8394 if (kind2 != kind)
8395 buf2 = _PyUnicode_AsKind((PyObject*)sub_obj, kind);
8396 if (!buf2)
8397 goto onError;
8398 len1 = PyUnicode_GET_LENGTH(str_obj);
8399 len2 = PyUnicode_GET_LENGTH(sub_obj);
8400
8401 ADJUST_INDICES(start, end, len1);
8402 switch(kind) {
8403 case PyUnicode_1BYTE_KIND:
8404 result = ucs1lib_count(
8405 ((Py_UCS1*)buf1) + start, end - start,
8406 buf2, len2, PY_SSIZE_T_MAX
8407 );
8408 break;
8409 case PyUnicode_2BYTE_KIND:
8410 result = ucs2lib_count(
8411 ((Py_UCS2*)buf1) + start, end - start,
8412 buf2, len2, PY_SSIZE_T_MAX
8413 );
8414 break;
8415 case PyUnicode_4BYTE_KIND:
8416 result = ucs4lib_count(
8417 ((Py_UCS4*)buf1) + start, end - start,
8418 buf2, len2, PY_SSIZE_T_MAX
8419 );
8420 break;
8421 default:
8422 assert(0); result = 0;
8423 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00008424
8425 Py_DECREF(sub_obj);
8426 Py_DECREF(str_obj);
8427
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008428 if (kind1 != kind)
8429 PyMem_Free(buf1);
8430 if (kind2 != kind)
8431 PyMem_Free(buf2);
8432
Guido van Rossumd57fd912000-03-10 22:53:23 +00008433 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008434 onError:
8435 Py_DECREF(sub_obj);
8436 Py_DECREF(str_obj);
8437 if (kind1 != kind && buf1)
8438 PyMem_Free(buf1);
8439 if (kind2 != kind && buf2)
8440 PyMem_Free(buf2);
8441 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008442}
8443
Alexander Belopolsky40018472011-02-26 01:02:56 +00008444Py_ssize_t
8445PyUnicode_Find(PyObject *str,
8446 PyObject *sub,
8447 Py_ssize_t start,
8448 Py_ssize_t end,
8449 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008450{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008451 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008452
Guido van Rossumd57fd912000-03-10 22:53:23 +00008453 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008454 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008455 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008456 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008457 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008458 Py_DECREF(str);
8459 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008460 }
Tim Petersced69f82003-09-16 20:30:58 +00008461
Thomas Wouters477c8d52006-05-27 19:21:47 +00008462 if (direction > 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008463 result = any_find_slice(
8464 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
8465 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008466 );
8467 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008468 result = any_find_slice(
8469 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
8470 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008471 );
8472
Guido van Rossumd57fd912000-03-10 22:53:23 +00008473 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008474 Py_DECREF(sub);
8475
Guido van Rossumd57fd912000-03-10 22:53:23 +00008476 return result;
8477}
8478
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008479Py_ssize_t
8480PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
8481 Py_ssize_t start, Py_ssize_t end,
8482 int direction)
8483{
8484 char *result;
8485 int kind;
8486 if (PyUnicode_READY(str) == -1)
8487 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02008488 if (start < 0 || end < 0) {
8489 PyErr_SetString(PyExc_IndexError, "string index out of range");
8490 return -2;
8491 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008492 if (end > PyUnicode_GET_LENGTH(str))
8493 end = PyUnicode_GET_LENGTH(str);
8494 kind = PyUnicode_KIND(str);
8495 result = findchar(PyUnicode_1BYTE_DATA(str)
8496 + PyUnicode_KIND_SIZE(kind, start),
8497 kind,
8498 end-start, ch, direction);
8499 if (!result)
8500 return -1;
8501 return (result-(char*)PyUnicode_DATA(str)) >> (kind-1);
8502}
8503
Alexander Belopolsky40018472011-02-26 01:02:56 +00008504static int
8505tailmatch(PyUnicodeObject *self,
8506 PyUnicodeObject *substring,
8507 Py_ssize_t start,
8508 Py_ssize_t end,
8509 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008510{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008511 int kind_self;
8512 int kind_sub;
8513 void *data_self;
8514 void *data_sub;
8515 Py_ssize_t offset;
8516 Py_ssize_t i;
8517 Py_ssize_t end_sub;
8518
8519 if (PyUnicode_READY(self) == -1 ||
8520 PyUnicode_READY(substring) == -1)
8521 return 0;
8522
8523 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008524 return 1;
8525
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008526 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
8527 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008528 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00008529 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008530
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008531 kind_self = PyUnicode_KIND(self);
8532 data_self = PyUnicode_DATA(self);
8533 kind_sub = PyUnicode_KIND(substring);
8534 data_sub = PyUnicode_DATA(substring);
8535 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
8536
8537 if (direction > 0)
8538 offset = end;
8539 else
8540 offset = start;
8541
8542 if (PyUnicode_READ(kind_self, data_self, offset) ==
8543 PyUnicode_READ(kind_sub, data_sub, 0) &&
8544 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
8545 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
8546 /* If both are of the same kind, memcmp is sufficient */
8547 if (kind_self == kind_sub) {
8548 return ! memcmp((char *)data_self +
8549 (offset * PyUnicode_CHARACTER_SIZE(substring)),
8550 data_sub,
8551 PyUnicode_GET_LENGTH(substring) *
8552 PyUnicode_CHARACTER_SIZE(substring));
8553 }
8554 /* otherwise we have to compare each character by first accesing it */
8555 else {
8556 /* We do not need to compare 0 and len(substring)-1 because
8557 the if statement above ensured already that they are equal
8558 when we end up here. */
8559 // TODO: honor direction and do a forward or backwards search
8560 for (i = 1; i < end_sub; ++i) {
8561 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
8562 PyUnicode_READ(kind_sub, data_sub, i))
8563 return 0;
8564 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008565 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008566 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008567 }
8568
8569 return 0;
8570}
8571
Alexander Belopolsky40018472011-02-26 01:02:56 +00008572Py_ssize_t
8573PyUnicode_Tailmatch(PyObject *str,
8574 PyObject *substr,
8575 Py_ssize_t start,
8576 Py_ssize_t end,
8577 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008578{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008579 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008580
Guido van Rossumd57fd912000-03-10 22:53:23 +00008581 str = PyUnicode_FromObject(str);
8582 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008583 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008584 substr = PyUnicode_FromObject(substr);
8585 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008586 Py_DECREF(str);
8587 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008588 }
Tim Petersced69f82003-09-16 20:30:58 +00008589
Guido van Rossumd57fd912000-03-10 22:53:23 +00008590 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00008591 (PyUnicodeObject *)substr,
8592 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008593 Py_DECREF(str);
8594 Py_DECREF(substr);
8595 return result;
8596}
8597
Guido van Rossumd57fd912000-03-10 22:53:23 +00008598/* Apply fixfct filter to the Unicode object self and return a
8599 reference to the modified object */
8600
Alexander Belopolsky40018472011-02-26 01:02:56 +00008601static PyObject *
8602fixup(PyUnicodeObject *self,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008603 Py_UCS4 (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008604{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008605 PyObject *u;
8606 Py_UCS4 maxchar_old, maxchar_new = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008607
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008608 if (PyUnicode_READY(self) == -1)
8609 return NULL;
8610 maxchar_old = PyUnicode_MAX_CHAR_VALUE(self);
8611 u = PyUnicode_New(PyUnicode_GET_LENGTH(self),
8612 maxchar_old);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008613 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008614 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008615
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008616 Py_MEMCPY(PyUnicode_1BYTE_DATA(u), PyUnicode_1BYTE_DATA(self),
8617 PyUnicode_GET_LENGTH(u) * PyUnicode_CHARACTER_SIZE(u));
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008618
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008619 /* fix functions return the new maximum character in a string,
8620 if the kind of the resulting unicode object does not change,
8621 everything is fine. Otherwise we need to change the string kind
8622 and re-run the fix function. */
8623 maxchar_new = fixfct((PyUnicodeObject*)u);
8624 if (maxchar_new == 0)
8625 /* do nothing, keep maxchar_new at 0 which means no changes. */;
8626 else if (maxchar_new <= 127)
8627 maxchar_new = 127;
8628 else if (maxchar_new <= 255)
8629 maxchar_new = 255;
8630 else if (maxchar_new <= 65535)
8631 maxchar_new = 65535;
8632 else
8633 maxchar_new = 1114111; /* 0x10ffff */
8634
8635 if (!maxchar_new && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008636 /* fixfct should return TRUE if it modified the buffer. If
8637 FALSE, return a reference to the original buffer instead
8638 (to save space, not time) */
8639 Py_INCREF(self);
8640 Py_DECREF(u);
8641 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008642 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008643 else if (maxchar_new == maxchar_old) {
8644 return u;
8645 }
8646 else {
8647 /* In case the maximum character changed, we need to
8648 convert the string to the new category. */
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008649 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008650 if (v == NULL) {
8651 Py_DECREF(u);
8652 return NULL;
8653 }
8654 if (maxchar_new > maxchar_old) {
8655 /* If the maxchar increased so that the kind changed, not all
8656 characters are representable anymore and we need to fix the
8657 string again. This only happens in very few cases. */
Victor Stinner157f83f2011-09-28 21:41:31 +02008658 if (PyUnicode_CopyCharacters(v, 0,
8659 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008660 PyUnicode_GET_LENGTH(self)) < 0)
8661 {
8662 Py_DECREF(u);
8663 return NULL;
8664 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008665 maxchar_old = fixfct((PyUnicodeObject*)v);
8666 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
8667 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008668 else {
Victor Stinner157f83f2011-09-28 21:41:31 +02008669 if (PyUnicode_CopyCharacters(v, 0,
8670 u, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008671 PyUnicode_GET_LENGTH(self)) < 0)
8672 {
8673 Py_DECREF(u);
8674 return NULL;
8675 }
8676 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008677
8678 Py_DECREF(u);
8679 return v;
8680 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008681}
8682
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008683static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008684fixupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008685{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008686 /* No need to call PyUnicode_READY(self) because this function is only
8687 called as a callback from fixup() which does it already. */
8688 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8689 const int kind = PyUnicode_KIND(self);
8690 void *data = PyUnicode_DATA(self);
8691 int touched = 0;
8692 Py_UCS4 maxchar = 0;
8693 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008694
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008695 for (i = 0; i < len; ++i) {
8696 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8697 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
8698 if (up != ch) {
8699 if (up > maxchar)
8700 maxchar = up;
8701 PyUnicode_WRITE(kind, data, i, up);
8702 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008703 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008704 else if (ch > maxchar)
8705 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008706 }
8707
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008708 if (touched)
8709 return maxchar;
8710 else
8711 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008712}
8713
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008714static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008715fixlower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008716{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008717 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8718 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8719 const int kind = PyUnicode_KIND(self);
8720 void *data = PyUnicode_DATA(self);
8721 int touched = 0;
8722 Py_UCS4 maxchar = 0;
8723 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008724
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008725 for(i = 0; i < len; ++i) {
8726 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8727 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8728 if (lo != ch) {
8729 if (lo > maxchar)
8730 maxchar = lo;
8731 PyUnicode_WRITE(kind, data, i, lo);
8732 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008733 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008734 else if (ch > maxchar)
8735 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008736 }
8737
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008738 if (touched)
8739 return maxchar;
8740 else
8741 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008742}
8743
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008744static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008745fixswapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008746{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008747 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8748 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8749 const int kind = PyUnicode_KIND(self);
8750 void *data = PyUnicode_DATA(self);
8751 int touched = 0;
8752 Py_UCS4 maxchar = 0;
8753 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008754
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008755 for(i = 0; i < len; ++i) {
8756 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8757 Py_UCS4 nu = 0;
8758
8759 if (Py_UNICODE_ISUPPER(ch))
8760 nu = Py_UNICODE_TOLOWER(ch);
8761 else if (Py_UNICODE_ISLOWER(ch))
8762 nu = Py_UNICODE_TOUPPER(ch);
8763
8764 if (nu != 0) {
8765 if (nu > maxchar)
8766 maxchar = nu;
8767 PyUnicode_WRITE(kind, data, i, nu);
8768 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008769 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008770 else if (ch > maxchar)
8771 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008772 }
8773
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008774 if (touched)
8775 return maxchar;
8776 else
8777 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008778}
8779
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008780static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008781fixcapitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008782{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008783 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8784 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8785 const int kind = PyUnicode_KIND(self);
8786 void *data = PyUnicode_DATA(self);
8787 int touched = 0;
8788 Py_UCS4 maxchar = 0;
8789 Py_ssize_t i = 0;
8790 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00008791
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008792 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008793 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008794
8795 ch = PyUnicode_READ(kind, data, i);
8796 if (!Py_UNICODE_ISUPPER(ch)) {
8797 maxchar = Py_UNICODE_TOUPPER(ch);
8798 PyUnicode_WRITE(kind, data, i, maxchar);
8799 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008800 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008801 ++i;
8802 for(; i < len; ++i) {
8803 ch = PyUnicode_READ(kind, data, i);
8804 if (!Py_UNICODE_ISLOWER(ch)) {
8805 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8806 if (lo > maxchar)
8807 maxchar = lo;
8808 PyUnicode_WRITE(kind, data, i, lo);
8809 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008810 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008811 else if (ch > maxchar)
8812 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008813 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008814
8815 if (touched)
8816 return maxchar;
8817 else
8818 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008819}
8820
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008821static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008822fixtitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008823{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008824 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8825 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8826 const int kind = PyUnicode_KIND(self);
8827 void *data = PyUnicode_DATA(self);
8828 Py_UCS4 maxchar = 0;
8829 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008830 int previous_is_cased;
8831
8832 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008833 if (len == 1) {
8834 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8835 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
8836 if (ti != ch) {
8837 PyUnicode_WRITE(kind, data, i, ti);
8838 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00008839 }
8840 else
8841 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008842 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008843 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008844 for(; i < len; ++i) {
8845 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8846 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00008847
Benjamin Peterson29060642009-01-31 22:14:21 +00008848 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008849 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00008850 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008851 nu = Py_UNICODE_TOTITLE(ch);
8852
8853 if (nu > maxchar)
8854 maxchar = nu;
8855 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00008856
Benjamin Peterson29060642009-01-31 22:14:21 +00008857 if (Py_UNICODE_ISLOWER(ch) ||
8858 Py_UNICODE_ISUPPER(ch) ||
8859 Py_UNICODE_ISTITLE(ch))
8860 previous_is_cased = 1;
8861 else
8862 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008863 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008864 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008865}
8866
Tim Peters8ce9f162004-08-27 01:49:32 +00008867PyObject *
8868PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008869{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008870 PyObject *sep = NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008871 Py_ssize_t seplen = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008872 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00008873 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008874 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
8875 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00008876 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008877 Py_ssize_t sz, i, res_offset;
8878 Py_UCS4 maxchar = 0;
8879 Py_UCS4 item_maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008880
Tim Peters05eba1f2004-08-27 21:32:02 +00008881 fseq = PySequence_Fast(seq, "");
8882 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008883 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00008884 }
8885
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008886 /* NOTE: the following code can't call back into Python code,
8887 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00008888 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008889
Tim Peters05eba1f2004-08-27 21:32:02 +00008890 seqlen = PySequence_Fast_GET_SIZE(fseq);
8891 /* If empty sequence, return u"". */
8892 if (seqlen == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008893 res = PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008894 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00008895 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008896 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00008897 /* If singleton sequence with an exact Unicode, return that. */
8898 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008899 item = items[0];
8900 if (PyUnicode_CheckExact(item)) {
8901 Py_INCREF(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008902 res = item;
Benjamin Peterson29060642009-01-31 22:14:21 +00008903 goto Done;
8904 }
Tim Peters8ce9f162004-08-27 01:49:32 +00008905 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008906 else {
8907 /* Set up sep and seplen */
8908 if (separator == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008909 /* fall back to a blank space separator */
8910 sep = PyUnicode_FromOrdinal(' ');
Victor Stinnere9a29352011-10-01 02:14:59 +02008911 if (!sep)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008912 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00008913 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008914 else {
8915 if (!PyUnicode_Check(separator)) {
8916 PyErr_Format(PyExc_TypeError,
8917 "separator: expected str instance,"
8918 " %.80s found",
8919 Py_TYPE(separator)->tp_name);
8920 goto onError;
8921 }
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02008922 if (PyUnicode_READY(separator))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008923 goto onError;
8924 sep = separator;
8925 seplen = PyUnicode_GET_LENGTH(separator);
8926 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
8927 /* inc refcount to keep this code path symetric with the
8928 above case of a blank separator */
8929 Py_INCREF(sep);
Tim Peters05eba1f2004-08-27 21:32:02 +00008930 }
8931 }
8932
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008933 /* There are at least two things to join, or else we have a subclass
8934 * of str in the sequence.
8935 * Do a pre-pass to figure out the total amount of space we'll
8936 * need (sz), and see whether all argument are strings.
8937 */
8938 sz = 0;
8939 for (i = 0; i < seqlen; i++) {
8940 const Py_ssize_t old_sz = sz;
8941 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00008942 if (!PyUnicode_Check(item)) {
8943 PyErr_Format(PyExc_TypeError,
8944 "sequence item %zd: expected str instance,"
8945 " %.80s found",
8946 i, Py_TYPE(item)->tp_name);
8947 goto onError;
8948 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008949 if (PyUnicode_READY(item) == -1)
8950 goto onError;
8951 sz += PyUnicode_GET_LENGTH(item);
8952 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
8953 if (item_maxchar > maxchar)
8954 maxchar = item_maxchar;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008955 if (i != 0)
8956 sz += seplen;
8957 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
8958 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008959 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008960 goto onError;
8961 }
8962 }
Tim Petersced69f82003-09-16 20:30:58 +00008963
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008964 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008965 if (res == NULL)
8966 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00008967
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008968 /* Catenate everything. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008969 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinner9ce5a832011-10-03 23:36:02 +02008970 Py_ssize_t itemlen, copied;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008971 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00008972 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02008973 if (i && seplen != 0) {
8974 copied = PyUnicode_CopyCharacters(res, res_offset,
8975 sep, 0, seplen);
8976 if (copied < 0)
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008977 goto onError;
Victor Stinner9ce5a832011-10-03 23:36:02 +02008978#ifdef Py_DEBUG
8979 res_offset += copied;
8980#else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008981 res_offset += seplen;
Victor Stinner9ce5a832011-10-03 23:36:02 +02008982#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00008983 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02008984 itemlen = PyUnicode_GET_LENGTH(item);
8985 if (itemlen != 0) {
8986 copied = PyUnicode_CopyCharacters(res, res_offset,
8987 item, 0, itemlen);
8988 if (copied < 0)
8989 goto onError;
8990#ifdef Py_DEBUG
8991 res_offset += copied;
8992#else
8993 res_offset += itemlen;
8994#endif
8995 }
Tim Peters05eba1f2004-08-27 21:32:02 +00008996 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008997 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00008998
Benjamin Peterson29060642009-01-31 22:14:21 +00008999 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00009000 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009001 Py_XDECREF(sep);
9002 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009003
Benjamin Peterson29060642009-01-31 22:14:21 +00009004 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009005 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009006 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009007 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009008 return NULL;
9009}
9010
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009011#define FILL(kind, data, value, start, length) \
9012 do { \
9013 Py_ssize_t i_ = 0; \
9014 assert(kind != PyUnicode_WCHAR_KIND); \
9015 switch ((kind)) { \
9016 case PyUnicode_1BYTE_KIND: { \
9017 unsigned char * to_ = (unsigned char *)((data)) + (start); \
9018 memset(to_, (unsigned char)value, length); \
9019 break; \
9020 } \
9021 case PyUnicode_2BYTE_KIND: { \
9022 Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
9023 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
9024 break; \
9025 } \
9026 default: { \
9027 Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
9028 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
9029 break; \
9030 } \
9031 } \
9032 } while (0)
9033
Alexander Belopolsky40018472011-02-26 01:02:56 +00009034static PyUnicodeObject *
9035pad(PyUnicodeObject *self,
9036 Py_ssize_t left,
9037 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009038 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009039{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009040 PyObject *u;
9041 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009042 int kind;
9043 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009044
9045 if (left < 0)
9046 left = 0;
9047 if (right < 0)
9048 right = 0;
9049
Tim Peters7a29bd52001-09-12 03:03:31 +00009050 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009051 Py_INCREF(self);
9052 return self;
9053 }
9054
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009055 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9056 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009057 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9058 return NULL;
9059 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009060 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9061 if (fill > maxchar)
9062 maxchar = fill;
9063 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009064 if (!u)
9065 return NULL;
9066
9067 kind = PyUnicode_KIND(u);
9068 data = PyUnicode_DATA(u);
9069 if (left)
9070 FILL(kind, data, fill, 0, left);
9071 if (right)
9072 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinner157f83f2011-09-28 21:41:31 +02009073 if (PyUnicode_CopyCharacters(u, left,
9074 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009075 _PyUnicode_LENGTH(self)) < 0)
9076 {
9077 Py_DECREF(u);
9078 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009079 }
9080
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009081 return (PyUnicodeObject*)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009082}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009083#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00009084
Alexander Belopolsky40018472011-02-26 01:02:56 +00009085PyObject *
9086PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009087{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009088 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009089
9090 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009091 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009092 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009093
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009094 switch(PyUnicode_KIND(string)) {
9095 case PyUnicode_1BYTE_KIND:
9096 list = ucs1lib_splitlines(
9097 (PyObject*) string, PyUnicode_1BYTE_DATA(string),
9098 PyUnicode_GET_LENGTH(string), keepends);
9099 break;
9100 case PyUnicode_2BYTE_KIND:
9101 list = ucs2lib_splitlines(
9102 (PyObject*) string, PyUnicode_2BYTE_DATA(string),
9103 PyUnicode_GET_LENGTH(string), keepends);
9104 break;
9105 case PyUnicode_4BYTE_KIND:
9106 list = ucs4lib_splitlines(
9107 (PyObject*) string, PyUnicode_4BYTE_DATA(string),
9108 PyUnicode_GET_LENGTH(string), keepends);
9109 break;
9110 default:
9111 assert(0);
9112 list = 0;
9113 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009114 Py_DECREF(string);
9115 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009116}
9117
Alexander Belopolsky40018472011-02-26 01:02:56 +00009118static PyObject *
9119split(PyUnicodeObject *self,
9120 PyUnicodeObject *substring,
9121 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009122{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009123 int kind1, kind2, kind;
9124 void *buf1, *buf2;
9125 Py_ssize_t len1, len2;
9126 PyObject* out;
9127
Guido van Rossumd57fd912000-03-10 22:53:23 +00009128 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009129 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009130
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009131 if (PyUnicode_READY(self) == -1)
9132 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009133
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009134 if (substring == NULL)
9135 switch(PyUnicode_KIND(self)) {
9136 case PyUnicode_1BYTE_KIND:
9137 return ucs1lib_split_whitespace(
9138 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9139 PyUnicode_GET_LENGTH(self), maxcount
9140 );
9141 case PyUnicode_2BYTE_KIND:
9142 return ucs2lib_split_whitespace(
9143 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
9144 PyUnicode_GET_LENGTH(self), maxcount
9145 );
9146 case PyUnicode_4BYTE_KIND:
9147 return ucs4lib_split_whitespace(
9148 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
9149 PyUnicode_GET_LENGTH(self), maxcount
9150 );
9151 default:
9152 assert(0);
9153 return NULL;
9154 }
9155
9156 if (PyUnicode_READY(substring) == -1)
9157 return NULL;
9158
9159 kind1 = PyUnicode_KIND(self);
9160 kind2 = PyUnicode_KIND(substring);
9161 kind = kind1 > kind2 ? kind1 : kind2;
9162 buf1 = PyUnicode_DATA(self);
9163 buf2 = PyUnicode_DATA(substring);
9164 if (kind1 != kind)
9165 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9166 if (!buf1)
9167 return NULL;
9168 if (kind2 != kind)
9169 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9170 if (!buf2) {
9171 if (kind1 != kind) PyMem_Free(buf1);
9172 return NULL;
9173 }
9174 len1 = PyUnicode_GET_LENGTH(self);
9175 len2 = PyUnicode_GET_LENGTH(substring);
9176
9177 switch(kind) {
9178 case PyUnicode_1BYTE_KIND:
9179 out = ucs1lib_split(
9180 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9181 break;
9182 case PyUnicode_2BYTE_KIND:
9183 out = ucs2lib_split(
9184 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9185 break;
9186 case PyUnicode_4BYTE_KIND:
9187 out = ucs4lib_split(
9188 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9189 break;
9190 default:
9191 out = NULL;
9192 }
9193 if (kind1 != kind)
9194 PyMem_Free(buf1);
9195 if (kind2 != kind)
9196 PyMem_Free(buf2);
9197 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009198}
9199
Alexander Belopolsky40018472011-02-26 01:02:56 +00009200static PyObject *
9201rsplit(PyUnicodeObject *self,
9202 PyUnicodeObject *substring,
9203 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009204{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009205 int kind1, kind2, kind;
9206 void *buf1, *buf2;
9207 Py_ssize_t len1, len2;
9208 PyObject* out;
9209
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009210 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009211 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009212
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009213 if (PyUnicode_READY(self) == -1)
9214 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009215
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009216 if (substring == NULL)
9217 switch(PyUnicode_KIND(self)) {
9218 case PyUnicode_1BYTE_KIND:
9219 return ucs1lib_rsplit_whitespace(
9220 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9221 PyUnicode_GET_LENGTH(self), maxcount
9222 );
9223 case PyUnicode_2BYTE_KIND:
9224 return ucs2lib_rsplit_whitespace(
9225 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
9226 PyUnicode_GET_LENGTH(self), maxcount
9227 );
9228 case PyUnicode_4BYTE_KIND:
9229 return ucs4lib_rsplit_whitespace(
9230 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
9231 PyUnicode_GET_LENGTH(self), maxcount
9232 );
9233 default:
9234 assert(0);
9235 return NULL;
9236 }
9237
9238 if (PyUnicode_READY(substring) == -1)
9239 return NULL;
9240
9241 kind1 = PyUnicode_KIND(self);
9242 kind2 = PyUnicode_KIND(substring);
9243 kind = kind1 > kind2 ? kind1 : kind2;
9244 buf1 = PyUnicode_DATA(self);
9245 buf2 = PyUnicode_DATA(substring);
9246 if (kind1 != kind)
9247 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9248 if (!buf1)
9249 return NULL;
9250 if (kind2 != kind)
9251 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9252 if (!buf2) {
9253 if (kind1 != kind) PyMem_Free(buf1);
9254 return NULL;
9255 }
9256 len1 = PyUnicode_GET_LENGTH(self);
9257 len2 = PyUnicode_GET_LENGTH(substring);
9258
9259 switch(kind) {
9260 case PyUnicode_1BYTE_KIND:
9261 out = ucs1lib_rsplit(
9262 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9263 break;
9264 case PyUnicode_2BYTE_KIND:
9265 out = ucs2lib_rsplit(
9266 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9267 break;
9268 case PyUnicode_4BYTE_KIND:
9269 out = ucs4lib_rsplit(
9270 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9271 break;
9272 default:
9273 out = NULL;
9274 }
9275 if (kind1 != kind)
9276 PyMem_Free(buf1);
9277 if (kind2 != kind)
9278 PyMem_Free(buf2);
9279 return out;
9280}
9281
9282static Py_ssize_t
9283anylib_find(int kind, void *buf1, Py_ssize_t len1,
9284 void *buf2, Py_ssize_t len2, Py_ssize_t offset)
9285{
9286 switch(kind) {
9287 case PyUnicode_1BYTE_KIND:
9288 return ucs1lib_find(buf1, len1, buf2, len2, offset);
9289 case PyUnicode_2BYTE_KIND:
9290 return ucs2lib_find(buf1, len1, buf2, len2, offset);
9291 case PyUnicode_4BYTE_KIND:
9292 return ucs4lib_find(buf1, len1, buf2, len2, offset);
9293 }
9294 assert(0);
9295 return -1;
9296}
9297
9298static Py_ssize_t
9299anylib_count(int kind, void* sbuf, Py_ssize_t slen,
9300 void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
9301{
9302 switch(kind) {
9303 case PyUnicode_1BYTE_KIND:
9304 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
9305 case PyUnicode_2BYTE_KIND:
9306 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
9307 case PyUnicode_4BYTE_KIND:
9308 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
9309 }
9310 assert(0);
9311 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009312}
9313
Alexander Belopolsky40018472011-02-26 01:02:56 +00009314static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009315replace(PyObject *self, PyObject *str1,
9316 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009317{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009318 PyObject *u;
9319 char *sbuf = PyUnicode_DATA(self);
9320 char *buf1 = PyUnicode_DATA(str1);
9321 char *buf2 = PyUnicode_DATA(str2);
9322 int srelease = 0, release1 = 0, release2 = 0;
9323 int skind = PyUnicode_KIND(self);
9324 int kind1 = PyUnicode_KIND(str1);
9325 int kind2 = PyUnicode_KIND(str2);
9326 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
9327 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
9328 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009329
9330 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009331 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009332 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009333 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009334
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009335 if (skind < kind1)
9336 /* substring too wide to be present */
9337 goto nothing;
9338
9339 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00009340 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009341 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009342 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009343 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009344 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009345 /* replace characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009346 Py_UCS4 u1, u2, maxchar;
9347 int mayshrink, rkind;
9348 u1 = PyUnicode_READ_CHAR(str1, 0);
9349 if (!findchar(sbuf, PyUnicode_KIND(self),
9350 slen, u1, 1))
Thomas Wouters477c8d52006-05-27 19:21:47 +00009351 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009352 u2 = PyUnicode_READ_CHAR(str2, 0);
9353 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9354 /* Replacing u1 with u2 may cause a maxchar reduction in the
9355 result string. */
9356 mayshrink = maxchar > 127;
9357 if (u2 > maxchar) {
9358 maxchar = u2;
9359 mayshrink = 0;
9360 }
9361 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009362 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009363 goto error;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009364 if (PyUnicode_CopyCharacters(u, 0,
9365 (PyObject*)self, 0, slen) < 0)
9366 {
9367 Py_DECREF(u);
9368 return NULL;
9369 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009370 rkind = PyUnicode_KIND(u);
9371 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
9372 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009373 if (--maxcount < 0)
9374 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009375 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009376 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009377 if (mayshrink) {
9378 PyObject *tmp = u;
9379 u = PyUnicode_FromKindAndData(rkind, PyUnicode_DATA(tmp),
9380 PyUnicode_GET_LENGTH(tmp));
9381 Py_DECREF(tmp);
9382 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009383 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009384 int rkind = skind;
9385 char *res;
9386 if (kind1 < rkind) {
9387 /* widen substring */
9388 buf1 = _PyUnicode_AsKind(str1, rkind);
9389 if (!buf1) goto error;
9390 release1 = 1;
9391 }
9392 i = anylib_find(rkind, sbuf, slen, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009393 if (i < 0)
9394 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009395 if (rkind > kind2) {
9396 /* widen replacement */
9397 buf2 = _PyUnicode_AsKind(str2, rkind);
9398 if (!buf2) goto error;
9399 release2 = 1;
9400 }
9401 else if (rkind < kind2) {
9402 /* widen self and buf1 */
9403 rkind = kind2;
9404 if (release1) PyMem_Free(buf1);
9405 sbuf = _PyUnicode_AsKind(self, rkind);
9406 if (!sbuf) goto error;
9407 srelease = 1;
9408 buf1 = _PyUnicode_AsKind(str1, rkind);
9409 if (!buf1) goto error;
9410 release1 = 1;
9411 }
9412 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, slen));
9413 if (!res) {
9414 PyErr_NoMemory();
9415 goto error;
9416 }
9417 memcpy(res, sbuf, PyUnicode_KIND_SIZE(rkind, slen));
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009418 /* change everything in-place, starting with this one */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009419 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
9420 buf2,
9421 PyUnicode_KIND_SIZE(rkind, len2));
9422 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009423
9424 while ( --maxcount > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009425 i = anylib_find(rkind, sbuf+PyUnicode_KIND_SIZE(rkind, i),
9426 slen-i,
9427 buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009428 if (i == -1)
9429 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009430 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
9431 buf2,
9432 PyUnicode_KIND_SIZE(rkind, len2));
9433 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009434 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009435
9436 u = PyUnicode_FromKindAndData(rkind, res, slen);
9437 PyMem_Free(res);
9438 if (!u) goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009439 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009440 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009441
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009442 Py_ssize_t n, i, j, ires;
9443 Py_ssize_t product, new_size;
9444 int rkind = skind;
9445 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009446
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009447 if (kind1 < rkind) {
9448 buf1 = _PyUnicode_AsKind(str1, rkind);
9449 if (!buf1) goto error;
9450 release1 = 1;
9451 }
9452 n = anylib_count(rkind, sbuf, slen, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009453 if (n == 0)
9454 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009455 if (kind2 < rkind) {
9456 buf2 = _PyUnicode_AsKind(str2, rkind);
9457 if (!buf2) goto error;
9458 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009459 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009460 else if (kind2 > rkind) {
9461 rkind = kind2;
9462 sbuf = _PyUnicode_AsKind(self, rkind);
9463 if (!sbuf) goto error;
9464 srelease = 1;
9465 if (release1) PyMem_Free(buf1);
9466 buf1 = _PyUnicode_AsKind(str1, rkind);
9467 if (!buf1) goto error;
9468 release1 = 1;
9469 }
9470 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
9471 PyUnicode_GET_LENGTH(str1))); */
9472 product = n * (len2-len1);
9473 if ((product / (len2-len1)) != n) {
9474 PyErr_SetString(PyExc_OverflowError,
9475 "replace string is too long");
9476 goto error;
9477 }
9478 new_size = slen + product;
9479 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
9480 PyErr_SetString(PyExc_OverflowError,
9481 "replace string is too long");
9482 goto error;
9483 }
9484 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, new_size));
9485 if (!res)
9486 goto error;
9487 ires = i = 0;
9488 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009489 while (n-- > 0) {
9490 /* look for next match */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009491 j = anylib_find(rkind,
9492 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9493 slen-i, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009494 if (j == -1)
9495 break;
9496 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009497 /* copy unchanged part [i:j] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009498 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9499 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9500 PyUnicode_KIND_SIZE(rkind, j-i));
9501 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009502 }
9503 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009504 if (len2 > 0) {
9505 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9506 buf2,
9507 PyUnicode_KIND_SIZE(rkind, len2));
9508 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009509 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009510 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009511 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009512 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009513 /* copy tail [i:] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009514 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9515 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9516 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009517 } else {
9518 /* interleave */
9519 while (n > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009520 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9521 buf2,
9522 PyUnicode_KIND_SIZE(rkind, len2));
9523 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009524 if (--n <= 0)
9525 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009526 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9527 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9528 PyUnicode_KIND_SIZE(rkind, 1));
9529 ires++;
9530 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009531 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009532 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9533 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9534 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009535 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009536 u = PyUnicode_FromKindAndData(rkind, res, new_size);
Martin v. Löwis0b1d3482011-10-01 16:35:40 +02009537 PyMem_Free(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009538 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009539 if (srelease)
9540 PyMem_FREE(sbuf);
9541 if (release1)
9542 PyMem_FREE(buf1);
9543 if (release2)
9544 PyMem_FREE(buf2);
9545 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009546
Benjamin Peterson29060642009-01-31 22:14:21 +00009547 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00009548 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009549 if (srelease)
9550 PyMem_FREE(sbuf);
9551 if (release1)
9552 PyMem_FREE(buf1);
9553 if (release2)
9554 PyMem_FREE(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009555 if (PyUnicode_CheckExact(self)) {
9556 Py_INCREF(self);
9557 return (PyObject *) self;
9558 }
Victor Stinner034f6cf2011-09-30 02:26:44 +02009559 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009560 error:
9561 if (srelease && sbuf)
9562 PyMem_FREE(sbuf);
9563 if (release1 && buf1)
9564 PyMem_FREE(buf1);
9565 if (release2 && buf2)
9566 PyMem_FREE(buf2);
9567 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009568}
9569
9570/* --- Unicode Object Methods --------------------------------------------- */
9571
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009572PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009573 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009574\n\
9575Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009576characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009577
9578static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009579unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009580{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009581 return fixup(self, fixtitle);
9582}
9583
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009584PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009585 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009586\n\
9587Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00009588have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009589
9590static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009591unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009592{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009593 return fixup(self, fixcapitalize);
9594}
9595
#if 0
/* Disabled: kept for reference only, not compiled. */
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Split self on whitespace, capitalize every word via fixup(), and join
   the words again with the default separator of PyUnicode_Join().
   Returns NULL if any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (!words)
        return NULL;

    /* Capitalize each word, replacing the list entry in place */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *capitalized =
            fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                  fixcapitalize);
        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        /* PyList_SET_ITEM does not release the old entry; do it here. */
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
9633
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009634/* Argument converter. Coerces to a single unicode character */
9635
9636static int
9637convert_uc(PyObject *obj, void *addr)
9638{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009639 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009640 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009641
Benjamin Peterson14339b62009-01-31 16:36:08 +00009642 uniobj = PyUnicode_FromObject(obj);
9643 if (uniobj == NULL) {
9644 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009645 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009646 return 0;
9647 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009648 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009649 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009650 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009651 Py_DECREF(uniobj);
9652 return 0;
9653 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009654 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009655 Py_DECREF(uniobj);
9656 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009657}
9658
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009659PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009660 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009661\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009662Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009663done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009664
9665static PyObject *
9666unicode_center(PyUnicodeObject *self, PyObject *args)
9667{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009668 Py_ssize_t marg, left;
9669 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009670 Py_UCS4 fillchar = ' ';
9671
Victor Stinnere9a29352011-10-01 02:14:59 +02009672 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009673 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009674
Victor Stinnere9a29352011-10-01 02:14:59 +02009675 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009676 return NULL;
9677
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009678 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009679 Py_INCREF(self);
9680 return (PyObject*) self;
9681 }
9682
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009683 marg = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009684 left = marg / 2 + (marg & width & 1);
9685
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009686 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009687}
9688
Marc-André Lemburge5034372000-08-08 08:04:29 +00009689#if 0
9690
9691/* This code should go into some future Unicode collation support
9692 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00009693 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00009694
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009695/* speedy UTF-16 code point order comparison */
9696/* gleaned from: */
9697/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
9698
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009699static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009700{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009701 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00009702 0, 0, 0, 0, 0, 0, 0, 0,
9703 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009704 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009705};
9706
Guido van Rossumd57fd912000-03-10 22:53:23 +00009707static int
9708unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9709{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009710 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009711
Guido van Rossumd57fd912000-03-10 22:53:23 +00009712 Py_UNICODE *s1 = str1->str;
9713 Py_UNICODE *s2 = str2->str;
9714
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009715 len1 = str1->_base._base.length;
9716 len2 = str2->_base._base.length;
Tim Petersced69f82003-09-16 20:30:58 +00009717
Guido van Rossumd57fd912000-03-10 22:53:23 +00009718 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00009719 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009720
9721 c1 = *s1++;
9722 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00009723
Benjamin Peterson29060642009-01-31 22:14:21 +00009724 if (c1 > (1<<11) * 26)
9725 c1 += utf16Fixup[c1>>11];
9726 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009727 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009728 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00009729
9730 if (c1 != c2)
9731 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00009732
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009733 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009734 }
9735
9736 return (len1 < len2) ? -1 : (len1 != len2);
9737}
9738
Marc-André Lemburge5034372000-08-08 08:04:29 +00009739#else
9740
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009741/* This function assumes that str1 and str2 are readied by the caller. */
9742
Marc-André Lemburge5034372000-08-08 08:04:29 +00009743static int
9744unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9745{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009746 int kind1, kind2;
9747 void *data1, *data2;
9748 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009749
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009750 kind1 = PyUnicode_KIND(str1);
9751 kind2 = PyUnicode_KIND(str2);
9752 data1 = PyUnicode_DATA(str1);
9753 data2 = PyUnicode_DATA(str2);
9754 len1 = PyUnicode_GET_LENGTH(str1);
9755 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +00009756
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009757 for (i = 0; i < len1 && i < len2; ++i) {
9758 Py_UCS4 c1, c2;
9759 c1 = PyUnicode_READ(kind1, data1, i);
9760 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +00009761
9762 if (c1 != c2)
9763 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009764 }
9765
9766 return (len1 < len2) ? -1 : (len1 != len2);
9767}
9768
9769#endif
9770
Alexander Belopolsky40018472011-02-26 01:02:56 +00009771int
9772PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009773{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009774 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9775 if (PyUnicode_READY(left) == -1 ||
9776 PyUnicode_READY(right) == -1)
9777 return -1;
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009778 return unicode_compare((PyUnicodeObject *)left,
9779 (PyUnicodeObject *)right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009780 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009781 PyErr_Format(PyExc_TypeError,
9782 "Can't compare %.100s and %.100s",
9783 left->ob_type->tp_name,
9784 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009785 return -1;
9786}
9787
Martin v. Löwis5b222132007-06-10 09:51:05 +00009788int
9789PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
9790{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009791 Py_ssize_t i;
9792 int kind;
9793 void *data;
9794 Py_UCS4 chr;
9795
Victor Stinner910337b2011-10-03 03:20:16 +02009796 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009797 if (PyUnicode_READY(uni) == -1)
9798 return -1;
9799 kind = PyUnicode_KIND(uni);
9800 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +00009801 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009802 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
9803 if (chr != str[i])
9804 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00009805 /* This check keeps Python strings that end in '\0' from comparing equal
9806 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009807 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +00009808 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009809 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00009810 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009811 return 0;
9812}
9813
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009814
Benjamin Peterson29060642009-01-31 22:14:21 +00009815#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00009816 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009817
Alexander Belopolsky40018472011-02-26 01:02:56 +00009818PyObject *
9819PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009820{
9821 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009822
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009823 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9824 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009825 if (PyUnicode_READY(left) == -1 ||
9826 PyUnicode_READY(right) == -1)
9827 return NULL;
9828 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
9829 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009830 if (op == Py_EQ) {
9831 Py_INCREF(Py_False);
9832 return Py_False;
9833 }
9834 if (op == Py_NE) {
9835 Py_INCREF(Py_True);
9836 return Py_True;
9837 }
9838 }
9839 if (left == right)
9840 result = 0;
9841 else
9842 result = unicode_compare((PyUnicodeObject *)left,
9843 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009844
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009845 /* Convert the return value to a Boolean */
9846 switch (op) {
9847 case Py_EQ:
9848 v = TEST_COND(result == 0);
9849 break;
9850 case Py_NE:
9851 v = TEST_COND(result != 0);
9852 break;
9853 case Py_LE:
9854 v = TEST_COND(result <= 0);
9855 break;
9856 case Py_GE:
9857 v = TEST_COND(result >= 0);
9858 break;
9859 case Py_LT:
9860 v = TEST_COND(result == -1);
9861 break;
9862 case Py_GT:
9863 v = TEST_COND(result == 1);
9864 break;
9865 default:
9866 PyErr_BadArgument();
9867 return NULL;
9868 }
9869 Py_INCREF(v);
9870 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009871 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00009872
Brian Curtindfc80e32011-08-10 20:28:54 -05009873 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009874}
9875
Alexander Belopolsky40018472011-02-26 01:02:56 +00009876int
9877PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00009878{
Thomas Wouters477c8d52006-05-27 19:21:47 +00009879 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009880 int kind1, kind2, kind;
9881 void *buf1, *buf2;
9882 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009883 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009884
9885 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00009886 sub = PyUnicode_FromObject(element);
9887 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009888 PyErr_Format(PyExc_TypeError,
9889 "'in <string>' requires string as left operand, not %s",
9890 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009891 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009892 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009893 if (PyUnicode_READY(sub) == -1)
9894 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009895
Thomas Wouters477c8d52006-05-27 19:21:47 +00009896 str = PyUnicode_FromObject(container);
Victor Stinnere9a29352011-10-01 02:14:59 +02009897 if (!str || PyUnicode_READY(str) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009898 Py_DECREF(sub);
9899 return -1;
9900 }
9901
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009902 kind1 = PyUnicode_KIND(str);
9903 kind2 = PyUnicode_KIND(sub);
9904 kind = kind1 > kind2 ? kind1 : kind2;
9905 buf1 = PyUnicode_DATA(str);
9906 buf2 = PyUnicode_DATA(sub);
9907 if (kind1 != kind)
9908 buf1 = _PyUnicode_AsKind((PyObject*)str, kind);
9909 if (!buf1) {
9910 Py_DECREF(sub);
9911 return -1;
9912 }
9913 if (kind2 != kind)
9914 buf2 = _PyUnicode_AsKind((PyObject*)sub, kind);
9915 if (!buf2) {
9916 Py_DECREF(sub);
9917 if (kind1 != kind) PyMem_Free(buf1);
9918 return -1;
9919 }
9920 len1 = PyUnicode_GET_LENGTH(str);
9921 len2 = PyUnicode_GET_LENGTH(sub);
9922
9923 switch(kind) {
9924 case PyUnicode_1BYTE_KIND:
9925 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
9926 break;
9927 case PyUnicode_2BYTE_KIND:
9928 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
9929 break;
9930 case PyUnicode_4BYTE_KIND:
9931 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
9932 break;
9933 default:
9934 result = -1;
9935 assert(0);
9936 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009937
9938 Py_DECREF(str);
9939 Py_DECREF(sub);
9940
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009941 if (kind1 != kind)
9942 PyMem_Free(buf1);
9943 if (kind2 != kind)
9944 PyMem_Free(buf2);
9945
Guido van Rossum403d68b2000-03-13 15:55:09 +00009946 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009947}
9948
Guido van Rossumd57fd912000-03-10 22:53:23 +00009949/* Concat to string or Unicode object giving a new Unicode object. */
9950
Alexander Belopolsky40018472011-02-26 01:02:56 +00009951PyObject *
9952PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009953{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009954 PyObject *u = NULL, *v = NULL, *w;
9955 Py_UCS4 maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009956
9957 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009958 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009959 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009960 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009961 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009962 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009963 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009964
9965 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +02009966 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009967 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009968 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009969 }
Victor Stinnera464fc12011-10-02 20:39:30 +02009970 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009971 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009972 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009973 }
9974
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009975 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinnerff9e50f2011-09-28 22:17:19 +02009976 maxchar = Py_MAX(maxchar, PyUnicode_MAX_CHAR_VALUE(v));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009977
Guido van Rossumd57fd912000-03-10 22:53:23 +00009978 /* Concat the two Unicode strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009979 w = PyUnicode_New(
9980 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
9981 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009982 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009983 goto onError;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009984 if (PyUnicode_CopyCharacters(w, 0, u, 0, PyUnicode_GET_LENGTH(u)) < 0)
9985 goto onError;
Victor Stinner157f83f2011-09-28 21:41:31 +02009986 if (PyUnicode_CopyCharacters(w, PyUnicode_GET_LENGTH(u),
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009987 v, 0,
9988 PyUnicode_GET_LENGTH(v)) < 0)
9989 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009990 Py_DECREF(u);
9991 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009992 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009993
Benjamin Peterson29060642009-01-31 22:14:21 +00009994 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009995 Py_XDECREF(u);
9996 Py_XDECREF(v);
9997 return NULL;
9998}
9999
Victor Stinnerb0923652011-10-04 01:17:31 +020010000static void
10001unicode_append_inplace(PyObject **p_left, PyObject *right)
10002{
10003 Py_ssize_t left_len, right_len, new_len;
10004#ifdef Py_DEBUG
10005 Py_ssize_t copied;
10006#endif
10007
10008 assert(PyUnicode_IS_READY(*p_left));
10009 assert(PyUnicode_IS_READY(right));
10010
10011 left_len = PyUnicode_GET_LENGTH(*p_left);
10012 right_len = PyUnicode_GET_LENGTH(right);
10013 if (left_len > PY_SSIZE_T_MAX - right_len) {
10014 PyErr_SetString(PyExc_OverflowError,
10015 "strings are too large to concat");
10016 goto error;
10017 }
10018 new_len = left_len + right_len;
10019
10020 /* Now we own the last reference to 'left', so we can resize it
10021 * in-place.
10022 */
10023 if (unicode_resize(p_left, new_len) != 0) {
10024 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10025 * deallocated so it cannot be put back into
10026 * 'variable'. The MemoryError is raised when there
10027 * is no value in 'variable', which might (very
10028 * remotely) be a cause of incompatibilities.
10029 */
10030 goto error;
10031 }
10032 /* copy 'right' into the newly allocated area of 'left' */
10033#ifdef Py_DEBUG
10034 copied = PyUnicode_CopyCharacters(*p_left, left_len,
10035 right, 0,
10036 right_len);
10037 assert(0 <= copied);
10038#else
10039 PyUnicode_CopyCharacters(*p_left, left_len, right, 0, right_len);
10040#endif
10041 return;
10042
10043error:
10044 Py_DECREF(*p_left);
10045 *p_left = NULL;
10046}
10047
Walter Dörwald1ab83302007-05-18 17:15:44 +000010048void
Victor Stinner23e56682011-10-03 03:54:37 +020010049PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010050{
Victor Stinner23e56682011-10-03 03:54:37 +020010051 PyObject *left, *res;
10052
10053 if (p_left == NULL) {
10054 if (!PyErr_Occurred())
10055 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010056 return;
10057 }
Victor Stinner23e56682011-10-03 03:54:37 +020010058 left = *p_left;
10059 if (right == NULL || !PyUnicode_Check(left)) {
10060 if (!PyErr_Occurred())
10061 PyErr_BadInternalCall();
10062 goto error;
10063 }
10064
10065 if (PyUnicode_CheckExact(left) && left != unicode_empty
10066 && PyUnicode_CheckExact(right) && right != unicode_empty
10067 && unicode_resizable(left)
10068 && (_PyUnicode_KIND(right) <= _PyUnicode_KIND(left)
10069 || _PyUnicode_WSTR(left) != NULL))
10070 {
Victor Stinner23e56682011-10-03 03:54:37 +020010071 if (PyUnicode_READY(left))
10072 goto error;
10073 if (PyUnicode_READY(right))
10074 goto error;
10075
Victor Stinnerb0923652011-10-04 01:17:31 +020010076 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10077 to change the structure size, but characters are stored just after
10078 the structure, and so it requires to move all charactres which is
10079 not so different than duplicating the string. */
10080 if (!(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
Victor Stinner23e56682011-10-03 03:54:37 +020010081 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010082 unicode_append_inplace(p_left, right);
Victor Stinner23e56682011-10-03 03:54:37 +020010083 return;
10084 }
10085 }
10086
10087 res = PyUnicode_Concat(left, right);
10088 if (res == NULL)
10089 goto error;
10090 Py_DECREF(left);
10091 *p_left = res;
10092 return;
10093
10094error:
10095 Py_DECREF(*p_left);
10096 *p_left = NULL;
Walter Dörwald1ab83302007-05-18 17:15:44 +000010097}
10098
10099void
10100PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10101{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010102 PyUnicode_Append(pleft, right);
10103 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010104}
10105
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010106PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010107 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010108\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010109Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010110string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010111interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010112
10113static PyObject *
10114unicode_count(PyUnicodeObject *self, PyObject *args)
10115{
10116 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010117 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010118 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010119 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010120 int kind1, kind2, kind;
10121 void *buf1, *buf2;
10122 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010123
Jesus Ceaac451502011-04-20 17:09:23 +020010124 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10125 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010126 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010127
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010128 kind1 = PyUnicode_KIND(self);
10129 kind2 = PyUnicode_KIND(substring);
10130 kind = kind1 > kind2 ? kind1 : kind2;
10131 buf1 = PyUnicode_DATA(self);
10132 buf2 = PyUnicode_DATA(substring);
10133 if (kind1 != kind)
10134 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
10135 if (!buf1) {
10136 Py_DECREF(substring);
10137 return NULL;
10138 }
10139 if (kind2 != kind)
10140 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
10141 if (!buf2) {
10142 Py_DECREF(substring);
10143 if (kind1 != kind) PyMem_Free(buf1);
10144 return NULL;
10145 }
10146 len1 = PyUnicode_GET_LENGTH(self);
10147 len2 = PyUnicode_GET_LENGTH(substring);
10148
10149 ADJUST_INDICES(start, end, len1);
10150 switch(kind) {
10151 case PyUnicode_1BYTE_KIND:
10152 iresult = ucs1lib_count(
10153 ((Py_UCS1*)buf1) + start, end - start,
10154 buf2, len2, PY_SSIZE_T_MAX
10155 );
10156 break;
10157 case PyUnicode_2BYTE_KIND:
10158 iresult = ucs2lib_count(
10159 ((Py_UCS2*)buf1) + start, end - start,
10160 buf2, len2, PY_SSIZE_T_MAX
10161 );
10162 break;
10163 case PyUnicode_4BYTE_KIND:
10164 iresult = ucs4lib_count(
10165 ((Py_UCS4*)buf1) + start, end - start,
10166 buf2, len2, PY_SSIZE_T_MAX
10167 );
10168 break;
10169 default:
10170 assert(0); iresult = 0;
10171 }
10172
10173 result = PyLong_FromSsize_t(iresult);
10174
10175 if (kind1 != kind)
10176 PyMem_Free(buf1);
10177 if (kind2 != kind)
10178 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010179
10180 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010181
Guido van Rossumd57fd912000-03-10 22:53:23 +000010182 return result;
10183}
10184
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010185PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010186 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010187\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010188Encode S using the codec registered for encoding. Default encoding\n\
10189is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010190handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010191a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10192'xmlcharrefreplace' as well as any other name registered with\n\
10193codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010194
10195static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +000010196unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010197{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010198 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010199 char *encoding = NULL;
10200 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010201
Benjamin Peterson308d6372009-09-18 21:42:35 +000010202 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10203 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010204 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +000010205 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010206}
10207
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010208PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010209 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010210\n\
10211Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010212If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010213
10214static PyObject*
10215unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
10216{
Antoine Pitroue71d5742011-10-04 15:55:09 +020010217 Py_ssize_t i, j, line_pos, src_len, incr;
10218 Py_UCS4 ch;
10219 PyObject *u;
10220 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010221 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010222 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010223 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010224
10225 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010226 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010227
Antoine Pitrou22425222011-10-04 19:10:51 +020010228 if (PyUnicode_READY(self) == -1)
10229 return NULL;
10230
Thomas Wouters7e474022000-07-16 12:04:32 +000010231 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010232 src_len = PyUnicode_GET_LENGTH(self);
10233 i = j = line_pos = 0;
10234 kind = PyUnicode_KIND(self);
10235 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020010236 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010237 for (; i < src_len; i++) {
10238 ch = PyUnicode_READ(kind, src_data, i);
10239 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020010240 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000010241 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010242 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000010243 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010244 goto overflow;
10245 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010246 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010247 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010248 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010249 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000010250 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010251 goto overflow;
10252 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010253 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010254 if (ch == '\n' || ch == '\r')
10255 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010256 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010257 }
Antoine Pitroue19aa382011-10-04 16:04:01 +020010258 if (!found && PyUnicode_CheckExact(self)) {
10259 Py_INCREF((PyObject *) self);
10260 return (PyObject *) self;
10261 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +000010262
Guido van Rossumd57fd912000-03-10 22:53:23 +000010263 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010264 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010265 if (!u)
10266 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010267 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010268
Antoine Pitroue71d5742011-10-04 15:55:09 +020010269 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010270
Antoine Pitroue71d5742011-10-04 15:55:09 +020010271 for (; i < src_len; i++) {
10272 ch = PyUnicode_READ(kind, src_data, i);
10273 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010274 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010275 incr = tabsize - (line_pos % tabsize);
10276 line_pos += incr;
10277 while (incr--) {
10278 PyUnicode_WRITE(kind, dest_data, j, ' ');
10279 j++;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010280 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010281 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010282 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010283 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010284 line_pos++;
10285 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010286 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010287 if (ch == '\n' || ch == '\r')
10288 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010289 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010290 }
10291 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinner17efeed2011-10-04 20:05:46 +020010292#ifndef DONT_MAKE_RESULT_READY
10293 if (_PyUnicode_READY_REPLACE(&u)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010294 Py_DECREF(u);
10295 return NULL;
10296 }
Victor Stinner17efeed2011-10-04 20:05:46 +020010297#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +000010298 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010299
Antoine Pitroue71d5742011-10-04 15:55:09 +020010300 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010301 PyErr_SetString(PyExc_OverflowError, "new string is too long");
10302 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010303}
10304
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010305PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010306 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010307\n\
10308Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010309such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010310arguments start and end are interpreted as in slice notation.\n\
10311\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010312Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010313
10314static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010315unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010316{
Jesus Ceaac451502011-04-20 17:09:23 +020010317 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010318 Py_ssize_t start;
10319 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010320 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010321
Jesus Ceaac451502011-04-20 17:09:23 +020010322 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
10323 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010324 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010325
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010326 if (PyUnicode_READY(self) == -1)
10327 return NULL;
10328 if (PyUnicode_READY(substring) == -1)
10329 return NULL;
10330
10331 result = any_find_slice(
10332 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
10333 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010334 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010335
10336 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010337
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010338 if (result == -2)
10339 return NULL;
10340
Christian Heimes217cfd12007-12-02 14:31:20 +000010341 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010342}
10343
10344static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010345unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010346{
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010347 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
10348 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010349 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010350 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010351}
10352
Guido van Rossumc2504932007-09-18 19:42:40 +000010353/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010010354 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000010355static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +000010356unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010357{
Guido van Rossumc2504932007-09-18 19:42:40 +000010358 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010010359 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010360
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010361 if (_PyUnicode_HASH(self) != -1)
10362 return _PyUnicode_HASH(self);
10363 if (PyUnicode_READY(self) == -1)
10364 return -1;
10365 len = PyUnicode_GET_LENGTH(self);
10366
10367 /* The hash function as a macro, gets expanded three times below. */
10368#define HASH(P) \
10369 x = (Py_uhash_t)*P << 7; \
10370 while (--len >= 0) \
10371 x = (1000003*x) ^ (Py_uhash_t)*P++;
10372
10373 switch (PyUnicode_KIND(self)) {
10374 case PyUnicode_1BYTE_KIND: {
10375 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
10376 HASH(c);
10377 break;
10378 }
10379 case PyUnicode_2BYTE_KIND: {
10380 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
10381 HASH(s);
10382 break;
10383 }
10384 default: {
10385 Py_UCS4 *l;
10386 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
10387 "Impossible switch case in unicode_hash");
10388 l = PyUnicode_4BYTE_DATA(self);
10389 HASH(l);
10390 break;
10391 }
10392 }
10393 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
10394
Guido van Rossumc2504932007-09-18 19:42:40 +000010395 if (x == -1)
10396 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010397 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010398 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010399}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010400#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000010401
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010402PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010403 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010404\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010405Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010406
10407static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010408unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010409{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010410 Py_ssize_t result;
Jesus Ceaac451502011-04-20 17:09:23 +020010411 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010412 Py_ssize_t start;
10413 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010414
Jesus Ceaac451502011-04-20 17:09:23 +020010415 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
10416 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010417 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010418
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010419 if (PyUnicode_READY(self) == -1)
10420 return NULL;
10421 if (PyUnicode_READY(substring) == -1)
10422 return NULL;
10423
10424 result = any_find_slice(
10425 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
10426 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010427 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010428
10429 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010430
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010431 if (result == -2)
10432 return NULL;
10433
Guido van Rossumd57fd912000-03-10 22:53:23 +000010434 if (result < 0) {
10435 PyErr_SetString(PyExc_ValueError, "substring not found");
10436 return NULL;
10437 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010438
Christian Heimes217cfd12007-12-02 14:31:20 +000010439 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010440}
10441
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010442PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010443 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010444\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010445Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010446at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010447
10448static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010449unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010450{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010451 Py_ssize_t i, length;
10452 int kind;
10453 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010454 int cased;
10455
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010456 if (PyUnicode_READY(self) == -1)
10457 return NULL;
10458 length = PyUnicode_GET_LENGTH(self);
10459 kind = PyUnicode_KIND(self);
10460 data = PyUnicode_DATA(self);
10461
Guido van Rossumd57fd912000-03-10 22:53:23 +000010462 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010463 if (length == 1)
10464 return PyBool_FromLong(
10465 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010466
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010467 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010468 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010469 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010470
Guido van Rossumd57fd912000-03-10 22:53:23 +000010471 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010472 for (i = 0; i < length; i++) {
10473 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010474
Benjamin Peterson29060642009-01-31 22:14:21 +000010475 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
10476 return PyBool_FromLong(0);
10477 else if (!cased && Py_UNICODE_ISLOWER(ch))
10478 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010479 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010480 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010481}
10482
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010483PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010484 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010485\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010486Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010487at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010488
10489static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010490unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010491{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010492 Py_ssize_t i, length;
10493 int kind;
10494 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010495 int cased;
10496
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010497 if (PyUnicode_READY(self) == -1)
10498 return NULL;
10499 length = PyUnicode_GET_LENGTH(self);
10500 kind = PyUnicode_KIND(self);
10501 data = PyUnicode_DATA(self);
10502
Guido van Rossumd57fd912000-03-10 22:53:23 +000010503 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010504 if (length == 1)
10505 return PyBool_FromLong(
10506 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010507
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010508 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010509 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010510 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010511
Guido van Rossumd57fd912000-03-10 22:53:23 +000010512 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010513 for (i = 0; i < length; i++) {
10514 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010515
Benjamin Peterson29060642009-01-31 22:14:21 +000010516 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
10517 return PyBool_FromLong(0);
10518 else if (!cased && Py_UNICODE_ISUPPER(ch))
10519 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010520 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010521 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010522}
10523
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010524PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010525 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010526\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010527Return True if S is a titlecased string and there is at least one\n\
10528character in S, i.e. upper- and titlecase characters may only\n\
10529follow uncased characters and lowercase characters only cased ones.\n\
10530Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010531
10532static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010533unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010534{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010535 Py_ssize_t i, length;
10536 int kind;
10537 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010538 int cased, previous_is_cased;
10539
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010540 if (PyUnicode_READY(self) == -1)
10541 return NULL;
10542 length = PyUnicode_GET_LENGTH(self);
10543 kind = PyUnicode_KIND(self);
10544 data = PyUnicode_DATA(self);
10545
Guido van Rossumd57fd912000-03-10 22:53:23 +000010546 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010547 if (length == 1) {
10548 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10549 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
10550 (Py_UNICODE_ISUPPER(ch) != 0));
10551 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010552
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010553 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010554 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010555 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010556
Guido van Rossumd57fd912000-03-10 22:53:23 +000010557 cased = 0;
10558 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010559 for (i = 0; i < length; i++) {
10560 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010561
Benjamin Peterson29060642009-01-31 22:14:21 +000010562 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
10563 if (previous_is_cased)
10564 return PyBool_FromLong(0);
10565 previous_is_cased = 1;
10566 cased = 1;
10567 }
10568 else if (Py_UNICODE_ISLOWER(ch)) {
10569 if (!previous_is_cased)
10570 return PyBool_FromLong(0);
10571 previous_is_cased = 1;
10572 cased = 1;
10573 }
10574 else
10575 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010576 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010577 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010578}
10579
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010580PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010581 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010582\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010583Return True if all characters in S are whitespace\n\
10584and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010585
10586static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010587unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010588{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010589 Py_ssize_t i, length;
10590 int kind;
10591 void *data;
10592
10593 if (PyUnicode_READY(self) == -1)
10594 return NULL;
10595 length = PyUnicode_GET_LENGTH(self);
10596 kind = PyUnicode_KIND(self);
10597 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010598
Guido van Rossumd57fd912000-03-10 22:53:23 +000010599 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010600 if (length == 1)
10601 return PyBool_FromLong(
10602 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010603
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010604 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010605 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010606 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010607
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010608 for (i = 0; i < length; i++) {
10609 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010610 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010611 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010612 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010613 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010614}
10615
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010616PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010617 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010618\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010619Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010620and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010621
10622static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010623unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010624{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010625 Py_ssize_t i, length;
10626 int kind;
10627 void *data;
10628
10629 if (PyUnicode_READY(self) == -1)
10630 return NULL;
10631 length = PyUnicode_GET_LENGTH(self);
10632 kind = PyUnicode_KIND(self);
10633 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010634
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010635 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010636 if (length == 1)
10637 return PyBool_FromLong(
10638 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010639
10640 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010641 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010642 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010643
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010644 for (i = 0; i < length; i++) {
10645 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010646 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010647 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010648 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010649}
10650
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010651PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010652 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010653\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010654Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010655and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010656
10657static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010658unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010659{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010660 int kind;
10661 void *data;
10662 Py_ssize_t len, i;
10663
10664 if (PyUnicode_READY(self) == -1)
10665 return NULL;
10666
10667 kind = PyUnicode_KIND(self);
10668 data = PyUnicode_DATA(self);
10669 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010670
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010671 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010672 if (len == 1) {
10673 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10674 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
10675 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010676
10677 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010678 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010679 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010680
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010681 for (i = 0; i < len; i++) {
10682 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010683 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010684 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010685 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010686 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010687}
10688
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010689PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010690 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010691\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010692Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010693False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010694
10695static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010696unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010697{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010698 Py_ssize_t i, length;
10699 int kind;
10700 void *data;
10701
10702 if (PyUnicode_READY(self) == -1)
10703 return NULL;
10704 length = PyUnicode_GET_LENGTH(self);
10705 kind = PyUnicode_KIND(self);
10706 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010707
Guido van Rossumd57fd912000-03-10 22:53:23 +000010708 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010709 if (length == 1)
10710 return PyBool_FromLong(
10711 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010712
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010713 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010714 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010715 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010716
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010717 for (i = 0; i < length; i++) {
10718 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010719 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010720 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010721 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010722}
10723
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010724PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010725 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010726\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010727Return True if all characters in S are digits\n\
10728and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010729
10730static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010731unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010732{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010733 Py_ssize_t i, length;
10734 int kind;
10735 void *data;
10736
10737 if (PyUnicode_READY(self) == -1)
10738 return NULL;
10739 length = PyUnicode_GET_LENGTH(self);
10740 kind = PyUnicode_KIND(self);
10741 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010742
Guido van Rossumd57fd912000-03-10 22:53:23 +000010743 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010744 if (length == 1) {
10745 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10746 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
10747 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010748
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010749 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010750 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010751 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010752
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010753 for (i = 0; i < length; i++) {
10754 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010755 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010756 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010757 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010758}
10759
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010760PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010761 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010762\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010763Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010764False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010765
10766static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010767unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010768{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010769 Py_ssize_t i, length;
10770 int kind;
10771 void *data;
10772
10773 if (PyUnicode_READY(self) == -1)
10774 return NULL;
10775 length = PyUnicode_GET_LENGTH(self);
10776 kind = PyUnicode_KIND(self);
10777 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010778
Guido van Rossumd57fd912000-03-10 22:53:23 +000010779 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010780 if (length == 1)
10781 return PyBool_FromLong(
10782 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010783
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010784 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010785 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010786 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010787
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010788 for (i = 0; i < length; i++) {
10789 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010790 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010791 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010792 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010793}
10794
Martin v. Löwis47383402007-08-15 07:32:56 +000010795int
10796PyUnicode_IsIdentifier(PyObject *self)
10797{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010798 int kind;
10799 void *data;
10800 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010801 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000010802
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010803 if (PyUnicode_READY(self) == -1) {
10804 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000010805 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010806 }
10807
10808 /* Special case for empty strings */
10809 if (PyUnicode_GET_LENGTH(self) == 0)
10810 return 0;
10811 kind = PyUnicode_KIND(self);
10812 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000010813
10814 /* PEP 3131 says that the first character must be in
10815 XID_Start and subsequent characters in XID_Continue,
10816 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000010817 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000010818 letters, digits, underscore). However, given the current
10819 definition of XID_Start and XID_Continue, it is sufficient
10820 to check just for these, except that _ must be allowed
10821 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010822 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050010823 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000010824 return 0;
10825
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040010826 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010827 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010828 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000010829 return 1;
10830}
10831
10832PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010833 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000010834\n\
10835Return True if S is a valid identifier according\n\
10836to the language definition.");
10837
10838static PyObject*
10839unicode_isidentifier(PyObject *self)
10840{
10841 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
10842}
10843
Georg Brandl559e5d72008-06-11 18:37:52 +000010844PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010845 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000010846\n\
10847Return True if all characters in S are considered\n\
10848printable in repr() or S is empty, False otherwise.");
10849
10850static PyObject*
10851unicode_isprintable(PyObject *self)
10852{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010853 Py_ssize_t i, length;
10854 int kind;
10855 void *data;
10856
10857 if (PyUnicode_READY(self) == -1)
10858 return NULL;
10859 length = PyUnicode_GET_LENGTH(self);
10860 kind = PyUnicode_KIND(self);
10861 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000010862
10863 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010864 if (length == 1)
10865 return PyBool_FromLong(
10866 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000010867
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010868 for (i = 0; i < length; i++) {
10869 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000010870 Py_RETURN_FALSE;
10871 }
10872 }
10873 Py_RETURN_TRUE;
10874}
10875
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010876PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000010877 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010878\n\
10879Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000010880iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010881
10882static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010883unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010884{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010885 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010886}
10887
Martin v. Löwis18e16552006-02-15 17:27:45 +000010888static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +000010889unicode_length(PyUnicodeObject *self)
10890{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010891 if (PyUnicode_READY(self) == -1)
10892 return -1;
10893 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010894}
10895
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010896PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010897 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010898\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000010899Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010900done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010901
10902static PyObject *
10903unicode_ljust(PyUnicodeObject *self, PyObject *args)
10904{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010905 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010906 Py_UCS4 fillchar = ' ';
10907
10908 if (PyUnicode_READY(self) == -1)
10909 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010910
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010911 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010912 return NULL;
10913
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010914 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010915 Py_INCREF(self);
10916 return (PyObject*) self;
10917 }
10918
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010919 return (PyObject*) pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010920}
10921
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010922PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010923 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010924\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010925Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010926
10927static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010928unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010929{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010930 return fixup(self, fixlower);
10931}
10932
/* Strip modes.  The numeric values double as indexes into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format string for each strip mode, indexed by the
   constants above. */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for a strip mode: the format string minus its 3-character
   "|O:" prefix. */
#define STRIPNAME(i) (stripformat[(i)] + 3)
10941
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010942/* externally visible for str.strip(unicode) */
10943PyObject *
10944_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
10945{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010946 void *data;
10947 int kind;
10948 Py_ssize_t i, j, len;
10949 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010950
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010951 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
10952 return NULL;
10953
10954 kind = PyUnicode_KIND(self);
10955 data = PyUnicode_DATA(self);
10956 len = PyUnicode_GET_LENGTH(self);
10957 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
10958 PyUnicode_DATA(sepobj),
10959 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010960
Benjamin Peterson14339b62009-01-31 16:36:08 +000010961 i = 0;
10962 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010963 while (i < len &&
10964 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010965 i++;
10966 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010967 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010968
Benjamin Peterson14339b62009-01-31 16:36:08 +000010969 j = len;
10970 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010971 do {
10972 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010973 } while (j >= i &&
10974 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000010975 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010976 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010977
Victor Stinner12bab6d2011-10-01 01:53:49 +020010978 return PyUnicode_Substring((PyObject*)self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010979}
10980
10981PyObject*
10982PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
10983{
10984 unsigned char *data;
10985 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020010986 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010987
Victor Stinnerde636f32011-10-01 03:55:54 +020010988 if (PyUnicode_READY(self) == -1)
10989 return NULL;
10990
10991 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
10992
Victor Stinner12bab6d2011-10-01 01:53:49 +020010993 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010994 {
Victor Stinner12bab6d2011-10-01 01:53:49 +020010995 if (PyUnicode_CheckExact(self)) {
10996 Py_INCREF(self);
10997 return self;
10998 }
10999 else
11000 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011001 }
11002
Victor Stinner12bab6d2011-10-01 01:53:49 +020011003 length = end - start;
11004 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011005 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011006
Victor Stinnerde636f32011-10-01 03:55:54 +020011007 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011008 PyErr_SetString(PyExc_IndexError, "string index out of range");
11009 return NULL;
11010 }
11011
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011012 kind = PyUnicode_KIND(self);
11013 data = PyUnicode_1BYTE_DATA(self);
Victor Stinner034f6cf2011-09-30 02:26:44 +020011014 return PyUnicode_FromKindAndData(kind,
11015 data + PyUnicode_KIND_SIZE(kind, start),
Victor Stinner12bab6d2011-10-01 01:53:49 +020011016 length);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011017}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011018
11019static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011020do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011021{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011022 int kind;
11023 void *data;
11024 Py_ssize_t len, i, j;
11025
11026 if (PyUnicode_READY(self) == -1)
11027 return NULL;
11028
11029 kind = PyUnicode_KIND(self);
11030 data = PyUnicode_DATA(self);
11031 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011032
Benjamin Peterson14339b62009-01-31 16:36:08 +000011033 i = 0;
11034 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011035 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011036 i++;
11037 }
11038 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011039
Benjamin Peterson14339b62009-01-31 16:36:08 +000011040 j = len;
11041 if (striptype != LEFTSTRIP) {
11042 do {
11043 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011044 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011045 j++;
11046 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011047
Victor Stinner12bab6d2011-10-01 01:53:49 +020011048 return PyUnicode_Substring((PyObject*)self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011049}
11050
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011051
11052static PyObject *
11053do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
11054{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011055 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011056
Benjamin Peterson14339b62009-01-31 16:36:08 +000011057 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11058 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011059
Benjamin Peterson14339b62009-01-31 16:36:08 +000011060 if (sep != NULL && sep != Py_None) {
11061 if (PyUnicode_Check(sep))
11062 return _PyUnicode_XStrip(self, striptype, sep);
11063 else {
11064 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011065 "%s arg must be None or str",
11066 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011067 return NULL;
11068 }
11069 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011070
Benjamin Peterson14339b62009-01-31 16:36:08 +000011071 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011072}
11073
11074
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011075PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011076 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011077\n\
11078Return a copy of the string S with leading and trailing\n\
11079whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011080If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011081
11082static PyObject *
11083unicode_strip(PyUnicodeObject *self, PyObject *args)
11084{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011085 if (PyTuple_GET_SIZE(args) == 0)
11086 return do_strip(self, BOTHSTRIP); /* Common case */
11087 else
11088 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011089}
11090
11091
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011092PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011093 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011094\n\
11095Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011096If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011097
11098static PyObject *
11099unicode_lstrip(PyUnicodeObject *self, PyObject *args)
11100{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011101 if (PyTuple_GET_SIZE(args) == 0)
11102 return do_strip(self, LEFTSTRIP); /* Common case */
11103 else
11104 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011105}
11106
11107
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011108PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011109 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011110\n\
11111Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011112If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011113
11114static PyObject *
11115unicode_rstrip(PyUnicodeObject *self, PyObject *args)
11116{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011117 if (PyTuple_GET_SIZE(args) == 0)
11118 return do_strip(self, RIGHTSTRIP); /* Common case */
11119 else
11120 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011121}
11122
11123
Guido van Rossumd57fd912000-03-10 22:53:23 +000011124static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +000011125unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011126{
11127 PyUnicodeObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011128 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011129
Georg Brandl222de0f2009-04-12 12:01:50 +000011130 if (len < 1) {
11131 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020011132 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000011133 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011134
Tim Peters7a29bd52001-09-12 03:03:31 +000011135 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011136 /* no repeat, return original string */
11137 Py_INCREF(str);
11138 return (PyObject*) str;
11139 }
Tim Peters8f422462000-09-09 06:13:41 +000011140
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011141 if (PyUnicode_READY(str) == -1)
11142 return NULL;
11143
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011144 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011145 PyErr_SetString(PyExc_OverflowError,
11146 "repeated string is too long");
11147 return NULL;
11148 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011149 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011150
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011151 u = (PyUnicodeObject *)PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011152 if (!u)
11153 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011154 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011155
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011156 if (PyUnicode_GET_LENGTH(str) == 1) {
11157 const int kind = PyUnicode_KIND(str);
11158 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
11159 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011160 if (kind == PyUnicode_1BYTE_KIND)
11161 memset(to, (unsigned char)fill_char, len);
11162 else {
11163 for (n = 0; n < len; ++n)
11164 PyUnicode_WRITE(kind, to, n, fill_char);
11165 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011166 }
11167 else {
11168 /* number of characters copied this far */
11169 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
11170 const Py_ssize_t char_size = PyUnicode_CHARACTER_SIZE(str);
11171 char *to = (char *) PyUnicode_DATA(u);
11172 Py_MEMCPY(to, PyUnicode_DATA(str),
11173 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011174 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011175 n = (done <= nchars-done) ? done : nchars-done;
11176 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011177 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011178 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011179 }
11180
11181 return (PyObject*) u;
11182}
11183
Alexander Belopolsky40018472011-02-26 01:02:56 +000011184PyObject *
11185PyUnicode_Replace(PyObject *obj,
11186 PyObject *subobj,
11187 PyObject *replobj,
11188 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011189{
11190 PyObject *self;
11191 PyObject *str1;
11192 PyObject *str2;
11193 PyObject *result;
11194
11195 self = PyUnicode_FromObject(obj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011196 if (self == NULL || PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011197 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011198 str1 = PyUnicode_FromObject(subobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011199 if (str1 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011200 Py_DECREF(self);
11201 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011202 }
11203 str2 = PyUnicode_FromObject(replobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011204 if (str2 == NULL || PyUnicode_READY(str2)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011205 Py_DECREF(self);
11206 Py_DECREF(str1);
11207 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011208 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011209 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011210 Py_DECREF(self);
11211 Py_DECREF(str1);
11212 Py_DECREF(str2);
11213 return result;
11214}
11215
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011216PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011217 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011218\n\
11219Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011220old replaced by new. If the optional argument count is\n\
11221given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011222
11223static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011224unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011225{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011226 PyObject *str1;
11227 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011228 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011229 PyObject *result;
11230
Martin v. Löwis18e16552006-02-15 17:27:45 +000011231 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011232 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011233 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011234 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011235 str1 = PyUnicode_FromObject(str1);
11236 if (str1 == NULL || PyUnicode_READY(str1) == -1)
11237 return NULL;
11238 str2 = PyUnicode_FromObject(str2);
Victor Stinnere9a29352011-10-01 02:14:59 +020011239 if (str2 == NULL || PyUnicode_READY(str2) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011240 Py_DECREF(str1);
11241 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011242 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011243
11244 result = replace(self, str1, str2, maxcount);
11245
11246 Py_DECREF(str1);
11247 Py_DECREF(str2);
11248 return result;
11249}
11250
Alexander Belopolsky40018472011-02-26 01:02:56 +000011251static PyObject *
11252unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011253{
Walter Dörwald79e913e2007-05-12 11:08:06 +000011254 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011255 Py_ssize_t isize;
11256 Py_ssize_t osize, squote, dquote, i, o;
11257 Py_UCS4 max, quote;
11258 int ikind, okind;
11259 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000011260
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011261 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000011262 return NULL;
11263
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011264 isize = PyUnicode_GET_LENGTH(unicode);
11265 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011266
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011267 /* Compute length of output, quote characters, and
11268 maximum character */
11269 osize = 2; /* quotes */
11270 max = 127;
11271 squote = dquote = 0;
11272 ikind = PyUnicode_KIND(unicode);
11273 for (i = 0; i < isize; i++) {
11274 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11275 switch (ch) {
11276 case '\'': squote++; osize++; break;
11277 case '"': dquote++; osize++; break;
11278 case '\\': case '\t': case '\r': case '\n':
11279 osize += 2; break;
11280 default:
11281 /* Fast-path ASCII */
11282 if (ch < ' ' || ch == 0x7f)
11283 osize += 4; /* \xHH */
11284 else if (ch < 0x7f)
11285 osize++;
11286 else if (Py_UNICODE_ISPRINTABLE(ch)) {
11287 osize++;
11288 max = ch > max ? ch : max;
11289 }
11290 else if (ch < 0x100)
11291 osize += 4; /* \xHH */
11292 else if (ch < 0x10000)
11293 osize += 6; /* \uHHHH */
11294 else
11295 osize += 10; /* \uHHHHHHHH */
11296 }
11297 }
11298
11299 quote = '\'';
11300 if (squote) {
11301 if (dquote)
11302 /* Both squote and dquote present. Use squote,
11303 and escape them */
11304 osize += squote;
11305 else
11306 quote = '"';
11307 }
11308
11309 repr = PyUnicode_New(osize, max);
11310 if (repr == NULL)
11311 return NULL;
11312 okind = PyUnicode_KIND(repr);
11313 odata = PyUnicode_DATA(repr);
11314
11315 PyUnicode_WRITE(okind, odata, 0, quote);
11316 PyUnicode_WRITE(okind, odata, osize-1, quote);
11317
11318 for (i = 0, o = 1; i < isize; i++) {
11319 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011320
11321 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011322 if ((ch == quote) || (ch == '\\')) {
11323 PyUnicode_WRITE(okind, odata, o++, '\\');
11324 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011325 continue;
11326 }
11327
Benjamin Peterson29060642009-01-31 22:14:21 +000011328 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011329 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011330 PyUnicode_WRITE(okind, odata, o++, '\\');
11331 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011332 }
11333 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011334 PyUnicode_WRITE(okind, odata, o++, '\\');
11335 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011336 }
11337 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011338 PyUnicode_WRITE(okind, odata, o++, '\\');
11339 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011340 }
11341
11342 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011343 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011344 PyUnicode_WRITE(okind, odata, o++, '\\');
11345 PyUnicode_WRITE(okind, odata, o++, 'x');
11346 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
11347 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011348 }
11349
Georg Brandl559e5d72008-06-11 18:37:52 +000011350 /* Copy ASCII characters as-is */
11351 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011352 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011353 }
11354
Benjamin Peterson29060642009-01-31 22:14:21 +000011355 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000011356 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011357 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000011358 (categories Z* and C* except ASCII space)
11359 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011360 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011361 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011362 if (ch <= 0xff) {
11363 PyUnicode_WRITE(okind, odata, o++, '\\');
11364 PyUnicode_WRITE(okind, odata, o++, 'x');
11365 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
11366 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011367 }
11368 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011369 else if (ch >= 0x10000) {
11370 PyUnicode_WRITE(okind, odata, o++, '\\');
11371 PyUnicode_WRITE(okind, odata, o++, 'U');
11372 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 28) & 0xF]);
11373 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 24) & 0xF]);
11374 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 20) & 0xF]);
11375 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 16) & 0xF]);
11376 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
11377 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
11378 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
11379 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011380 }
11381 /* Map 16-bit characters to '\uxxxx' */
11382 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011383 PyUnicode_WRITE(okind, odata, o++, '\\');
11384 PyUnicode_WRITE(okind, odata, o++, 'u');
11385 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
11386 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
11387 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
11388 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011389 }
11390 }
11391 /* Copy characters as-is */
11392 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011393 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011394 }
11395 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000011396 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011397 /* Closing quote already added at the beginning */
Walter Dörwald79e913e2007-05-12 11:08:06 +000011398 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011399}
11400
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011401PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011402 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011403\n\
11404Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011405such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011406arguments start and end are interpreted as in slice notation.\n\
11407\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011408Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011409
11410static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011411unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011412{
Jesus Ceaac451502011-04-20 17:09:23 +020011413 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011414 Py_ssize_t start;
11415 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011416 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011417
Jesus Ceaac451502011-04-20 17:09:23 +020011418 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
11419 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011420 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011421
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011422 if (PyUnicode_READY(self) == -1)
11423 return NULL;
11424 if (PyUnicode_READY(substring) == -1)
11425 return NULL;
11426
11427 result = any_find_slice(
11428 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
11429 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011430 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011431
11432 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011433
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011434 if (result == -2)
11435 return NULL;
11436
Christian Heimes217cfd12007-12-02 14:31:20 +000011437 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011438}
11439
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011440PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011441 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011442\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011443Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011444
11445static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011446unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011447{
Jesus Ceaac451502011-04-20 17:09:23 +020011448 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011449 Py_ssize_t start;
11450 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011451 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011452
Jesus Ceaac451502011-04-20 17:09:23 +020011453 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
11454 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011455 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011456
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011457 if (PyUnicode_READY(self) == -1)
11458 return NULL;
11459 if (PyUnicode_READY(substring) == -1)
11460 return NULL;
11461
11462 result = any_find_slice(
11463 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
11464 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011465 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011466
11467 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011468
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011469 if (result == -2)
11470 return NULL;
11471
Guido van Rossumd57fd912000-03-10 22:53:23 +000011472 if (result < 0) {
11473 PyErr_SetString(PyExc_ValueError, "substring not found");
11474 return NULL;
11475 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011476
Christian Heimes217cfd12007-12-02 14:31:20 +000011477 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011478}
11479
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011480PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011481 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011482\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011483Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011484done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011485
11486static PyObject *
11487unicode_rjust(PyUnicodeObject *self, PyObject *args)
11488{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011489 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011490 Py_UCS4 fillchar = ' ';
11491
Victor Stinnere9a29352011-10-01 02:14:59 +020011492 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011493 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011494
Victor Stinnere9a29352011-10-01 02:14:59 +020011495 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011496 return NULL;
11497
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011498 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011499 Py_INCREF(self);
11500 return (PyObject*) self;
11501 }
11502
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011503 return (PyObject*) pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011504}
11505
Alexander Belopolsky40018472011-02-26 01:02:56 +000011506PyObject *
11507PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011508{
11509 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000011510
Guido van Rossumd57fd912000-03-10 22:53:23 +000011511 s = PyUnicode_FromObject(s);
11512 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011513 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011514 if (sep != NULL) {
11515 sep = PyUnicode_FromObject(sep);
11516 if (sep == NULL) {
11517 Py_DECREF(s);
11518 return NULL;
11519 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011520 }
11521
11522 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
11523
11524 Py_DECREF(s);
11525 Py_XDECREF(sep);
11526 return result;
11527}
11528
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011529PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011530 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011531\n\
11532Return a list of the words in S, using sep as the\n\
11533delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000011534splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000011535whitespace string is a separator and empty strings are\n\
11536removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011537
11538static PyObject*
11539unicode_split(PyUnicodeObject *self, PyObject *args)
11540{
11541 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011542 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011543
Martin v. Löwis18e16552006-02-15 17:27:45 +000011544 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011545 return NULL;
11546
11547 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011548 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011549 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000011550 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011551 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011552 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011553}
11554
Thomas Wouters477c8d52006-05-27 19:21:47 +000011555PyObject *
11556PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
11557{
11558 PyObject* str_obj;
11559 PyObject* sep_obj;
11560 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011561 int kind1, kind2, kind;
11562 void *buf1 = NULL, *buf2 = NULL;
11563 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011564
11565 str_obj = PyUnicode_FromObject(str_in);
Victor Stinnere9a29352011-10-01 02:14:59 +020011566 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011567 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011568 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011569 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000011570 Py_DECREF(str_obj);
11571 return NULL;
11572 }
11573
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011574 kind1 = PyUnicode_KIND(str_in);
11575 kind2 = PyUnicode_KIND(sep_obj);
11576 kind = kind1 > kind2 ? kind1 : kind2;
11577 buf1 = PyUnicode_DATA(str_in);
11578 if (kind1 != kind)
11579 buf1 = _PyUnicode_AsKind(str_in, kind);
11580 if (!buf1)
11581 goto onError;
11582 buf2 = PyUnicode_DATA(sep_obj);
11583 if (kind2 != kind)
11584 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11585 if (!buf2)
11586 goto onError;
11587 len1 = PyUnicode_GET_LENGTH(str_obj);
11588 len2 = PyUnicode_GET_LENGTH(sep_obj);
11589
11590 switch(PyUnicode_KIND(str_in)) {
11591 case PyUnicode_1BYTE_KIND:
11592 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11593 break;
11594 case PyUnicode_2BYTE_KIND:
11595 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11596 break;
11597 case PyUnicode_4BYTE_KIND:
11598 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11599 break;
11600 default:
11601 assert(0);
11602 out = 0;
11603 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011604
11605 Py_DECREF(sep_obj);
11606 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011607 if (kind1 != kind)
11608 PyMem_Free(buf1);
11609 if (kind2 != kind)
11610 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011611
11612 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011613 onError:
11614 Py_DECREF(sep_obj);
11615 Py_DECREF(str_obj);
11616 if (kind1 != kind && buf1)
11617 PyMem_Free(buf1);
11618 if (kind2 != kind && buf2)
11619 PyMem_Free(buf2);
11620 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011621}
11622
11623
11624PyObject *
11625PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
11626{
11627 PyObject* str_obj;
11628 PyObject* sep_obj;
11629 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011630 int kind1, kind2, kind;
11631 void *buf1 = NULL, *buf2 = NULL;
11632 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011633
11634 str_obj = PyUnicode_FromObject(str_in);
11635 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000011636 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011637 sep_obj = PyUnicode_FromObject(sep_in);
11638 if (!sep_obj) {
11639 Py_DECREF(str_obj);
11640 return NULL;
11641 }
11642
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011643 kind1 = PyUnicode_KIND(str_in);
11644 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020011645 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011646 buf1 = PyUnicode_DATA(str_in);
11647 if (kind1 != kind)
11648 buf1 = _PyUnicode_AsKind(str_in, kind);
11649 if (!buf1)
11650 goto onError;
11651 buf2 = PyUnicode_DATA(sep_obj);
11652 if (kind2 != kind)
11653 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11654 if (!buf2)
11655 goto onError;
11656 len1 = PyUnicode_GET_LENGTH(str_obj);
11657 len2 = PyUnicode_GET_LENGTH(sep_obj);
11658
11659 switch(PyUnicode_KIND(str_in)) {
11660 case PyUnicode_1BYTE_KIND:
11661 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11662 break;
11663 case PyUnicode_2BYTE_KIND:
11664 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11665 break;
11666 case PyUnicode_4BYTE_KIND:
11667 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11668 break;
11669 default:
11670 assert(0);
11671 out = 0;
11672 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011673
11674 Py_DECREF(sep_obj);
11675 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011676 if (kind1 != kind)
11677 PyMem_Free(buf1);
11678 if (kind2 != kind)
11679 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011680
11681 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011682 onError:
11683 Py_DECREF(sep_obj);
11684 Py_DECREF(str_obj);
11685 if (kind1 != kind && buf1)
11686 PyMem_Free(buf1);
11687 if (kind2 != kind && buf2)
11688 PyMem_Free(buf2);
11689 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011690}
11691
11692PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011693 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011694\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011695Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011696the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011697found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011698
11699static PyObject*
11700unicode_partition(PyUnicodeObject *self, PyObject *separator)
11701{
11702 return PyUnicode_Partition((PyObject *)self, separator);
11703}
11704
11705PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000011706 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011707\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011708Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011709the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011710separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011711
11712static PyObject*
11713unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
11714{
11715 return PyUnicode_RPartition((PyObject *)self, separator);
11716}
11717
Alexander Belopolsky40018472011-02-26 01:02:56 +000011718PyObject *
11719PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011720{
11721 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011722
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011723 s = PyUnicode_FromObject(s);
11724 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011725 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011726 if (sep != NULL) {
11727 sep = PyUnicode_FromObject(sep);
11728 if (sep == NULL) {
11729 Py_DECREF(s);
11730 return NULL;
11731 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011732 }
11733
11734 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
11735
11736 Py_DECREF(s);
11737 Py_XDECREF(sep);
11738 return result;
11739}
11740
11741PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011742 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011743\n\
11744Return a list of the words in S, using sep as the\n\
11745delimiter string, starting at the end of the string and\n\
11746working to the front. If maxsplit is given, at most maxsplit\n\
11747splits are done. If sep is not specified, any whitespace string\n\
11748is a separator.");
11749
11750static PyObject*
11751unicode_rsplit(PyUnicodeObject *self, PyObject *args)
11752{
11753 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011754 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011755
Martin v. Löwis18e16552006-02-15 17:27:45 +000011756 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011757 return NULL;
11758
11759 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011760 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011761 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000011762 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011763 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011764 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011765}
11766
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011767PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011768 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011769\n\
11770Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000011771Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011772is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011773
11774static PyObject*
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011775unicode_splitlines(PyUnicodeObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011776{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011777 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000011778 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011779
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011780 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
11781 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011782 return NULL;
11783
Guido van Rossum86662912000-04-11 15:38:46 +000011784 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011785}
11786
11787static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000011788PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011789{
Walter Dörwald346737f2007-05-31 10:44:43 +000011790 if (PyUnicode_CheckExact(self)) {
11791 Py_INCREF(self);
11792 return self;
11793 } else
11794 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinner034f6cf2011-09-30 02:26:44 +020011795 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011796}
11797
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011798PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011799 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011800\n\
11801Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011802and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011803
11804static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011805unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011806{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011807 return fixup(self, fixswapcase);
11808}
11809
Georg Brandlceee0772007-11-27 23:48:05 +000011810PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011811 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011812\n\
11813Return a translation table usable for str.translate().\n\
11814If there is only one argument, it must be a dictionary mapping Unicode\n\
11815ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011816Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011817If there are two arguments, they must be strings of equal length, and\n\
11818in the resulting dictionary, each character in x will be mapped to the\n\
11819character at the same position in y. If there is a third argument, it\n\
11820must be a string, whose characters will be mapped to None in the result.");
11821
11822static PyObject*
11823unicode_maketrans(PyUnicodeObject *null, PyObject *args)
11824{
11825 PyObject *x, *y = NULL, *z = NULL;
11826 PyObject *new = NULL, *key, *value;
11827 Py_ssize_t i = 0;
11828 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011829
Georg Brandlceee0772007-11-27 23:48:05 +000011830 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
11831 return NULL;
11832 new = PyDict_New();
11833 if (!new)
11834 return NULL;
11835 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011836 int x_kind, y_kind, z_kind;
11837 void *x_data, *y_data, *z_data;
11838
Georg Brandlceee0772007-11-27 23:48:05 +000011839 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000011840 if (!PyUnicode_Check(x)) {
11841 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
11842 "be a string if there is a second argument");
11843 goto err;
11844 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011845 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011846 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
11847 "arguments must have equal length");
11848 goto err;
11849 }
11850 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011851 x_kind = PyUnicode_KIND(x);
11852 y_kind = PyUnicode_KIND(y);
11853 x_data = PyUnicode_DATA(x);
11854 y_data = PyUnicode_DATA(y);
11855 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
11856 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
11857 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011858 if (!key || !value)
11859 goto err;
11860 res = PyDict_SetItem(new, key, value);
11861 Py_DECREF(key);
11862 Py_DECREF(value);
11863 if (res < 0)
11864 goto err;
11865 }
11866 /* create entries for deleting chars in z */
11867 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011868 z_kind = PyUnicode_KIND(z);
11869 z_data = PyUnicode_DATA(z);
Georg Brandlceee0772007-11-27 23:48:05 +000011870 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011871 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011872 if (!key)
11873 goto err;
11874 res = PyDict_SetItem(new, key, Py_None);
11875 Py_DECREF(key);
11876 if (res < 0)
11877 goto err;
11878 }
11879 }
11880 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011881 int kind;
11882 void *data;
11883
Georg Brandlceee0772007-11-27 23:48:05 +000011884 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000011885 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011886 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
11887 "to maketrans it must be a dict");
11888 goto err;
11889 }
11890 /* copy entries into the new dict, converting string keys to int keys */
11891 while (PyDict_Next(x, &i, &key, &value)) {
11892 if (PyUnicode_Check(key)) {
11893 /* convert string keys to integer keys */
11894 PyObject *newkey;
11895 if (PyUnicode_GET_SIZE(key) != 1) {
11896 PyErr_SetString(PyExc_ValueError, "string keys in translate "
11897 "table must be of length 1");
11898 goto err;
11899 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011900 kind = PyUnicode_KIND(key);
11901 data = PyUnicode_DATA(key);
11902 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000011903 if (!newkey)
11904 goto err;
11905 res = PyDict_SetItem(new, newkey, value);
11906 Py_DECREF(newkey);
11907 if (res < 0)
11908 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000011909 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011910 /* just keep integer keys */
11911 if (PyDict_SetItem(new, key, value) < 0)
11912 goto err;
11913 } else {
11914 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
11915 "be strings or integers");
11916 goto err;
11917 }
11918 }
11919 }
11920 return new;
11921 err:
11922 Py_DECREF(new);
11923 return NULL;
11924}
11925
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011926PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011927 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011928\n\
11929Return a copy of the string S, where all characters have been mapped\n\
11930through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011931Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000011932Unmapped characters are left untouched. Characters mapped to None\n\
11933are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011934
11935static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011936unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011937{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011938 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011939}
11940
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011941PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011942 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011943\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011944Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011945
11946static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011947unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011948{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011949 return fixup(self, fixupper);
11950}
11951
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011952PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011953 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011954\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000011955Pad a numeric string S with zeros on the left, to fill a field\n\
11956of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011957
11958static PyObject *
11959unicode_zfill(PyUnicodeObject *self, PyObject *args)
11960{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011961 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011962 PyUnicodeObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011963 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011964 int kind;
11965 void *data;
11966 Py_UCS4 chr;
11967
11968 if (PyUnicode_READY(self) == -1)
11969 return NULL;
11970
Martin v. Löwis18e16552006-02-15 17:27:45 +000011971 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011972 return NULL;
11973
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011974 if (PyUnicode_GET_LENGTH(self) >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +000011975 if (PyUnicode_CheckExact(self)) {
11976 Py_INCREF(self);
11977 return (PyObject*) self;
11978 }
11979 else
Victor Stinner2219e0a2011-10-01 01:16:59 +020011980 return PyUnicode_Copy((PyObject*)self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011981 }
11982
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011983 fill = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011984
11985 u = pad(self, fill, 0, '0');
11986
Walter Dörwald068325e2002-04-15 13:36:47 +000011987 if (u == NULL)
11988 return NULL;
11989
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011990 kind = PyUnicode_KIND(u);
11991 data = PyUnicode_DATA(u);
11992 chr = PyUnicode_READ(kind, data, fill);
11993
11994 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011995 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011996 PyUnicode_WRITE(kind, data, 0, chr);
11997 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000011998 }
11999
12000 return (PyObject*) u;
12001}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012002
#if 0
/* Compiled out by the enclosing #if 0.  When enabled, returns the result
   of PyUnicode_TransformDecimalAndSpaceToASCII() applied to self. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *ascii = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return ascii;
}
#endif
12010
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012011PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012012 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012013\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012014Return True if S starts with the specified prefix, False otherwise.\n\
12015With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012016With optional end, stop comparing S at that position.\n\
12017prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012018
12019static PyObject *
12020unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012021 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012022{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012023 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012024 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012025 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012026 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012027 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012028
Jesus Ceaac451502011-04-20 17:09:23 +020012029 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012030 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012031 if (PyTuple_Check(subobj)) {
12032 Py_ssize_t i;
12033 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12034 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012035 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012036 if (substring == NULL)
12037 return NULL;
12038 result = tailmatch(self, substring, start, end, -1);
12039 Py_DECREF(substring);
12040 if (result) {
12041 Py_RETURN_TRUE;
12042 }
12043 }
12044 /* nothing matched */
12045 Py_RETURN_FALSE;
12046 }
12047 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012048 if (substring == NULL) {
12049 if (PyErr_ExceptionMatches(PyExc_TypeError))
12050 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12051 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012052 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012053 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012054 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012055 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012056 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012057}
12058
12059
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012060PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012061 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012062\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012063Return True if S ends with the specified suffix, False otherwise.\n\
12064With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012065With optional end, stop comparing S at that position.\n\
12066suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012067
12068static PyObject *
12069unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012070 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012071{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012072 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012073 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012074 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012075 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012076 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012077
Jesus Ceaac451502011-04-20 17:09:23 +020012078 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012079 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012080 if (PyTuple_Check(subobj)) {
12081 Py_ssize_t i;
12082 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12083 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012084 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012085 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012086 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012087 result = tailmatch(self, substring, start, end, +1);
12088 Py_DECREF(substring);
12089 if (result) {
12090 Py_RETURN_TRUE;
12091 }
12092 }
12093 Py_RETURN_FALSE;
12094 }
12095 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012096 if (substring == NULL) {
12097 if (PyErr_ExceptionMatches(PyExc_TypeError))
12098 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12099 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012100 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012101 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012102 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012103 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012104 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012105}
12106
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012107#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000012108
12109PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012110 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012111\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012112Return a formatted version of S, using substitutions from args and kwargs.\n\
12113The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000012114
Eric Smith27bbca62010-11-04 17:06:58 +000012115PyDoc_STRVAR(format_map__doc__,
12116 "S.format_map(mapping) -> str\n\
12117\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012118Return a formatted version of S, using substitutions from mapping.\n\
12119The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000012120
Eric Smith4a7d76d2008-05-30 18:10:19 +000012121static PyObject *
12122unicode__format__(PyObject* self, PyObject* args)
12123{
12124 PyObject *format_spec;
12125
12126 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12127 return NULL;
12128
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012129 return _PyUnicode_FormatAdvanced(self, format_spec, 0,
12130 PyUnicode_GET_LENGTH(format_spec));
Eric Smith4a7d76d2008-05-30 18:10:19 +000012131}
12132
Eric Smith8c663262007-08-25 02:26:07 +000012133PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012134 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012135\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012136Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000012137
12138static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012139unicode__sizeof__(PyUnicodeObject *v)
12140{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012141 Py_ssize_t size;
12142
12143 /* If it's a compact object, account for base structure +
12144 character data. */
12145 if (PyUnicode_IS_COMPACT_ASCII(v))
12146 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12147 else if (PyUnicode_IS_COMPACT(v))
12148 size = sizeof(PyCompactUnicodeObject) +
12149 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_CHARACTER_SIZE(v);
12150 else {
12151 /* If it is a two-block object, account for base object, and
12152 for character block if present. */
12153 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020012154 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012155 size += (PyUnicode_GET_LENGTH(v) + 1) *
12156 PyUnicode_CHARACTER_SIZE(v);
12157 }
12158 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020012159 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020012160 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012161 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020012162 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020012163 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012164
12165 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012166}
12167
12168PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012169 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012170
12171static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020012172unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012173{
Victor Stinner034f6cf2011-09-30 02:26:44 +020012174 PyObject *copy = PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012175 if (!copy)
12176 return NULL;
12177 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012178}
12179
Guido van Rossumd57fd912000-03-10 22:53:23 +000012180static PyMethodDef unicode_methods[] = {
12181
12182 /* Order is according to common usage: often used methods should
12183 appear first, since lookup is done sequentially. */
12184
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000012185 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012186 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
12187 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012188 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012189 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
12190 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
12191 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
12192 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
12193 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
12194 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
12195 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012196 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012197 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
12198 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
12199 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012200 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012201 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
12202 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
12203 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012204 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012205 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012206 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012207 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012208 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
12209 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
12210 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
12211 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
12212 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
12213 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
12214 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
12215 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
12216 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
12217 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
12218 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
12219 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
12220 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
12221 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000012222 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000012223 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012224 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000012225 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000012226 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000012227 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000012228 {"maketrans", (PyCFunction) unicode_maketrans,
12229 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012230 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000012231#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012232 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012233#endif
12234
12235#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012236 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012237 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012238#endif
12239
Benjamin Peterson14339b62009-01-31 16:36:08 +000012240 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012241 {NULL, NULL}
12242};
12243
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012244static PyObject *
12245unicode_mod(PyObject *v, PyObject *w)
12246{
Brian Curtindfc80e32011-08-10 20:28:54 -050012247 if (!PyUnicode_Check(v))
12248 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000012249 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012250}
12251
12252static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012253 0, /*nb_add*/
12254 0, /*nb_subtract*/
12255 0, /*nb_multiply*/
12256 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012257};
12258
Guido van Rossumd57fd912000-03-10 22:53:23 +000012259static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012260 (lenfunc) unicode_length, /* sq_length */
12261 PyUnicode_Concat, /* sq_concat */
12262 (ssizeargfunc) unicode_repeat, /* sq_repeat */
12263 (ssizeargfunc) unicode_getitem, /* sq_item */
12264 0, /* sq_slice */
12265 0, /* sq_ass_item */
12266 0, /* sq_ass_slice */
12267 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012268};
12269
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012270static PyObject*
12271unicode_subscript(PyUnicodeObject* self, PyObject* item)
12272{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012273 if (PyUnicode_READY(self) == -1)
12274 return NULL;
12275
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000012276 if (PyIndex_Check(item)) {
12277 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012278 if (i == -1 && PyErr_Occurred())
12279 return NULL;
12280 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012281 i += PyUnicode_GET_LENGTH(self);
Victor Stinner2fe5ced2011-10-02 00:25:40 +020012282 return unicode_getitem((PyObject*)self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012283 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000012284 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012285 PyObject *result;
12286 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012287 int src_kind, dest_kind;
12288 Py_UCS4 ch, max_char;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012289
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012290 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000012291 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012292 return NULL;
12293 }
12294
12295 if (slicelength <= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012296 return PyUnicode_New(0, 0);
12297 } else if (start == 0 && step == 1 &&
12298 slicelength == PyUnicode_GET_LENGTH(self) &&
Thomas Woutersed03b412007-08-28 21:37:11 +000012299 PyUnicode_CheckExact(self)) {
12300 Py_INCREF(self);
12301 return (PyObject *)self;
12302 } else if (step == 1) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012303 return PyUnicode_Substring((PyObject*)self,
12304 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012305 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012306 /* General case */
12307 max_char = 127;
12308 src_kind = PyUnicode_KIND(self);
12309 src_data = PyUnicode_DATA(self);
12310 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
12311 ch = PyUnicode_READ(src_kind, src_data, cur);
12312 if (ch > max_char)
12313 max_char = ch;
12314 }
12315 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012316 if (result == NULL)
12317 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012318 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012319 dest_data = PyUnicode_DATA(result);
12320
12321 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012322 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
12323 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012324 }
12325 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012326 } else {
12327 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
12328 return NULL;
12329 }
12330}
12331
12332static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012333 (lenfunc)unicode_length, /* mp_length */
12334 (binaryfunc)unicode_subscript, /* mp_subscript */
12335 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012336};
12337
Guido van Rossumd57fd912000-03-10 22:53:23 +000012338
Guido van Rossumd57fd912000-03-10 22:53:23 +000012339/* Helpers for PyUnicode_Format() */
12340
12341static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000012342getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012343{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012344 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012345 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012346 (*p_argidx)++;
12347 if (arglen < 0)
12348 return args;
12349 else
12350 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012351 }
12352 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012353 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012354 return NULL;
12355}
12356
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012357/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012358
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012359static PyObject *
12360formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012361{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012362 char *p;
12363 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012364 double x;
Tim Petersced69f82003-09-16 20:30:58 +000012365
Guido van Rossumd57fd912000-03-10 22:53:23 +000012366 x = PyFloat_AsDouble(v);
12367 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012368 return NULL;
12369
Guido van Rossumd57fd912000-03-10 22:53:23 +000012370 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012371 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000012372
Eric Smith0923d1d2009-04-16 20:16:10 +000012373 p = PyOS_double_to_string(x, type, prec,
12374 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012375 if (p == NULL)
12376 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012377 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000012378 PyMem_Free(p);
12379 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012380}
12381
Tim Peters38fd5b62000-09-21 05:43:11 +000012382static PyObject*
12383formatlong(PyObject *val, int flags, int prec, int type)
12384{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012385 char *buf;
12386 int len;
12387 PyObject *str; /* temporary string object. */
12388 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000012389
Benjamin Peterson14339b62009-01-31 16:36:08 +000012390 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
12391 if (!str)
12392 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012393 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012394 Py_DECREF(str);
12395 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000012396}
12397
Guido van Rossumd57fd912000-03-10 22:53:23 +000012398static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012399formatchar(Py_UCS4 *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012400 size_t buflen,
12401 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012402{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000012403 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012404 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012405 if (PyUnicode_GET_LENGTH(v) == 1) {
12406 buf[0] = PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000012407 buf[1] = '\0';
12408 return 1;
12409 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012410 goto onError;
12411 }
12412 else {
12413 /* Integer input truncated to a character */
12414 long x;
12415 x = PyLong_AsLong(v);
12416 if (x == -1 && PyErr_Occurred())
12417 goto onError;
12418
12419 if (x < 0 || x > 0x10ffff) {
12420 PyErr_SetString(PyExc_OverflowError,
12421 "%c arg not in range(0x110000)");
12422 return -1;
12423 }
12424
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012425 buf[0] = (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012426 buf[1] = '\0';
12427 return 1;
12428 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000012429
Benjamin Peterson29060642009-01-31 22:14:21 +000012430 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012431 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012432 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012433 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012434}
12435
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012436/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012437 FORMATBUFLEN is the length of the buffer in which chars are formatted.
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012438*/
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012439#define FORMATBUFLEN (size_t)10
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012440
Alexander Belopolsky40018472011-02-26 01:02:56 +000012441PyObject *
12442PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012443{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012444 void *fmt;
12445 int fmtkind;
12446 PyObject *result;
12447 Py_UCS4 *res, *res0;
12448 Py_UCS4 max;
12449 int kind;
12450 Py_ssize_t fmtcnt, fmtpos, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012451 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012452 PyObject *dict = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012453 PyUnicodeObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +000012454
Guido van Rossumd57fd912000-03-10 22:53:23 +000012455 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012456 PyErr_BadInternalCall();
12457 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012458 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012459 uformat = (PyUnicodeObject*)PyUnicode_FromObject(format);
12460 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012461 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012462 fmt = PyUnicode_DATA(uformat);
12463 fmtkind = PyUnicode_KIND(uformat);
12464 fmtcnt = PyUnicode_GET_LENGTH(uformat);
12465 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012466
12467 reslen = rescnt = fmtcnt + 100;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012468 res = res0 = PyMem_Malloc(reslen * sizeof(Py_UCS4));
12469 if (res0 == NULL) {
12470 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012471 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012472 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012473
12474 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012475 arglen = PyTuple_Size(args);
12476 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012477 }
12478 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012479 arglen = -1;
12480 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012481 }
Christian Heimes90aa7642007-12-19 02:45:37 +000012482 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000012483 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000012484 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012485
12486 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012487 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012488 if (--rescnt < 0) {
12489 rescnt = fmtcnt + 100;
12490 reslen += rescnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012491 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12492 if (res0 == NULL){
12493 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012494 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012495 }
12496 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000012497 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012498 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012499 *res++ = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012500 }
12501 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012502 /* Got a format specifier */
12503 int flags = 0;
12504 Py_ssize_t width = -1;
12505 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012506 Py_UCS4 c = '\0';
12507 Py_UCS4 fill;
Benjamin Peterson29060642009-01-31 22:14:21 +000012508 int isnumok;
12509 PyObject *v = NULL;
12510 PyObject *temp = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012511 void *pbuf;
12512 Py_ssize_t pindex;
Benjamin Peterson29060642009-01-31 22:14:21 +000012513 Py_UNICODE sign;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012514 Py_ssize_t len, len1;
12515 Py_UCS4 formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012516
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012517 fmtpos++;
12518 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
12519 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000012520 Py_ssize_t keylen;
12521 PyObject *key;
12522 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000012523
Benjamin Peterson29060642009-01-31 22:14:21 +000012524 if (dict == NULL) {
12525 PyErr_SetString(PyExc_TypeError,
12526 "format requires a mapping");
12527 goto onError;
12528 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012529 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012530 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012531 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012532 /* Skip over balanced parentheses */
12533 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012534 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000012535 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012536 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000012537 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012538 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000012539 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012540 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000012541 if (fmtcnt < 0 || pcount > 0) {
12542 PyErr_SetString(PyExc_ValueError,
12543 "incomplete format key");
12544 goto onError;
12545 }
Victor Stinner12bab6d2011-10-01 01:53:49 +020012546 key = PyUnicode_Substring((PyObject*)uformat,
12547 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000012548 if (key == NULL)
12549 goto onError;
12550 if (args_owned) {
12551 Py_DECREF(args);
12552 args_owned = 0;
12553 }
12554 args = PyObject_GetItem(dict, key);
12555 Py_DECREF(key);
12556 if (args == NULL) {
12557 goto onError;
12558 }
12559 args_owned = 1;
12560 arglen = -1;
12561 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012562 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012563 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012564 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012565 case '-': flags |= F_LJUST; continue;
12566 case '+': flags |= F_SIGN; continue;
12567 case ' ': flags |= F_BLANK; continue;
12568 case '#': flags |= F_ALT; continue;
12569 case '0': flags |= F_ZERO; continue;
12570 }
12571 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012572 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012573 if (c == '*') {
12574 v = getnextarg(args, arglen, &argidx);
12575 if (v == NULL)
12576 goto onError;
12577 if (!PyLong_Check(v)) {
12578 PyErr_SetString(PyExc_TypeError,
12579 "* wants int");
12580 goto onError;
12581 }
12582 width = PyLong_AsLong(v);
12583 if (width == -1 && PyErr_Occurred())
12584 goto onError;
12585 if (width < 0) {
12586 flags |= F_LJUST;
12587 width = -width;
12588 }
12589 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012590 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012591 }
12592 else if (c >= '0' && c <= '9') {
12593 width = c - '0';
12594 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012595 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012596 if (c < '0' || c > '9')
12597 break;
12598 if ((width*10) / 10 != width) {
12599 PyErr_SetString(PyExc_ValueError,
12600 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000012601 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000012602 }
12603 width = width*10 + (c - '0');
12604 }
12605 }
12606 if (c == '.') {
12607 prec = 0;
12608 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012609 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012610 if (c == '*') {
12611 v = getnextarg(args, arglen, &argidx);
12612 if (v == NULL)
12613 goto onError;
12614 if (!PyLong_Check(v)) {
12615 PyErr_SetString(PyExc_TypeError,
12616 "* wants int");
12617 goto onError;
12618 }
12619 prec = PyLong_AsLong(v);
12620 if (prec == -1 && PyErr_Occurred())
12621 goto onError;
12622 if (prec < 0)
12623 prec = 0;
12624 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012625 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012626 }
12627 else if (c >= '0' && c <= '9') {
12628 prec = c - '0';
12629 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012630 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012631 if (c < '0' || c > '9')
12632 break;
12633 if ((prec*10) / 10 != prec) {
12634 PyErr_SetString(PyExc_ValueError,
12635 "prec too big");
12636 goto onError;
12637 }
12638 prec = prec*10 + (c - '0');
12639 }
12640 }
12641 } /* prec */
12642 if (fmtcnt >= 0) {
12643 if (c == 'h' || c == 'l' || c == 'L') {
12644 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012645 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012646 }
12647 }
12648 if (fmtcnt < 0) {
12649 PyErr_SetString(PyExc_ValueError,
12650 "incomplete format");
12651 goto onError;
12652 }
12653 if (c != '%') {
12654 v = getnextarg(args, arglen, &argidx);
12655 if (v == NULL)
12656 goto onError;
12657 }
12658 sign = 0;
12659 fill = ' ';
12660 switch (c) {
12661
12662 case '%':
12663 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012664 kind = PyUnicode_4BYTE_KIND;
Benjamin Peterson29060642009-01-31 22:14:21 +000012665 /* presume that buffer length is at least 1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012666 PyUnicode_WRITE(kind, pbuf, 0, '%');
Benjamin Peterson29060642009-01-31 22:14:21 +000012667 len = 1;
12668 break;
12669
12670 case 's':
12671 case 'r':
12672 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000012673 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012674 temp = v;
12675 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012676 }
12677 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012678 if (c == 's')
12679 temp = PyObject_Str(v);
12680 else if (c == 'r')
12681 temp = PyObject_Repr(v);
12682 else
12683 temp = PyObject_ASCII(v);
12684 if (temp == NULL)
12685 goto onError;
12686 if (PyUnicode_Check(temp))
12687 /* nothing to do */;
12688 else {
12689 Py_DECREF(temp);
12690 PyErr_SetString(PyExc_TypeError,
12691 "%s argument has non-string str()");
12692 goto onError;
12693 }
12694 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012695 if (PyUnicode_READY(temp) == -1) {
12696 Py_CLEAR(temp);
12697 goto onError;
12698 }
12699 pbuf = PyUnicode_DATA(temp);
12700 kind = PyUnicode_KIND(temp);
12701 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012702 if (prec >= 0 && len > prec)
12703 len = prec;
12704 break;
12705
12706 case 'i':
12707 case 'd':
12708 case 'u':
12709 case 'o':
12710 case 'x':
12711 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000012712 isnumok = 0;
12713 if (PyNumber_Check(v)) {
12714 PyObject *iobj=NULL;
12715
12716 if (PyLong_Check(v)) {
12717 iobj = v;
12718 Py_INCREF(iobj);
12719 }
12720 else {
12721 iobj = PyNumber_Long(v);
12722 }
12723 if (iobj!=NULL) {
12724 if (PyLong_Check(iobj)) {
12725 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070012726 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000012727 Py_DECREF(iobj);
12728 if (!temp)
12729 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012730 if (PyUnicode_READY(temp) == -1) {
12731 Py_CLEAR(temp);
12732 goto onError;
12733 }
12734 pbuf = PyUnicode_DATA(temp);
12735 kind = PyUnicode_KIND(temp);
12736 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012737 sign = 1;
12738 }
12739 else {
12740 Py_DECREF(iobj);
12741 }
12742 }
12743 }
12744 if (!isnumok) {
12745 PyErr_Format(PyExc_TypeError,
12746 "%%%c format: a number is required, "
12747 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
12748 goto onError;
12749 }
12750 if (flags & F_ZERO)
12751 fill = '0';
12752 break;
12753
12754 case 'e':
12755 case 'E':
12756 case 'f':
12757 case 'F':
12758 case 'g':
12759 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012760 temp = formatfloat(v, flags, prec, c);
12761 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000012762 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012763 if (PyUnicode_READY(temp) == -1) {
12764 Py_CLEAR(temp);
12765 goto onError;
12766 }
12767 pbuf = PyUnicode_DATA(temp);
12768 kind = PyUnicode_KIND(temp);
12769 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012770 sign = 1;
12771 if (flags & F_ZERO)
12772 fill = '0';
12773 break;
12774
12775 case 'c':
12776 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012777 kind = PyUnicode_4BYTE_KIND;
Victor Stinnerb9dcffb2011-09-29 00:39:24 +020012778 len = formatchar(pbuf, Py_ARRAY_LENGTH(formatbuf), v);
Benjamin Peterson29060642009-01-31 22:14:21 +000012779 if (len < 0)
12780 goto onError;
12781 break;
12782
12783 default:
12784 PyErr_Format(PyExc_ValueError,
12785 "unsupported format character '%c' (0x%x) "
12786 "at index %zd",
12787 (31<=c && c<=126) ? (char)c : '?',
12788 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012789 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000012790 goto onError;
12791 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012792 /* pbuf is initialized here. */
12793 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000012794 if (sign) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012795 if (PyUnicode_READ(kind, pbuf, pindex) == '-' ||
12796 PyUnicode_READ(kind, pbuf, pindex) == '+') {
12797 sign = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012798 len--;
12799 }
12800 else if (flags & F_SIGN)
12801 sign = '+';
12802 else if (flags & F_BLANK)
12803 sign = ' ';
12804 else
12805 sign = 0;
12806 }
12807 if (width < len)
12808 width = len;
12809 if (rescnt - (sign != 0) < width) {
12810 reslen -= rescnt;
12811 rescnt = width + fmtcnt + 100;
12812 reslen += rescnt;
12813 if (reslen < 0) {
12814 Py_XDECREF(temp);
12815 PyErr_NoMemory();
12816 goto onError;
12817 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012818 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12819 if (res0 == 0) {
12820 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012821 Py_XDECREF(temp);
12822 goto onError;
12823 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012824 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000012825 }
12826 if (sign) {
12827 if (fill != ' ')
12828 *res++ = sign;
12829 rescnt--;
12830 if (width > len)
12831 width--;
12832 }
12833 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012834 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12835 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000012836 if (fill != ' ') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012837 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12838 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012839 }
12840 rescnt -= 2;
12841 width -= 2;
12842 if (width < 0)
12843 width = 0;
12844 len -= 2;
12845 }
12846 if (width > len && !(flags & F_LJUST)) {
12847 do {
12848 --rescnt;
12849 *res++ = fill;
12850 } while (--width > len);
12851 }
12852 if (fill == ' ') {
12853 if (sign)
12854 *res++ = sign;
12855 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012856 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12857 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
12858 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12859 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012860 }
12861 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012862 /* Copy all characters, preserving len */
12863 len1 = len;
12864 while (len1--) {
12865 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12866 rescnt--;
12867 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012868 while (--width >= len) {
12869 --rescnt;
12870 *res++ = ' ';
12871 }
12872 if (dict && (argidx < arglen) && c != '%') {
12873 PyErr_SetString(PyExc_TypeError,
12874 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +000012875 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012876 goto onError;
12877 }
12878 Py_XDECREF(temp);
12879 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012880 } /* until end */
12881 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012882 PyErr_SetString(PyExc_TypeError,
12883 "not all arguments converted during string formatting");
12884 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012885 }
12886
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012887
12888 for (max=0, res = res0; res < res0+reslen-rescnt; res++)
12889 if (*res > max)
12890 max = *res;
12891 result = PyUnicode_New(reslen - rescnt, max);
12892 if (!result)
Benjamin Peterson29060642009-01-31 22:14:21 +000012893 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012894 kind = PyUnicode_KIND(result);
12895 for (res = res0; res < res0+reslen-rescnt; res++)
12896 PyUnicode_WRITE(kind, PyUnicode_DATA(result), res-res0, *res);
12897 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012898 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012899 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012900 }
12901 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012902 return (PyObject *)result;
12903
Benjamin Peterson29060642009-01-31 22:14:21 +000012904 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012905 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012906 Py_DECREF(uformat);
12907 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012908 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012909 }
12910 return NULL;
12911}
12912
Jeremy Hylton938ace62002-07-17 16:30:39 +000012913static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000012914unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
12915
Tim Peters6d6c1a32001-08-02 04:15:00 +000012916static PyObject *
12917unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12918{
Benjamin Peterson29060642009-01-31 22:14:21 +000012919 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012920 static char *kwlist[] = {"object", "encoding", "errors", 0};
12921 char *encoding = NULL;
12922 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000012923
Benjamin Peterson14339b62009-01-31 16:36:08 +000012924 if (type != &PyUnicode_Type)
12925 return unicode_subtype_new(type, args, kwds);
12926 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000012927 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012928 return NULL;
12929 if (x == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012930 return (PyObject *)PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012931 if (encoding == NULL && errors == NULL)
12932 return PyObject_Str(x);
12933 else
Benjamin Peterson29060642009-01-31 22:14:21 +000012934 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000012935}
12936
Guido van Rossume023fe02001-08-30 03:12:59 +000012937static PyObject *
12938unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12939{
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012940 PyUnicodeObject *unicode, *self;
12941 Py_ssize_t length, char_size;
12942 int share_wstr, share_utf8;
12943 unsigned int kind;
12944 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000012945
Benjamin Peterson14339b62009-01-31 16:36:08 +000012946 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012947
12948 unicode = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
12949 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012950 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020012951 assert(_PyUnicode_CHECK(unicode));
Victor Stinnere06e1452011-10-04 20:52:31 +020012952 if (PyUnicode_READY(unicode))
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012953 return NULL;
12954
12955 self = (PyUnicodeObject *) type->tp_alloc(type, 0);
12956 if (self == NULL) {
12957 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012958 return NULL;
12959 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012960 kind = PyUnicode_KIND(unicode);
12961 length = PyUnicode_GET_LENGTH(unicode);
12962
12963 _PyUnicode_LENGTH(self) = length;
12964 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
12965 _PyUnicode_STATE(self).interned = 0;
12966 _PyUnicode_STATE(self).kind = kind;
12967 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020012968 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012969 _PyUnicode_STATE(self).ready = 1;
12970 _PyUnicode_WSTR(self) = NULL;
12971 _PyUnicode_UTF8_LENGTH(self) = 0;
12972 _PyUnicode_UTF8(self) = NULL;
12973 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020012974 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012975
12976 share_utf8 = 0;
12977 share_wstr = 0;
12978 if (kind == PyUnicode_1BYTE_KIND) {
12979 char_size = 1;
12980 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
12981 share_utf8 = 1;
12982 }
12983 else if (kind == PyUnicode_2BYTE_KIND) {
12984 char_size = 2;
12985 if (sizeof(wchar_t) == 2)
12986 share_wstr = 1;
12987 }
12988 else {
12989 assert(kind == PyUnicode_4BYTE_KIND);
12990 char_size = 4;
12991 if (sizeof(wchar_t) == 4)
12992 share_wstr = 1;
12993 }
12994
12995 /* Ensure we won't overflow the length. */
12996 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
12997 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012998 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012999 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013000 data = PyObject_MALLOC((length + 1) * char_size);
13001 if (data == NULL) {
13002 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013003 goto onError;
13004 }
13005
Victor Stinnerc3c74152011-10-02 20:39:55 +020013006 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013007 if (share_utf8) {
13008 _PyUnicode_UTF8_LENGTH(self) = length;
13009 _PyUnicode_UTF8(self) = data;
13010 }
13011 if (share_wstr) {
13012 _PyUnicode_WSTR_LENGTH(self) = length;
13013 _PyUnicode_WSTR(self) = (wchar_t *)data;
13014 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013015
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013016 Py_MEMCPY(data, PyUnicode_DATA(unicode),
13017 PyUnicode_KIND_SIZE(kind, length + 1));
13018 Py_DECREF(unicode);
13019 return (PyObject *)self;
13020
13021onError:
13022 Py_DECREF(unicode);
13023 Py_DECREF(self);
13024 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000013025}
13026
/* Docstring of the str type; referenced by the tp_doc slot of
   PyUnicode_Type. */
PyDoc_STRVAR(unicode_doc,
             "str(string[, encoding[, errors]]) -> str\n\
\n\
Create a new string object from the given encoded string.\n\
encoding defaults to the current default string encoding.\n\
errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000013033
/* Defined further down in this file; needed here for the tp_iter slot. */
static PyObject *unicode_iter(PyObject *seq);

/* Type object of str.  Instances are created through unicode_new() and
   released through unicode_dealloc(); the type may be subclassed
   (Py_TPFLAGS_BASETYPE). */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",                          /* tp_name */
    sizeof(PyUnicodeObject),        /* tp_basicsize */
    0,                              /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,    /* tp_dealloc */
    0,                              /* tp_print */
    0,                              /* tp_getattr */
    0,                              /* tp_setattr */
    0,                              /* tp_reserved */
    unicode_repr,                   /* tp_repr */
    &unicode_as_number,             /* tp_as_number */
    &unicode_as_sequence,           /* tp_as_sequence */
    &unicode_as_mapping,            /* tp_as_mapping */
    (hashfunc) unicode_hash,        /* tp_hash*/
    0,                              /* tp_call*/
    (reprfunc) unicode_str,         /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                              /* tp_setattro */
    0,                              /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS,    /* tp_flags */
    unicode_doc,                    /* tp_doc */
    0,                              /* tp_traverse */
    0,                              /* tp_clear */
    PyUnicode_RichCompare,          /* tp_richcompare */
    0,                              /* tp_weaklistoffset */
    unicode_iter,                   /* tp_iter */
    0,                              /* tp_iternext */
    unicode_methods,                /* tp_methods */
    0,                              /* tp_members */
    0,                              /* tp_getset */
    &PyBaseObject_Type,             /* tp_base */
    0,                              /* tp_dict */
    0,                              /* tp_descr_get */
    0,                              /* tp_descr_set */
    0,                              /* tp_dictoffset */
    0,                              /* tp_init */
    0,                              /* tp_alloc */
    unicode_new,                    /* tp_new */
    PyObject_Del,                   /* tp_free */
};
13079
13080/* Initialize the Unicode implementation */
13081
Thomas Wouters78890102000-07-22 19:25:51 +000013082void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013083{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013084 int i;
13085
Thomas Wouters477c8d52006-05-27 19:21:47 +000013086 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013087 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000013088 0x000A, /* LINE FEED */
13089 0x000D, /* CARRIAGE RETURN */
13090 0x001C, /* FILE SEPARATOR */
13091 0x001D, /* GROUP SEPARATOR */
13092 0x001E, /* RECORD SEPARATOR */
13093 0x0085, /* NEXT LINE */
13094 0x2028, /* LINE SEPARATOR */
13095 0x2029, /* PARAGRAPH SEPARATOR */
13096 };
13097
Fred Drakee4315f52000-05-09 19:53:39 +000013098 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020013099 unicode_empty = PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013100 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013101 Py_FatalError("Can't create empty string");
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013102
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013103 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000013104 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000013105 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013106 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000013107
13108 /* initialize the linebreak bloom filter */
13109 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013110 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020013111 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013112
13113 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013114}
13115
13116/* Finalize the Unicode implementation */
13117
/* Performs no work and always returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    return 0;
}
13123
Guido van Rossumd57fd912000-03-10 22:53:23 +000013124void
Thomas Wouters78890102000-07-22 19:25:51 +000013125_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013126{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013127 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013128
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000013129 Py_XDECREF(unicode_empty);
13130 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000013131
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013132 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013133 if (unicode_latin1[i]) {
13134 Py_DECREF(unicode_latin1[i]);
13135 unicode_latin1[i] = NULL;
13136 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013137 }
Christian Heimesa156e092008-02-16 07:38:31 +000013138 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000013139}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000013140
Walter Dörwald16807132007-05-25 13:52:07 +000013141void
13142PyUnicode_InternInPlace(PyObject **p)
13143{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013144 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
13145 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020013146#ifdef Py_DEBUG
13147 assert(s != NULL);
13148 assert(_PyUnicode_CHECK(s));
13149#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000013150 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020013151 return;
13152#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000013153 /* If it's a subclass, we don't really know what putting
13154 it in the interned dict might do. */
13155 if (!PyUnicode_CheckExact(s))
13156 return;
13157 if (PyUnicode_CHECK_INTERNED(s))
13158 return;
Victor Stinner1b4f9ce2011-10-03 13:28:14 +020013159 if (_PyUnicode_READY_REPLACE(p)) {
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013160 assert(0 && "_PyUnicode_READY_REPLACE fail in PyUnicode_InternInPlace");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013161 return;
13162 }
Victor Stinner1b4f9ce2011-10-03 13:28:14 +020013163 s = (PyUnicodeObject *)(*p);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013164 if (interned == NULL) {
13165 interned = PyDict_New();
13166 if (interned == NULL) {
13167 PyErr_Clear(); /* Don't leave an exception */
13168 return;
13169 }
13170 }
13171 /* It might be that the GetItem call fails even
13172 though the key is present in the dictionary,
13173 namely when this happens during a stack overflow. */
13174 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +000013175 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013176 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000013177
Benjamin Peterson29060642009-01-31 22:14:21 +000013178 if (t) {
13179 Py_INCREF(t);
13180 Py_DECREF(*p);
13181 *p = t;
13182 return;
13183 }
Walter Dörwald16807132007-05-25 13:52:07 +000013184
Benjamin Peterson14339b62009-01-31 16:36:08 +000013185 PyThreadState_GET()->recursion_critical = 1;
13186 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
13187 PyErr_Clear();
13188 PyThreadState_GET()->recursion_critical = 0;
13189 return;
13190 }
13191 PyThreadState_GET()->recursion_critical = 0;
13192 /* The two references in interned are not counted by refcnt.
13193 The deallocator will take care of this */
13194 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013195 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000013196}
13197
13198void
13199PyUnicode_InternImmortal(PyObject **p)
13200{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013201 PyUnicodeObject *u = (PyUnicodeObject *)*p;
13202
Benjamin Peterson14339b62009-01-31 16:36:08 +000013203 PyUnicode_InternInPlace(p);
13204 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013205 _PyUnicode_STATE(u).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013206 Py_INCREF(*p);
13207 }
Walter Dörwald16807132007-05-25 13:52:07 +000013208}
13209
13210PyObject *
13211PyUnicode_InternFromString(const char *cp)
13212{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013213 PyObject *s = PyUnicode_FromString(cp);
13214 if (s == NULL)
13215 return NULL;
13216 PyUnicode_InternInPlace(&s);
13217 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000013218}
13219
Alexander Belopolsky40018472011-02-26 01:02:56 +000013220void
13221_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000013222{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013223 PyObject *keys;
13224 PyUnicodeObject *s;
13225 Py_ssize_t i, n;
13226 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000013227
Benjamin Peterson14339b62009-01-31 16:36:08 +000013228 if (interned == NULL || !PyDict_Check(interned))
13229 return;
13230 keys = PyDict_Keys(interned);
13231 if (keys == NULL || !PyList_Check(keys)) {
13232 PyErr_Clear();
13233 return;
13234 }
Walter Dörwald16807132007-05-25 13:52:07 +000013235
Benjamin Peterson14339b62009-01-31 16:36:08 +000013236 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
13237 detector, interned unicode strings are not forcibly deallocated;
13238 rather, we give them their stolen references back, and then clear
13239 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000013240
Benjamin Peterson14339b62009-01-31 16:36:08 +000013241 n = PyList_GET_SIZE(keys);
13242 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000013243 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013244 for (i = 0; i < n; i++) {
13245 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013246 if (PyUnicode_READY(s) == -1) {
13247 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013248 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013249 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013250 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013251 case SSTATE_NOT_INTERNED:
13252 /* XXX Shouldn't happen */
13253 break;
13254 case SSTATE_INTERNED_IMMORTAL:
13255 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013256 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013257 break;
13258 case SSTATE_INTERNED_MORTAL:
13259 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013260 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013261 break;
13262 default:
13263 Py_FatalError("Inconsistent interned string state.");
13264 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013265 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013266 }
13267 fprintf(stderr, "total size of all interned strings: "
13268 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
13269 "mortal/immortal\n", mortal_size, immortal_size);
13270 Py_DECREF(keys);
13271 PyDict_Clear(interned);
13272 Py_DECREF(interned);
13273 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000013274}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013275
13276
13277/********************* Unicode Iterator **************************/
13278
/* State of a str iterator: the string being iterated and the index of
   the next character to return. */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index;        /* index of the next character to return */
    PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
13284
13285static void
13286unicodeiter_dealloc(unicodeiterobject *it)
13287{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013288 _PyObject_GC_UNTRACK(it);
13289 Py_XDECREF(it->it_seq);
13290 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013291}
13292
/* GC traversal for the str iterator: the only object reference it holds
   is the string being iterated.  Py_VISIT uses the `visit` and `arg`
   parameters by name. */
static int
unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
{
    Py_VISIT(it->it_seq);
    return 0;
}
13299
13300static PyObject *
13301unicodeiter_next(unicodeiterobject *it)
13302{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013303 PyUnicodeObject *seq;
13304 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013305
Benjamin Peterson14339b62009-01-31 16:36:08 +000013306 assert(it != NULL);
13307 seq = it->it_seq;
13308 if (seq == NULL)
13309 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013310 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013311
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013312 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
13313 int kind = PyUnicode_KIND(seq);
13314 void *data = PyUnicode_DATA(seq);
13315 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
13316 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013317 if (item != NULL)
13318 ++it->it_index;
13319 return item;
13320 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013321
Benjamin Peterson14339b62009-01-31 16:36:08 +000013322 Py_DECREF(seq);
13323 it->it_seq = NULL;
13324 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013325}
13326
13327static PyObject *
13328unicodeiter_len(unicodeiterobject *it)
13329{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013330 Py_ssize_t len = 0;
13331 if (it->it_seq)
13332 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
13333 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013334}
13335
PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");

/* Method table of the str iterator; it only exposes __length_hint__. */
static PyMethodDef unicodeiter_methods[] = {
    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
     length_hint_doc},
    {NULL,      NULL}       /* sentinel */
};
13343
/* Type object of the iterator returned by unicode_iter().  Instances take
   part in garbage collection (Py_TPFLAGS_HAVE_GC). */
PyTypeObject PyUnicodeIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str_iterator",         /* tp_name */
    sizeof(unicodeiterobject),      /* tp_basicsize */
    0,                  /* tp_itemsize */
    /* methods */
    (destructor)unicodeiter_dealloc,    /* tp_dealloc */
    0,                  /* tp_print */
    0,                  /* tp_getattr */
    0,                  /* tp_setattr */
    0,                  /* tp_reserved */
    0,                  /* tp_repr */
    0,                  /* tp_as_number */
    0,                  /* tp_as_sequence */
    0,                  /* tp_as_mapping */
    0,                  /* tp_hash */
    0,                  /* tp_call */
    0,                  /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                  /* tp_setattro */
    0,                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
    0,                  /* tp_doc */
    (traverseproc)unicodeiter_traverse, /* tp_traverse */
    0,                  /* tp_clear */
    0,                  /* tp_richcompare */
    0,                  /* tp_weaklistoffset */
    PyObject_SelfIter,          /* tp_iter */
    (iternextfunc)unicodeiter_next,     /* tp_iternext */
    unicodeiter_methods,            /* tp_methods */
    0,                  /* tp_members */
};
13376
13377static PyObject *
13378unicode_iter(PyObject *seq)
13379{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013380 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013381
Benjamin Peterson14339b62009-01-31 16:36:08 +000013382 if (!PyUnicode_Check(seq)) {
13383 PyErr_BadInternalCall();
13384 return NULL;
13385 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013386 if (PyUnicode_READY(seq) == -1)
13387 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013388 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
13389 if (it == NULL)
13390 return NULL;
13391 it->it_index = 0;
13392 Py_INCREF(seq);
13393 it->it_seq = (PyUnicodeObject *)seq;
13394 _PyObject_GC_TRACK(it);
13395 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013396}
13397
/* "uniops.h" is included twice as a template: UNIOP(x) supplies the name
   prefix and UNIOP_t the character type.  The first inclusion uses the
   Py_UNICODE_ prefix with Py_UNICODE characters, the second the Py_UCS4_
   prefix with Py_UCS4 characters.  Both macros are undefined again after
   each inclusion. */
#define UNIOP(x) Py_UNICODE_##x
#define UNIOP_t Py_UNICODE
#include "uniops.h"
#undef UNIOP
#undef UNIOP_t
#define UNIOP(x) Py_UCS4_##x
#define UNIOP_t Py_UCS4
#include "uniops.h"
#undef UNIOP
#undef UNIOP_t
Victor Stinner331ea922010-08-10 16:37:20 +000013408
Victor Stinner71133ff2010-09-01 23:43:53 +000013409Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000013410PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000013411{
13412 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
13413 Py_UNICODE *copy;
13414 Py_ssize_t size;
13415
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013416 if (!PyUnicode_Check(unicode)) {
13417 PyErr_BadArgument();
13418 return NULL;
13419 }
Victor Stinner71133ff2010-09-01 23:43:53 +000013420 /* Ensure we won't overflow the size. */
13421 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
13422 PyErr_NoMemory();
13423 return NULL;
13424 }
13425 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
13426 size *= sizeof(Py_UNICODE);
13427 copy = PyMem_Malloc(size);
13428 if (copy == NULL) {
13429 PyErr_NoMemory();
13430 return NULL;
13431 }
13432 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
13433 return copy;
13434}
Martin v. Löwis5b222132007-06-10 09:51:05 +000013435
Georg Brandl66c221e2010-10-14 07:04:07 +000013436/* A _string module, to export formatter_parser and formatter_field_name_split
13437 to the string.Formatter class implemented in Python. */
13438
/* Functions exported by the _string module; each takes exactly one
   argument (METH_O). */
static PyMethodDef _string_methods[] = {
    {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
     METH_O, PyDoc_STR("split the argument as a field name")},
    {"formatter_parser", (PyCFunction) formatter_parser,
     METH_O, PyDoc_STR("parse the argument as a format string")},
    {NULL, NULL}    /* sentinel */
};
13446
/* Module definition of _string.  Slot names in the comments follow the
   PyModuleDef layout of the C API documentation -- NOTE(review): confirm
   the name of the sixth slot against the targeted Python version. */
static struct PyModuleDef _string_module = {
    PyModuleDef_HEAD_INIT,              /* m_base */
    "_string",                          /* m_name */
    PyDoc_STR("string helper module"),  /* m_doc */
    0,                                  /* m_size */
    _string_methods,                    /* m_methods */
    NULL,                               /* m_reload */
    NULL,                               /* m_traverse */
    NULL,                               /* m_clear */
    NULL                                /* m_free */
};
13458
/* Initialization function of the _string module: creates the module from
   _string_module and returns the result of PyModule_Create(). */
PyMODINIT_FUNC
PyInit__string(void)
{
    return PyModule_Create(&_string_module);
}
13464
13465
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013466#ifdef __cplusplus
13467}
13468#endif