blob: 42d061ac425a983e85193551ca57900c7a978c3b [file] [log] [blame]
/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com>.

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000044
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000045#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000046#include <windows.h>
47#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000048
/* Limit for the Unicode object free list */

#define PyUnicode_MAXFREELIST       1024

/* Limit for the Unicode object free list keep-alive optimization.

   The implementation will keep allocated Unicode memory intact for
   all objects on the free list having a size less than this
   limit. This reduces malloc() overhead for small Unicode objects.

   At worst this will result in PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT       9
Guido van Rossumd57fd912000-03-10 22:53:23 +000071
/* Endianness switches; defaults to little endian.
   Exactly one of BYTEORDER_IS_BIG_ENDIAN / BYTEORDER_IS_LITTLE_ENDIAN is
   defined, selected by whether WORDS_BIGENDIAN is defined. */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
79
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000080/* --- Globals ------------------------------------------------------------
81
82 The globals are initialized by the _PyUnicode_Init() API and should
83 not be used before calling that API.
84
85*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000086
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000087
88#ifdef __cplusplus
89extern "C" {
90#endif
91
/* Type check used by the assertions in this file.  In Py_DEBUG builds it
   runs the full _PyUnicode_CheckConsistency() validation; otherwise it is
   a plain PyUnicode_Check(). */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020097
/* Raw access to the utf8 slot of a PyCompactUnicodeObject (no checks). */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
/* Checked access to the UTF-8 representation: for compact ASCII strings
   this is the memory located right after the PyASCIIObject header,
   otherwise the utf8 slot. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
/* Raw access to the utf8_length slot (no checks). */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
/* Checked access to the UTF-8 length: for compact ASCII strings this is
   the string length itself, otherwise the utf8_length slot. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))
/* Unchecked lvalue accessors for the object header fields. */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)
/* Checked read accessors for the kind and the length. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
/* Unchecked lvalue accessor for the data pointer of a PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200126
/* Local override of PyUnicode_READY: evaluates to 0 when the string is
   already ready, otherwise to the result of _PyUnicode_Ready(). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 : _PyUnicode_Ready((PyObject *)(op))))

/* Same as PyUnicode_READY, but takes a pointer to the object pointer and
   goes through _PyUnicode_ReadyReplace() when the string is not ready. */
#define _PyUnicode_READY_REPLACE(p_obj)                 \
    (assert(_PyUnicode_CHECK(*p_obj)),                  \
     (PyUnicode_IS_READY(*p_obj) ?                      \
      0 : _PyUnicode_ReadyReplace((PyObject **)(p_obj))))

/* true if the utf8 pointer is the same memory as the character data */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* true if the wstr pointer is the same memory as the character data.
   The macro must only refer to its own argument: it used to expand
   _PyUnicode_WSTR(unicode), which only compiled (and only meant the right
   thing) when the caller's variable happened to be named "unicode". */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))

/* true if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data) */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* true if the Unicode object has an allocated wstr memory block
   (not shared with other data) */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
161
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   Each source element in [begin, end) is cast to to_type and stored in
   consecutive slots starting at to; no terminator is written. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        const from_type *iter_; to_type *to_;           \
        for (iter_ = (begin), to_ = (to_type *)(to);    \
             iter_ < (end);                             \
             ++iter_, ++to_) {                          \
            *to_ = (to_type)*iter_;                     \
        }                                               \
    } while (0)

/* The Unicode string has been modified: reset the hash */
#define _PyUnicode_DIRTY(op) do { _PyUnicode_HASH(op) = -1; } while (0)
179
Walter Dörwald16807132007-05-25 13:52:07 +0000180/* This dictionary holds all interned unicode strings. Note that references
181 to strings in this dictionary are *not* counted in the string's ob_refcnt.
182 When the interned string reaches a refcnt of 0 the string deallocation
183 function will delete the reference from this dictionary.
184
185 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000186 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000187*/
188static PyObject *interned;
189
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000190/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200191static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000192
193/* Single character Unicode strings in the Latin-1 range are being
194 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200195static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000196
/* Fast detection of the most frequent whitespace characters.
   128-entry lookup table indexed by ASCII code: 1 for whitespace, else 0. */
const unsigned char _Py_ascii_whitespace[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x0009: * CHARACTER TABULATION */
/*     case 0x000A: * LINE FEED */
/*     case 0x000B: * LINE TABULATION */
/*     case 0x000C: * FORM FEED */
/*     case 0x000D: * CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x001C: * FILE SEPARATOR */
/*     case 0x001D: * GROUP SEPARATOR */
/*     case 0x001E: * RECORD SEPARATOR */
/*     case 0x001F: * UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
/*     case 0x0020: * SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
227
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200228/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200229static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200230static PyObject* get_latin1_char(unsigned char ch);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200231
Alexander Belopolsky40018472011-02-26 01:02:56 +0000232static PyObject *
233unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000234 PyObject **errorHandler,const char *encoding, const char *reason,
235 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
236 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
237
Alexander Belopolsky40018472011-02-26 01:02:56 +0000238static void
239raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300240 const char *encoding,
241 const Py_UNICODE *unicode, Py_ssize_t size,
242 Py_ssize_t startpos, Py_ssize_t endpos,
243 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000244
/* Same for linebreaks.
   128-entry lookup table indexed by ASCII code: 1 for line break
   characters, else 0.  The table is only ever read, hence const. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
272
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300273/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
274 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000275Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000276PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000277{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000278#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000279 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000280#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000281 /* This is actually an illegal character, so it should
282 not be passed to unichr. */
283 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000284#endif
285}
286
#ifdef Py_DEBUG
/* Debug-build sanity check of a Unicode object's header.

   Asserts that the state bits (kind, compact, ascii, ready) and the
   wstr / utf8 / data pointers are mutually consistent for each of the
   three layouts handled below: compact ASCII, compact non-ASCII, and
   non-compact (either a not-ready wstr string or a ready string with a
   separate data block).  Always returns 1 so that it can be used inside
   an assert(); a violated invariant aborts in the failing assert(). */
static int
_PyUnicode_CheckConsistency(void *op)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else if (ascii->state.compact == 1) {
        /* compact, non-ASCII: the characters follow the compact header */
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        assert(kind == PyUnicode_1BYTE_KIND
               || kind == PyUnicode_2BYTE_KIND
               || kind == PyUnicode_4BYTE_KIND);
        assert(ascii->state.ascii == 0);
        assert(ascii->state.ready == 1);
        data = compact + 1;
        assert (compact->utf8 != data);
        /* wstr must alias the data exactly when the kind has the same
           width as wchar_t */
        if (
#if SIZEOF_WCHAR_T == 2
            kind == PyUnicode_2BYTE_KIND
#else
            kind == PyUnicode_4BYTE_KIND
#endif
           ) {
            assert(ascii->wstr == data);
        }
        else {
            assert(ascii->wstr != data);
        }
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        PyUnicodeObject *unicode = (PyUnicodeObject *)op;

        if (kind == PyUnicode_WCHAR_KIND) {
            /* not ready yet: only the wstr representation exists */
            assert(ascii->state.compact == 0);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 0);
            assert(ascii->wstr != NULL);
            assert(unicode->data.any == NULL);
            assert(compact->utf8 == NULL);
            assert(ascii->state.interned == SSTATE_NOT_INTERNED);
        }
        else {
            /* ready, with a separately referenced data block */
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.compact == 0);
            assert(ascii->state.ready == 1);
            assert(unicode->data.any != NULL);
            /* utf8 must alias the data exactly for ASCII strings */
            if (ascii->state.ascii) {
                assert (compact->utf8 == unicode->data.any);
            }
            else {
                assert (compact->utf8 != unicode->data.any);
            }
            /* wstr must alias the data exactly when the kind has the
               same width as wchar_t */
            if (
#if SIZEOF_WCHAR_T == 2
                kind == PyUnicode_2BYTE_KIND
#else
                kind == PyUnicode_4BYTE_KIND
#endif
               ) {
                assert(ascii->wstr == unicode->data.any);
            }
            else {
                assert(ascii->wstr != unicode->data.any);
            }
        }
    }
    return 1;
}
#endif
362
Thomas Wouters477c8d52006-05-27 19:21:47 +0000363/* --- Bloom Filters ----------------------------------------------------- */
364
365/* stuff to implement simple "bloom filters" for Unicode characters.
366 to keep things simple, we use a single bitmask, using the least 5
367 bits from each unicode characters as the bit index. */
368
369/* the linebreak mask is set up by Unicode_Init below */
370
Antoine Pitrouf068f942010-01-13 14:19:12 +0000371#if LONG_BIT >= 128
372#define BLOOM_WIDTH 128
373#elif LONG_BIT >= 64
374#define BLOOM_WIDTH 64
375#elif LONG_BIT >= 32
376#define BLOOM_WIDTH 32
377#else
378#error "LONG_BIT is smaller than 32"
379#endif
380
Thomas Wouters477c8d52006-05-27 19:21:47 +0000381#define BLOOM_MASK unsigned long
382
383static BLOOM_MASK bloom_linebreak;
384
Antoine Pitrouf068f942010-01-13 14:19:12 +0000385#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
386#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000387
Benjamin Peterson29060642009-01-31 22:14:21 +0000388#define BLOOM_LINEBREAK(ch) \
389 ((ch) < 128U ? ascii_linebreak[(ch)] : \
390 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000391
Alexander Belopolsky40018472011-02-26 01:02:56 +0000392Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200393make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000394{
395 /* calculate simple bloom-style bitmask for a given unicode string */
396
Antoine Pitrouf068f942010-01-13 14:19:12 +0000397 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000398 Py_ssize_t i;
399
400 mask = 0;
401 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200402 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000403
404 return mask;
405}
406
/* true if chr occurs in str: the bloom mask (built from str) is checked
   first so that PyUnicode_FindChar() only runs for possible members.
   Note: chr and str are evaluated more than once. */
#define BLOOM_MEMBER(mask, chr, str)                                    \
    (BLOOM(mask, chr)                                                   \
     && (PyUnicode_FindChar((str), (chr), 0,                            \
                            PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000410
Guido van Rossumd57fd912000-03-10 22:53:23 +0000411/* --- Unicode Object ----------------------------------------------------- */
412
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200413static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200414fixup(PyUnicodeObject *self, Py_UCS4 (*fixfct)(PyUnicodeObject *s));
415
416Py_LOCAL_INLINE(char *) findchar(void *s, int kind,
417 Py_ssize_t size, Py_UCS4 ch,
418 int direction)
419{
420 /* like wcschr, but doesn't stop at NULL characters */
421 Py_ssize_t i;
422 if (direction == 1) {
423 for(i = 0; i < size; i++)
424 if (PyUnicode_READ(kind, s, i) == ch)
425 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
426 }
427 else {
428 for(i = size-1; i >= 0; i--)
429 if (PyUnicode_READ(kind, s, i) == ch)
430 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
431 }
432 return NULL;
433}
434
Victor Stinnerfe226c02011-10-03 03:52:20 +0200435static PyObject*
436resize_compact(PyObject *unicode, Py_ssize_t length)
437{
438 Py_ssize_t char_size;
439 Py_ssize_t struct_size;
440 Py_ssize_t new_size;
441 int share_wstr;
442
443 assert(PyUnicode_IS_READY(unicode));
444 char_size = PyUnicode_CHARACTER_SIZE(unicode);
445 if (PyUnicode_IS_COMPACT_ASCII(unicode))
446 struct_size = sizeof(PyASCIIObject);
447 else
448 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200449 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200450
451 _Py_DEC_REFTOTAL;
452 _Py_ForgetReference(unicode);
453
454 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
455 PyErr_NoMemory();
456 return NULL;
457 }
458 new_size = (struct_size + (length + 1) * char_size);
459
460 unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
461 if (unicode == NULL) {
462 PyObject_Del(unicode);
463 PyErr_NoMemory();
464 return NULL;
465 }
466 _Py_NewReference(unicode);
467 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200468 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200469 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200470 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
471 _PyUnicode_WSTR_LENGTH(unicode) = length;
472 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200473 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
474 length, 0);
475 return unicode;
476}
477
Alexander Belopolsky40018472011-02-26 01:02:56 +0000478static int
Victor Stinner95663112011-10-04 01:03:50 +0200479resize_inplace(PyUnicodeObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000480{
Victor Stinner95663112011-10-04 01:03:50 +0200481 wchar_t *wstr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200482 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200483 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000484
Victor Stinner95663112011-10-04 01:03:50 +0200485 _PyUnicode_DIRTY(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200486
487 if (PyUnicode_IS_READY(unicode)) {
488 Py_ssize_t char_size;
489 Py_ssize_t new_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200490 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200491 void *data;
492
493 data = _PyUnicode_DATA_ANY(unicode);
494 assert(data != NULL);
495 char_size = PyUnicode_CHARACTER_SIZE(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200496 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
497 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinner95663112011-10-04 01:03:50 +0200498 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
499 {
500 PyObject_DEL(_PyUnicode_UTF8(unicode));
501 _PyUnicode_UTF8(unicode) = NULL;
502 _PyUnicode_UTF8_LENGTH(unicode) = 0;
503 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200504
505 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
506 PyErr_NoMemory();
507 return -1;
508 }
509 new_size = (length + 1) * char_size;
510
511 data = (PyObject *)PyObject_REALLOC(data, new_size);
512 if (data == NULL) {
513 PyErr_NoMemory();
514 return -1;
515 }
516 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200517 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200518 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200519 _PyUnicode_WSTR_LENGTH(unicode) = length;
520 }
521 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200522 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200523 _PyUnicode_UTF8_LENGTH(unicode) = length;
524 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200525 _PyUnicode_LENGTH(unicode) = length;
526 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinner95663112011-10-04 01:03:50 +0200527 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
528 _PyUnicode_CHECK(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200529 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200530 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200531 }
Victor Stinner95663112011-10-04 01:03:50 +0200532 assert(_PyUnicode_WSTR(unicode) != NULL);
533
534 /* check for integer overflow */
535 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
536 PyErr_NoMemory();
537 return -1;
538 }
539 wstr = _PyUnicode_WSTR(unicode);
540 wstr = PyObject_REALLOC(wstr, sizeof(wchar_t) * (length + 1));
541 if (!wstr) {
542 PyErr_NoMemory();
543 return -1;
544 }
545 _PyUnicode_WSTR(unicode) = wstr;
546 _PyUnicode_WSTR(unicode)[length] = 0;
547 _PyUnicode_WSTR_LENGTH(unicode) = length;
548 _PyUnicode_CHECK(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000549 return 0;
550}
551
Victor Stinnerfe226c02011-10-03 03:52:20 +0200552static PyObject*
553resize_copy(PyObject *unicode, Py_ssize_t length)
554{
555 Py_ssize_t copy_length;
556 if (PyUnicode_IS_COMPACT(unicode)) {
557 PyObject *copy;
558 assert(PyUnicode_IS_READY(unicode));
559
560 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
561 if (copy == NULL)
562 return NULL;
563
564 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
565 if (PyUnicode_CopyCharacters(copy, 0,
566 unicode, 0,
567 copy_length) < 0)
568 {
569 Py_DECREF(copy);
570 return NULL;
571 }
572 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200573 }
574 else {
Victor Stinner2fd82272011-10-03 04:06:05 +0200575 PyUnicodeObject *w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200576 assert(_PyUnicode_WSTR(unicode) != NULL);
577 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinner2fd82272011-10-03 04:06:05 +0200578 w = _PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200579 if (w == NULL)
580 return NULL;
581 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
582 copy_length = Py_MIN(copy_length, length);
583 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
584 copy_length);
585 return (PyObject*)w;
586 }
587}
588
Guido van Rossumd57fd912000-03-10 22:53:23 +0000589/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000590 Ux0000 terminated; some code (e.g. new_identifier)
591 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000592
593 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000594 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000595
596*/
597
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200598#ifdef Py_DEBUG
599int unicode_old_new_calls = 0;
600#endif
601
Alexander Belopolsky40018472011-02-26 01:02:56 +0000602static PyUnicodeObject *
603_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000604{
605 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200606 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000607
Thomas Wouters477c8d52006-05-27 19:21:47 +0000608 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000609 if (length == 0 && unicode_empty != NULL) {
610 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200611 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000612 }
613
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000614 /* Ensure we won't overflow the size. */
615 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
616 return (PyUnicodeObject *)PyErr_NoMemory();
617 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200618 if (length < 0) {
619 PyErr_SetString(PyExc_SystemError,
620 "Negative size passed to _PyUnicode_New");
621 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000622 }
623
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200624#ifdef Py_DEBUG
625 ++unicode_old_new_calls;
626#endif
627
628 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
629 if (unicode == NULL)
630 return NULL;
631 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
632 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
633 if (!_PyUnicode_WSTR(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000634 PyErr_NoMemory();
635 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000636 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200637
Jeremy Hyltond8082792003-09-16 19:41:39 +0000638 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000639 * the caller fails before initializing str -- unicode_resize()
640 * reads str[0], and the Keep-Alive optimization can keep memory
641 * allocated for str alive across a call to unicode_dealloc(unicode).
642 * We don't want unicode_resize to read uninitialized memory in
643 * that case.
644 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200645 _PyUnicode_WSTR(unicode)[0] = 0;
646 _PyUnicode_WSTR(unicode)[length] = 0;
647 _PyUnicode_WSTR_LENGTH(unicode) = length;
648 _PyUnicode_HASH(unicode) = -1;
649 _PyUnicode_STATE(unicode).interned = 0;
650 _PyUnicode_STATE(unicode).kind = 0;
651 _PyUnicode_STATE(unicode).compact = 0;
652 _PyUnicode_STATE(unicode).ready = 0;
653 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200654 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200655 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200656 _PyUnicode_UTF8(unicode) = NULL;
657 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000658 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000659
Benjamin Peterson29060642009-01-31 22:14:21 +0000660 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000661 /* XXX UNREF/NEWREF interface should be more symmetrical */
662 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000663 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000664 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000665 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000666}
667
Victor Stinnerf42dc442011-10-02 23:33:16 +0200668static const char*
669unicode_kind_name(PyObject *unicode)
670{
Victor Stinner42dfd712011-10-03 14:41:45 +0200671 /* don't check consistency: unicode_kind_name() is called from
672 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200673 if (!PyUnicode_IS_COMPACT(unicode))
674 {
675 if (!PyUnicode_IS_READY(unicode))
676 return "wstr";
677 switch(PyUnicode_KIND(unicode))
678 {
679 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200680 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200681 return "legacy ascii";
682 else
683 return "legacy latin1";
684 case PyUnicode_2BYTE_KIND:
685 return "legacy UCS2";
686 case PyUnicode_4BYTE_KIND:
687 return "legacy UCS4";
688 default:
689 return "<legacy invalid kind>";
690 }
691 }
692 assert(PyUnicode_IS_READY(unicode));
693 switch(PyUnicode_KIND(unicode))
694 {
695 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200696 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200697 return "ascii";
698 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200699 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200700 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200701 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200702 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200703 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200704 default:
705 return "<invalid compact kind>";
706 }
707}
708
#ifdef Py_DEBUG
/* Debug-build counter: incremented by PyUnicode_New() on every call that
   gets past its empty-string shortcut. */
int unicode_new_new_calls = 0;

/* Functions wrapping macros for use in debugger */

/* Function form of the PyUnicode_UTF8() macro. */
char *_PyUnicode_utf8(void *unicode){
    return PyUnicode_UTF8(unicode);
}

/* Function form of the _PyUnicode_COMPACT_DATA() macro. */
void *_PyUnicode_compact_data(void *unicode) {
    return _PyUnicode_COMPACT_DATA(unicode);
}

/* Print the candidate data addresses of a string object to stdout, then
   return the result of the PyUnicode_DATA() macro. */
void *_PyUnicode_data(void *unicode){
    printf("obj %p\n", unicode);
    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
    printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
    printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
    /* %p requires a void * argument. */
    printf("compact data %p\n", (void *)_PyUnicode_COMPACT_DATA(unicode));
    return PyUnicode_DATA(unicode);
}

/* Print a one-line summary of a string object to stdout: kind name,
   length, the wstr and utf8 pointers (flagged "shared" when they alias the
   character data) and the data pointer itself.

   The struct fields are read directly rather than through the accessor
   macros, so this can be called on objects in an inconsistent state. */
void
_PyUnicode_Dump(PyObject *op)
{
    PyASCIIObject *ascii = (PyASCIIObject *)op;
    PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
    PyUnicodeObject *unicode = (PyUnicodeObject *)op;
    void *data;

    /* The length fields are signed (Py_ssize_t), hence %zd. */
    printf("%s: len=%zd, ", unicode_kind_name(op), ascii->length);

    if (ascii->state.compact) {
        /* Same layout rule as PyUnicode_New(): compact ASCII strings keep
           their characters right after the smaller PyASCIIObject header,
           other compact strings after PyCompactUnicodeObject. */
        if (ascii->state.ascii)
            data = (ascii + 1);
        else
            data = (compact + 1);
    }
    else
        data = unicode->data.any;

    if (ascii->wstr == data)
        printf("shared ");
    printf("wstr=%p", (void *)ascii->wstr);

    /* Compact ASCII objects are only a PyASCIIObject; the wstr_length,
       utf8 and utf8_length fields do not exist for them. */
    if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
        printf(" (%zd), ", compact->wstr_length);
        if (!ascii->state.compact && compact->utf8 == unicode->data.any)
            printf("shared ");
        printf("utf8=%p (%zd)", (void *)compact->utf8, compact->utf8_length);
    }
    printf(", data=%p\n", data);
}
#endif
754
755PyObject *
756PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
757{
758 PyObject *obj;
759 PyCompactUnicodeObject *unicode;
760 void *data;
761 int kind_state;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200762 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200763 Py_ssize_t char_size;
764 Py_ssize_t struct_size;
765
766 /* Optimization for empty strings */
767 if (size == 0 && unicode_empty != NULL) {
768 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200769 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200770 }
771
772#ifdef Py_DEBUG
773 ++unicode_new_new_calls;
774#endif
775
Victor Stinner9e9d6892011-10-04 01:02:02 +0200776 is_ascii = 0;
777 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200778 struct_size = sizeof(PyCompactUnicodeObject);
779 if (maxchar < 128) {
780 kind_state = PyUnicode_1BYTE_KIND;
781 char_size = 1;
782 is_ascii = 1;
783 struct_size = sizeof(PyASCIIObject);
784 }
785 else if (maxchar < 256) {
786 kind_state = PyUnicode_1BYTE_KIND;
787 char_size = 1;
788 }
789 else if (maxchar < 65536) {
790 kind_state = PyUnicode_2BYTE_KIND;
791 char_size = 2;
792 if (sizeof(wchar_t) == 2)
793 is_sharing = 1;
794 }
795 else {
796 kind_state = PyUnicode_4BYTE_KIND;
797 char_size = 4;
798 if (sizeof(wchar_t) == 4)
799 is_sharing = 1;
800 }
801
802 /* Ensure we won't overflow the size. */
803 if (size < 0) {
804 PyErr_SetString(PyExc_SystemError,
805 "Negative size passed to PyUnicode_New");
806 return NULL;
807 }
808 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
809 return PyErr_NoMemory();
810
811 /* Duplicated allocation code from _PyObject_New() instead of a call to
812 * PyObject_New() so we are able to allocate space for the object and
813 * it's data buffer.
814 */
815 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
816 if (obj == NULL)
817 return PyErr_NoMemory();
818 obj = PyObject_INIT(obj, &PyUnicode_Type);
819 if (obj == NULL)
820 return NULL;
821
822 unicode = (PyCompactUnicodeObject *)obj;
823 if (is_ascii)
824 data = ((PyASCIIObject*)obj) + 1;
825 else
826 data = unicode + 1;
827 _PyUnicode_LENGTH(unicode) = size;
828 _PyUnicode_HASH(unicode) = -1;
829 _PyUnicode_STATE(unicode).interned = 0;
830 _PyUnicode_STATE(unicode).kind = kind_state;
831 _PyUnicode_STATE(unicode).compact = 1;
832 _PyUnicode_STATE(unicode).ready = 1;
833 _PyUnicode_STATE(unicode).ascii = is_ascii;
834 if (is_ascii) {
835 ((char*)data)[size] = 0;
836 _PyUnicode_WSTR(unicode) = NULL;
837 }
838 else if (kind_state == PyUnicode_1BYTE_KIND) {
839 ((char*)data)[size] = 0;
840 _PyUnicode_WSTR(unicode) = NULL;
841 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200842 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200843 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200844 }
845 else {
846 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200847 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200848 if (kind_state == PyUnicode_2BYTE_KIND)
849 ((Py_UCS2*)data)[size] = 0;
850 else /* kind_state == PyUnicode_4BYTE_KIND */
851 ((Py_UCS4*)data)[size] = 0;
852 if (is_sharing) {
853 _PyUnicode_WSTR_LENGTH(unicode) = size;
854 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
855 }
856 else {
857 _PyUnicode_WSTR_LENGTH(unicode) = 0;
858 _PyUnicode_WSTR(unicode) = NULL;
859 }
860 }
861 return obj;
862}
863
#if SIZEOF_WCHAR_T == 2
/* Helper function to convert a 16-bit wchar_t representation to UCS4.  It
   decodes surrogate pairs; the other conversions are implemented as macros
   for efficiency.

   The wchar_t range [begin, end) is written into the 4-byte buffer of
   `unicode`.  A high surrogate immediately followed by a low surrogate is
   combined into one code point; any other unit is copied unchanged.  The
   destination must already have the 4-byte kind and exactly the decoded
   length (both are asserted). */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyUnicodeObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (src[0] >= 0xD800 && src[0] <= 0xDBFF
            && (src + 1) < end
            && src[1] >= 0xDC00 && src[1] <= 0xDFFF)
        {
            /* valid pair: 10 payload bits from each half, offset 0x10000 */
            *dst++ = (((src[0] & 0x3FF)<<10) | (src[1] & 0x3FF)) + 0x10000;
            src += 2;
        }
        else {
            *dst++ = *src++;
        }
    }
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
902
Victor Stinnercd9950f2011-10-02 00:34:53 +0200903static int
904_PyUnicode_Dirty(PyObject *unicode)
905{
Victor Stinner910337b2011-10-03 03:20:16 +0200906 assert(_PyUnicode_CHECK(unicode));
Victor Stinnercd9950f2011-10-02 00:34:53 +0200907 if (Py_REFCNT(unicode) != 1) {
Victor Stinner01698042011-10-04 00:04:26 +0200908 PyErr_SetString(PyExc_SystemError,
Victor Stinnercd9950f2011-10-02 00:34:53 +0200909 "Cannot modify a string having more than 1 reference");
910 return -1;
911 }
912 _PyUnicode_DIRTY(unicode);
913 return 0;
914}
915
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200916Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200917PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
918 PyObject *from, Py_ssize_t from_start,
919 Py_ssize_t how_many)
920{
Victor Stinnera0702ab2011-09-29 14:14:38 +0200921 unsigned int from_kind, to_kind;
922 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200923
Victor Stinnerb1536152011-09-30 02:26:10 +0200924 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
925 PyErr_BadInternalCall();
926 return -1;
927 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200928
929 if (PyUnicode_READY(from))
930 return -1;
931 if (PyUnicode_READY(to))
932 return -1;
933
Victor Stinnerff9e50f2011-09-28 22:17:19 +0200934 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200935 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
Victor Stinner01698042011-10-04 00:04:26 +0200936 PyErr_Format(PyExc_SystemError,
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200937 "Cannot write %zi characters at %zi "
938 "in a string of %zi characters",
939 how_many, to_start, PyUnicode_GET_LENGTH(to));
940 return -1;
941 }
Victor Stinnerf5ca1a22011-09-28 23:54:59 +0200942 if (how_many == 0)
943 return 0;
944
Victor Stinnercd9950f2011-10-02 00:34:53 +0200945 if (_PyUnicode_Dirty(to))
Victor Stinnerf5ca1a22011-09-28 23:54:59 +0200946 return -1;
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200947
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200948 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200949 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200950 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200951 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200952
Victor Stinnerf42dc442011-10-02 23:33:16 +0200953 if (from_kind == to_kind
954 /* deny latin1 => ascii */
955 && PyUnicode_MAX_CHAR_VALUE(to) >= PyUnicode_MAX_CHAR_VALUE(from))
956 {
Victor Stinnera0702ab2011-09-29 14:14:38 +0200957 Py_MEMCPY((char*)to_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200958 + PyUnicode_KIND_SIZE(to_kind, to_start),
Victor Stinnera0702ab2011-09-29 14:14:38 +0200959 (char*)from_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200960 + PyUnicode_KIND_SIZE(from_kind, from_start),
961 PyUnicode_KIND_SIZE(to_kind, how_many));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200962 }
Victor Stinnera0702ab2011-09-29 14:14:38 +0200963 else if (from_kind == PyUnicode_1BYTE_KIND
964 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200965 {
966 _PyUnicode_CONVERT_BYTES(
967 Py_UCS1, Py_UCS2,
968 PyUnicode_1BYTE_DATA(from) + from_start,
969 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
970 PyUnicode_2BYTE_DATA(to) + to_start
971 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200972 }
Victor Stinner157f83f2011-09-28 21:41:31 +0200973 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200974 && to_kind == PyUnicode_4BYTE_KIND)
975 {
976 _PyUnicode_CONVERT_BYTES(
977 Py_UCS1, Py_UCS4,
978 PyUnicode_1BYTE_DATA(from) + from_start,
979 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
980 PyUnicode_4BYTE_DATA(to) + to_start
981 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200982 }
983 else if (from_kind == PyUnicode_2BYTE_KIND
984 && to_kind == PyUnicode_4BYTE_KIND)
985 {
986 _PyUnicode_CONVERT_BYTES(
987 Py_UCS2, Py_UCS4,
988 PyUnicode_2BYTE_DATA(from) + from_start,
989 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
990 PyUnicode_4BYTE_DATA(to) + to_start
991 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200992 }
Victor Stinnera0702ab2011-09-29 14:14:38 +0200993 else {
994 int invalid_kinds;
Victor Stinnerf42dc442011-10-02 23:33:16 +0200995
996 /* check if max_char(from substring) <= max_char(to) */
997 if (from_kind > to_kind
998 /* latin1 => ascii */
Victor Stinnera3b334d2011-10-03 13:53:37 +0200999 || (PyUnicode_IS_ASCII(to)
Victor Stinnerf42dc442011-10-02 23:33:16 +02001000 && to_kind == PyUnicode_1BYTE_KIND
Victor Stinnera3b334d2011-10-03 13:53:37 +02001001 && !PyUnicode_IS_ASCII(from)))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001002 {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001003 /* slow path to check for character overflow */
1004 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1005 Py_UCS4 ch, maxchar;
1006 Py_ssize_t i;
1007
1008 maxchar = 0;
1009 invalid_kinds = 0;
1010 for (i=0; i < how_many; i++) {
1011 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1012 if (ch > maxchar) {
1013 maxchar = ch;
1014 if (maxchar > to_maxchar) {
1015 invalid_kinds = 1;
1016 break;
1017 }
1018 }
1019 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1020 }
1021 }
1022 else
1023 invalid_kinds = 1;
1024 if (invalid_kinds) {
Victor Stinner01698042011-10-04 00:04:26 +02001025 PyErr_Format(PyExc_SystemError,
Victor Stinnerf42dc442011-10-02 23:33:16 +02001026 "Cannot copy %s characters "
1027 "into a string of %s characters",
1028 unicode_kind_name(from),
1029 unicode_kind_name(to));
Victor Stinnera0702ab2011-09-29 14:14:38 +02001030 return -1;
1031 }
1032 }
1033 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001034}
1035
Victor Stinner17222162011-09-28 22:15:37 +02001036/* Find the maximum code point and count the number of surrogate pairs so a
1037 correct string length can be computed before converting a string to UCS4.
1038 This function counts single surrogates as a character and not as a pair.
1039
1040 Return 0 on success, or -1 on error. */
1041static int
1042find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1043 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001044{
1045 const wchar_t *iter;
1046
Victor Stinnerc53be962011-10-02 21:33:54 +02001047 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001048 if (num_surrogates == NULL || maxchar == NULL) {
1049 PyErr_SetString(PyExc_SystemError,
1050 "unexpected NULL arguments to "
1051 "PyUnicode_FindMaxCharAndNumSurrogatePairs");
1052 return -1;
1053 }
1054
1055 *num_surrogates = 0;
1056 *maxchar = 0;
1057
1058 for (iter = begin; iter < end; ) {
1059 if (*iter > *maxchar)
1060 *maxchar = *iter;
1061#if SIZEOF_WCHAR_T == 2
1062 if (*iter >= 0xD800 && *iter <= 0xDBFF
1063 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
1064 {
1065 Py_UCS4 surrogate_val;
1066 surrogate_val = (((iter[0] & 0x3FF)<<10)
1067 | (iter[1] & 0x3FF)) + 0x10000;
1068 ++(*num_surrogates);
1069 if (surrogate_val > *maxchar)
1070 *maxchar = surrogate_val;
1071 iter += 2;
1072 }
1073 else
1074 iter++;
1075#else
1076 iter++;
1077#endif
1078 }
1079 return 0;
1080}
1081
1082#ifdef Py_DEBUG
1083int unicode_ready_calls = 0;
1084#endif
1085
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001086static int
1087unicode_ready(PyObject **p_obj, int replace)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001088{
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001089 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001090 wchar_t *end;
1091 Py_UCS4 maxchar = 0;
1092 Py_ssize_t num_surrogates;
1093#if SIZEOF_WCHAR_T == 2
1094 Py_ssize_t length_wo_surrogates;
1095#endif
1096
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001097 assert(p_obj != NULL);
1098 unicode = (PyUnicodeObject *)*p_obj;
1099
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001100 /* _PyUnicode_Ready() is only intented for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001101 strings were created using _PyObject_New() and where no canonical
1102 representation (the str field) has been set yet aka strings
1103 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001104 assert(_PyUnicode_CHECK(unicode));
1105 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001106 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001107 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001108 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001109 /* Actually, it should neither be interned nor be anything else: */
1110 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001111
1112#ifdef Py_DEBUG
1113 ++unicode_ready_calls;
1114#endif
1115
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001116#ifdef Py_DEBUG
1117 assert(!replace || Py_REFCNT(unicode) == 1);
1118#else
1119 if (replace && Py_REFCNT(unicode) != 1)
1120 replace = 0;
1121#endif
1122 if (replace) {
1123 Py_ssize_t len = _PyUnicode_WSTR_LENGTH(unicode);
1124 wchar_t *wstr = _PyUnicode_WSTR(unicode);
1125 /* Optimization for empty strings */
1126 if (len == 0) {
1127 Py_INCREF(unicode_empty);
1128 Py_DECREF(*p_obj);
1129 *p_obj = unicode_empty;
1130 return 0;
1131 }
1132 if (len == 1 && wstr[0] < 256) {
1133 PyObject *latin1_char = get_latin1_char((unsigned char)wstr[0]);
1134 if (latin1_char == NULL)
1135 return -1;
1136 Py_DECREF(*p_obj);
1137 *p_obj = latin1_char;
1138 return 0;
1139 }
1140 }
1141
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001142 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001143 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001144 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001145 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001146
1147 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001148 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1149 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001150 PyErr_NoMemory();
1151 return -1;
1152 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001153 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001154 _PyUnicode_WSTR(unicode), end,
1155 PyUnicode_1BYTE_DATA(unicode));
1156 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1157 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1158 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1159 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001160 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001161 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001162 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001163 }
1164 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001165 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001166 _PyUnicode_UTF8(unicode) = NULL;
1167 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001168 }
1169 PyObject_FREE(_PyUnicode_WSTR(unicode));
1170 _PyUnicode_WSTR(unicode) = NULL;
1171 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1172 }
1173 /* In this case we might have to convert down from 4-byte native
1174 wchar_t to 2-byte unicode. */
1175 else if (maxchar < 65536) {
1176 assert(num_surrogates == 0 &&
1177 "FindMaxCharAndNumSurrogatePairs() messed up");
1178
Victor Stinner506f5922011-09-28 22:34:18 +02001179#if SIZEOF_WCHAR_T == 2
1180 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001181 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001182 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1183 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1184 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001185 _PyUnicode_UTF8(unicode) = NULL;
1186 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001187#else
1188 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001189 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001190 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001191 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001192 PyErr_NoMemory();
1193 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001194 }
Victor Stinner506f5922011-09-28 22:34:18 +02001195 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1196 _PyUnicode_WSTR(unicode), end,
1197 PyUnicode_2BYTE_DATA(unicode));
1198 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1199 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1200 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001201 _PyUnicode_UTF8(unicode) = NULL;
1202 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001203 PyObject_FREE(_PyUnicode_WSTR(unicode));
1204 _PyUnicode_WSTR(unicode) = NULL;
1205 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1206#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001207 }
1208 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1209 else {
1210#if SIZEOF_WCHAR_T == 2
1211 /* in case the native representation is 2-bytes, we need to allocate a
1212 new normalized 4-byte version. */
1213 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001214 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1215 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001216 PyErr_NoMemory();
1217 return -1;
1218 }
1219 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1220 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001221 _PyUnicode_UTF8(unicode) = NULL;
1222 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001223 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1224 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001225 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001226 PyObject_FREE(_PyUnicode_WSTR(unicode));
1227 _PyUnicode_WSTR(unicode) = NULL;
1228 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1229#else
1230 assert(num_surrogates == 0);
1231
Victor Stinnerc3c74152011-10-02 20:39:55 +02001232 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001233 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001234 _PyUnicode_UTF8(unicode) = NULL;
1235 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001236 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1237#endif
1238 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1239 }
1240 _PyUnicode_STATE(unicode).ready = 1;
1241 return 0;
1242}
1243
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001244int
1245_PyUnicode_ReadyReplace(PyObject **op)
1246{
1247 return unicode_ready(op, 1);
1248}
1249
1250int
1251_PyUnicode_Ready(PyObject *op)
1252{
1253 return unicode_ready(&op, 0);
1254}
1255
Alexander Belopolsky40018472011-02-26 01:02:56 +00001256static void
1257unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001258{
Walter Dörwald16807132007-05-25 13:52:07 +00001259 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001260 case SSTATE_NOT_INTERNED:
1261 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001262
Benjamin Peterson29060642009-01-31 22:14:21 +00001263 case SSTATE_INTERNED_MORTAL:
1264 /* revive dead object temporarily for DelItem */
1265 Py_REFCNT(unicode) = 3;
1266 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
1267 Py_FatalError(
1268 "deletion of interned string failed");
1269 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001270
Benjamin Peterson29060642009-01-31 22:14:21 +00001271 case SSTATE_INTERNED_IMMORTAL:
1272 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001273
Benjamin Peterson29060642009-01-31 22:14:21 +00001274 default:
1275 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001276 }
1277
Victor Stinner03490912011-10-03 23:45:12 +02001278 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001279 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001280 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001281 PyObject_DEL(_PyUnicode_UTF8(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001282
1283 if (PyUnicode_IS_COMPACT(unicode)) {
1284 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001285 }
1286 else {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001287 if (_PyUnicode_DATA_ANY(unicode))
1288 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Benjamin Peterson29060642009-01-31 22:14:21 +00001289 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001290 }
1291}
1292
Alexander Belopolsky40018472011-02-26 01:02:56 +00001293static int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001294unicode_resizable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001295{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001296 if (Py_REFCNT(unicode) != 1)
1297 return 0;
1298 if (PyUnicode_CHECK_INTERNED(unicode))
1299 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001300 assert (unicode != unicode_empty);
1301#ifdef Py_DEBUG
1302 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND
1303 && PyUnicode_GET_LENGTH(unicode) == 1)
1304 {
1305 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001306 if (ch < 256 && unicode_latin1[ch] == unicode)
1307 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001308 }
Victor Stinner77bb47b2011-10-03 20:06:05 +02001309#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001310 return 1;
1311}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001312
Victor Stinnerfe226c02011-10-03 03:52:20 +02001313static int
1314unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1315{
1316 PyObject *unicode;
1317 Py_ssize_t old_length;
1318
1319 assert(p_unicode != NULL);
1320 unicode = *p_unicode;
1321
1322 assert(unicode != NULL);
1323 assert(PyUnicode_Check(unicode));
1324 assert(0 <= length);
1325
Victor Stinner910337b2011-10-03 03:20:16 +02001326 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001327 old_length = PyUnicode_WSTR_LENGTH(unicode);
1328 else
1329 old_length = PyUnicode_GET_LENGTH(unicode);
1330 if (old_length == length)
1331 return 0;
1332
Victor Stinnerfe226c02011-10-03 03:52:20 +02001333 if (!unicode_resizable(unicode)) {
1334 PyObject *copy = resize_copy(unicode, length);
1335 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001336 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001337 Py_DECREF(*p_unicode);
1338 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001339 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001340 }
1341
Victor Stinnerfe226c02011-10-03 03:52:20 +02001342 if (PyUnicode_IS_COMPACT(unicode)) {
1343 *p_unicode = resize_compact(unicode, length);
1344 if (*p_unicode == NULL)
1345 return -1;
Victor Stinner95663112011-10-04 01:03:50 +02001346 _PyUnicode_CHECK(*p_unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001347 return 0;
1348 } else
1349 return resize_inplace((PyUnicodeObject*)unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001350}
1351
Alexander Belopolsky40018472011-02-26 01:02:56 +00001352int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001353PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001354{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001355 PyObject *unicode;
1356 if (p_unicode == NULL) {
1357 PyErr_BadInternalCall();
1358 return -1;
1359 }
1360 unicode = *p_unicode;
1361 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0
1362 || _PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND)
1363 {
1364 PyErr_BadInternalCall();
1365 return -1;
1366 }
1367 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001368}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001369
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001370static PyObject*
1371get_latin1_char(unsigned char ch)
1372{
Victor Stinnera464fc12011-10-02 20:39:30 +02001373 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001374 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001375 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001376 if (!unicode)
1377 return NULL;
1378 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
1379 unicode_latin1[ch] = unicode;
1380 }
1381 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001382 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001383}
1384
Alexander Belopolsky40018472011-02-26 01:02:56 +00001385PyObject *
1386PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001387{
1388 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001389 Py_UCS4 maxchar = 0;
1390 Py_ssize_t num_surrogates;
1391
1392 if (u == NULL)
1393 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001394
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001395 /* If the Unicode data is known at construction time, we can apply
1396 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001397
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001398 /* Optimization for empty strings */
1399 if (size == 0 && unicode_empty != NULL) {
1400 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001401 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001402 }
Tim Petersced69f82003-09-16 20:30:58 +00001403
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001404 /* Single character Unicode objects in the Latin-1 range are
1405 shared when using this constructor */
1406 if (size == 1 && *u < 256)
1407 return get_latin1_char((unsigned char)*u);
1408
1409 /* If not empty and not single character, copy the Unicode data
1410 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001411 if (find_maxchar_surrogates(u, u + size,
1412 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001413 return NULL;
1414
1415 unicode = (PyUnicodeObject *) PyUnicode_New(size - num_surrogates,
1416 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001417 if (!unicode)
1418 return NULL;
1419
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001420 switch (PyUnicode_KIND(unicode)) {
1421 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001422 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001423 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1424 break;
1425 case PyUnicode_2BYTE_KIND:
1426#if Py_UNICODE_SIZE == 2
1427 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1428#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001429 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001430 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1431#endif
1432 break;
1433 case PyUnicode_4BYTE_KIND:
1434#if SIZEOF_WCHAR_T == 2
1435 /* This is the only case which has to process surrogates, thus
1436 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001437 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001438#else
1439 assert(num_surrogates == 0);
1440 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1441#endif
1442 break;
1443 default:
1444 assert(0 && "Impossible state");
1445 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001446
1447 return (PyObject *)unicode;
1448}
1449
Alexander Belopolsky40018472011-02-26 01:02:56 +00001450PyObject *
1451PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001452{
1453 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +00001454
Benjamin Peterson14339b62009-01-31 16:36:08 +00001455 if (size < 0) {
1456 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001457 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001458 return NULL;
1459 }
Christian Heimes33fe8092008-04-13 13:53:33 +00001460
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001461 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +00001462 some optimizations which share commonly used objects.
1463 Also, this means the input must be UTF-8, so fall back to the
1464 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001465 if (u != NULL) {
1466
Benjamin Peterson29060642009-01-31 22:14:21 +00001467 /* Optimization for empty strings */
1468 if (size == 0 && unicode_empty != NULL) {
1469 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001470 return unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001471 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001472
1473 /* Single characters are shared when using this constructor.
1474 Restrict to ASCII, since the input must be UTF-8. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001475 if (size == 1 && Py_CHARMASK(*u) < 128)
1476 return get_latin1_char(Py_CHARMASK(*u));
Martin v. Löwis9c121062007-08-05 20:26:11 +00001477
1478 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001479 }
1480
Walter Dörwald55507312007-05-18 13:12:10 +00001481 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001482 if (!unicode)
1483 return NULL;
1484
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001485 return (PyObject *)unicode;
1486}
1487
Alexander Belopolsky40018472011-02-26 01:02:56 +00001488PyObject *
1489PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001490{
1491 size_t size = strlen(u);
1492 if (size > PY_SSIZE_T_MAX) {
1493 PyErr_SetString(PyExc_OverflowError, "input too long");
1494 return NULL;
1495 }
1496
1497 return PyUnicode_FromStringAndSize(u, size);
1498}
1499
Victor Stinnere57b1c02011-09-28 22:20:48 +02001500static PyObject*
1501_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001502{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001503 PyObject *res;
1504 unsigned char max = 127;
1505 Py_ssize_t i;
1506 for (i = 0; i < size; i++) {
1507 if (u[i] & 0x80) {
1508 max = 255;
1509 break;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001510 }
1511 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001512 res = PyUnicode_New(size, max);
1513 if (!res)
1514 return NULL;
1515 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
1516 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001517}
1518
Victor Stinnere57b1c02011-09-28 22:20:48 +02001519static PyObject*
1520_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001521{
1522 PyObject *res;
1523 Py_UCS2 max = 0;
1524 Py_ssize_t i;
1525 for (i = 0; i < size; i++)
1526 if (u[i] > max)
1527 max = u[i];
1528 res = PyUnicode_New(size, max);
1529 if (!res)
1530 return NULL;
1531 if (max >= 256)
1532 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
1533 else
1534 for (i = 0; i < size; i++)
1535 PyUnicode_1BYTE_DATA(res)[i] = (Py_UCS1)u[i];
1536 return res;
1537}
1538
Victor Stinnere57b1c02011-09-28 22:20:48 +02001539static PyObject*
1540_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001541{
1542 PyObject *res;
1543 Py_UCS4 max = 0;
1544 Py_ssize_t i;
1545 for (i = 0; i < size; i++)
1546 if (u[i] > max)
1547 max = u[i];
1548 res = PyUnicode_New(size, max);
1549 if (!res)
1550 return NULL;
1551 if (max >= 0x10000)
1552 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
1553 else {
1554 int kind = PyUnicode_KIND(res);
1555 void *data = PyUnicode_DATA(res);
1556 for (i = 0; i < size; i++)
1557 PyUnicode_WRITE(kind, data, i, u[i]);
1558 }
1559 return res;
1560}
1561
1562PyObject*
1563PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1564{
1565 switch(kind) {
1566 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001567 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001568 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001569 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001570 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001571 return _PyUnicode_FromUCS4(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001572 }
Victor Stinner01698042011-10-04 00:04:26 +02001573 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001574 return NULL;
1575}
1576
Victor Stinner034f6cf2011-09-30 02:26:44 +02001577PyObject*
1578PyUnicode_Copy(PyObject *unicode)
1579{
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001580 Py_ssize_t size;
1581 PyObject *copy;
1582 void *data;
1583
Victor Stinner034f6cf2011-09-30 02:26:44 +02001584 if (!PyUnicode_Check(unicode)) {
1585 PyErr_BadInternalCall();
1586 return NULL;
1587 }
1588 if (PyUnicode_READY(unicode))
1589 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001590
1591 size = PyUnicode_GET_LENGTH(unicode);
1592 copy = PyUnicode_New(size, PyUnicode_MAX_CHAR_VALUE(unicode));
1593 if (!copy)
1594 return NULL;
1595 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1596
1597 data = PyUnicode_DATA(unicode);
1598 switch (PyUnicode_KIND(unicode))
1599 {
1600 case PyUnicode_1BYTE_KIND:
1601 memcpy(PyUnicode_1BYTE_DATA(copy), data, size);
1602 break;
1603 case PyUnicode_2BYTE_KIND:
1604 memcpy(PyUnicode_2BYTE_DATA(copy), data, sizeof(Py_UCS2) * size);
1605 break;
1606 case PyUnicode_4BYTE_KIND:
1607 memcpy(PyUnicode_4BYTE_DATA(copy), data, sizeof(Py_UCS4) * size);
1608 break;
1609 default:
1610 assert(0);
1611 break;
1612 }
1613 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02001614}
1615
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001616
Victor Stinnerbc603d12011-10-02 01:00:40 +02001617/* Widen Unicode objects to larger buffers. Don't write terminating null
1618 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001619
1620void*
1621_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1622{
Victor Stinnerbc603d12011-10-02 01:00:40 +02001623 Py_ssize_t len;
1624 void *result;
1625 unsigned int skind;
1626
1627 if (PyUnicode_READY(s))
1628 return NULL;
1629
1630 len = PyUnicode_GET_LENGTH(s);
1631 skind = PyUnicode_KIND(s);
1632 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02001633 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001634 return NULL;
1635 }
1636 switch(kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02001637 case PyUnicode_2BYTE_KIND:
1638 result = PyMem_Malloc(len * sizeof(Py_UCS2));
1639 if (!result)
1640 return PyErr_NoMemory();
1641 assert(skind == PyUnicode_1BYTE_KIND);
1642 _PyUnicode_CONVERT_BYTES(
1643 Py_UCS1, Py_UCS2,
1644 PyUnicode_1BYTE_DATA(s),
1645 PyUnicode_1BYTE_DATA(s) + len,
1646 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001647 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001648 case PyUnicode_4BYTE_KIND:
1649 result = PyMem_Malloc(len * sizeof(Py_UCS4));
1650 if (!result)
1651 return PyErr_NoMemory();
1652 if (skind == PyUnicode_2BYTE_KIND) {
1653 _PyUnicode_CONVERT_BYTES(
1654 Py_UCS2, Py_UCS4,
1655 PyUnicode_2BYTE_DATA(s),
1656 PyUnicode_2BYTE_DATA(s) + len,
1657 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001658 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02001659 else {
1660 assert(skind == PyUnicode_1BYTE_KIND);
1661 _PyUnicode_CONVERT_BYTES(
1662 Py_UCS1, Py_UCS4,
1663 PyUnicode_1BYTE_DATA(s),
1664 PyUnicode_1BYTE_DATA(s) + len,
1665 result);
1666 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001667 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001668 default:
1669 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001670 }
Victor Stinner01698042011-10-04 00:04:26 +02001671 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001672 return NULL;
1673}
1674
1675static Py_UCS4*
1676as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1677 int copy_null)
1678{
1679 int kind;
1680 void *data;
1681 Py_ssize_t len, targetlen;
1682 if (PyUnicode_READY(string) == -1)
1683 return NULL;
1684 kind = PyUnicode_KIND(string);
1685 data = PyUnicode_DATA(string);
1686 len = PyUnicode_GET_LENGTH(string);
1687 targetlen = len;
1688 if (copy_null)
1689 targetlen++;
1690 if (!target) {
1691 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
1692 PyErr_NoMemory();
1693 return NULL;
1694 }
1695 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
1696 if (!target) {
1697 PyErr_NoMemory();
1698 return NULL;
1699 }
1700 }
1701 else {
1702 if (targetsize < targetlen) {
1703 PyErr_Format(PyExc_SystemError,
1704 "string is longer than the buffer");
1705 if (copy_null && 0 < targetsize)
1706 target[0] = 0;
1707 return NULL;
1708 }
1709 }
1710 if (kind != PyUnicode_4BYTE_KIND) {
1711 Py_ssize_t i;
1712 for (i = 0; i < len; i++)
1713 target[i] = PyUnicode_READ(kind, data, i);
1714 }
1715 else
1716 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
1717 if (copy_null)
1718 target[len] = 0;
1719 return target;
1720}
1721
1722Py_UCS4*
1723PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1724 int copy_null)
1725{
1726 if (target == NULL || targetsize < 1) {
1727 PyErr_BadInternalCall();
1728 return NULL;
1729 }
1730 return as_ucs4(string, target, targetsize, copy_null);
1731}
1732
1733Py_UCS4*
1734PyUnicode_AsUCS4Copy(PyObject *string)
1735{
1736 return as_ucs4(string, NULL, 0, 1);
1737}
1738
1739#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00001740
Alexander Belopolsky40018472011-02-26 01:02:56 +00001741PyObject *
1742PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001743{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001744 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +00001745 if (size == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001746 return PyUnicode_New(0, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00001747 PyErr_BadInternalCall();
1748 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001749 }
1750
Martin v. Löwis790465f2008-04-05 20:41:37 +00001751 if (size == -1) {
1752 size = wcslen(w);
1753 }
1754
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001755 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001756}
1757
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001758#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00001759
Walter Dörwald346737f2007-05-31 10:44:43 +00001760static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001761makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
1762 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00001763{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001764 *fmt++ = '%';
1765 if (width) {
1766 if (zeropad)
1767 *fmt++ = '0';
1768 fmt += sprintf(fmt, "%d", width);
1769 }
1770 if (precision)
1771 fmt += sprintf(fmt, ".%d", precision);
1772 if (longflag)
1773 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001774 else if (longlongflag) {
1775 /* longlongflag should only ever be nonzero on machines with
1776 HAVE_LONG_LONG defined */
1777#ifdef HAVE_LONG_LONG
1778 char *f = PY_FORMAT_LONG_LONG;
1779 while (*f)
1780 *fmt++ = *f++;
1781#else
1782 /* we shouldn't ever get here */
1783 assert(0);
1784 *fmt++ = 'l';
1785#endif
1786 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001787 else if (size_tflag) {
1788 char *f = PY_FORMAT_SIZE_T;
1789 while (*f)
1790 *fmt++ = *f++;
1791 }
1792 *fmt++ = c;
1793 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00001794}
1795
/* Helper for PyUnicode_FromFormatV(): parse the part of a format
   specification that follows '%'.

   `f` must point at the '%'.  The optional "width" and ".precision" digits
   and an optional length modifier ("l", "ll" when HAVE_LONG_LONG is
   defined, or "z") are consumed; a modifier is only recognized when it is
   directly followed by 'd', 'u' or 'i'.  Each non-NULL output pointer
   receives the corresponding value (0 when absent).

   Returns a pointer to the conversion character, with two adjustments for
   malformed input: after a precision that is directly followed by '%', and
   when the format ends early, the pointer is moved back by one. */

static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int width = 0;
    int precision = 0;
    int longflag = 0;
    int longlongflag = 0;
    int size_tflag = 0;

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    f++;
    for (; Py_ISDIGIT((unsigned)*f); f++)
        width = (width*10) + *f - '0';
    if (*f == '.') {
        f++;
        for (; Py_ISDIGIT((unsigned)*f); f++)
            precision = (precision*10) + *f - '0';
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        f--;
    }

    /* Length modifiers: %ld, %lu, %li, %lld, %llu, %lli and the size_t
       variants %zd, %zu, %zi. */
    switch (*f) {
    case 'l':
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            f += 1;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
#endif
        break;
    case 'z':
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            size_tflag = 1;
            f += 1;
        }
        break;
    default:
        break;
    }

    /* Report the results through whichever outputs were requested. */
    if (p_width != NULL)
        *p_width = width;
    if (p_precision != NULL)
        *p_precision = precision;
    if (p_longflag != NULL)
        *p_longflag = longflag;
    if (p_longlongflag != NULL)
        *p_longlongflag = longlongflag;
    if (p_size_tflag != NULL)
        *p_size_tflag = size_tflag;
    return f;
}
1860
/* Maximum number of characters required for output of %ld.  21 characters
   allows for 64-bit integers (in decimal) and an optional sign. */
#define MAX_LONG_CHARS 21
/* Maximum number of characters required for output of %lld.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
1868
Walter Dörwaldd2034312007-05-18 16:29:38 +00001869PyObject *
1870PyUnicode_FromFormatV(const char *format, va_list vargs)
1871{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001872 va_list count;
1873 Py_ssize_t callcount = 0;
1874 PyObject **callresults = NULL;
1875 PyObject **callresult = NULL;
1876 Py_ssize_t n = 0;
1877 int width = 0;
1878 int precision = 0;
1879 int zeropad;
1880 const char* f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001881 PyUnicodeObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001882 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001883 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001884 Py_UCS4 maxchar = 127; /* result is ASCII by default */
1885 Py_UCS4 argmaxchar;
1886 Py_ssize_t numbersize = 0;
1887 char *numberresults = NULL;
1888 char *numberresult = NULL;
1889 Py_ssize_t i;
1890 int kind;
1891 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001892
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001893 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001894 /* step 1: count the number of %S/%R/%A/%s format specifications
1895 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
1896 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001897 * result in an array)
1898 * also esimate a upper bound for all the number formats in the string,
1899 * numbers will be formated in step 3 and be keept in a '\0'-separated
1900 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00001901 for (f = format; *f; f++) {
1902 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00001903 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001904 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
1905 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
1906 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
1907 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001908
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001909 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001910#ifdef HAVE_LONG_LONG
1911 if (longlongflag) {
1912 if (width < MAX_LONG_LONG_CHARS)
1913 width = MAX_LONG_LONG_CHARS;
1914 }
1915 else
1916#endif
1917 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
1918 including sign. Decimal takes the most space. This
1919 isn't enough for octal. If a width is specified we
1920 need more (which we allocate later). */
1921 if (width < MAX_LONG_CHARS)
1922 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001923
1924 /* account for the size + '\0' to separate numbers
1925 inside of the numberresults buffer */
1926 numbersize += (width + 1);
1927 }
1928 }
1929 else if ((unsigned char)*f > 127) {
1930 PyErr_Format(PyExc_ValueError,
1931 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
1932 "string, got a non-ASCII byte: 0x%02x",
1933 (unsigned char)*f);
1934 return NULL;
1935 }
1936 }
1937 /* step 2: allocate memory for the results of
1938 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
1939 if (callcount) {
1940 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
1941 if (!callresults) {
1942 PyErr_NoMemory();
1943 return NULL;
1944 }
1945 callresult = callresults;
1946 }
1947 /* step 2.5: allocate memory for the results of formating numbers */
1948 if (numbersize) {
1949 numberresults = PyObject_Malloc(numbersize);
1950 if (!numberresults) {
1951 PyErr_NoMemory();
1952 goto fail;
1953 }
1954 numberresult = numberresults;
1955 }
1956
1957 /* step 3: format numbers and figure out how large a buffer we need */
1958 for (f = format; *f; f++) {
1959 if (*f == '%') {
1960 const char* p;
1961 int longflag;
1962 int longlongflag;
1963 int size_tflag;
1964 int numprinted;
1965
1966 p = f;
1967 zeropad = (f[1] == '0');
1968 f = parse_format_flags(f, &width, &precision,
1969 &longflag, &longlongflag, &size_tflag);
1970 switch (*f) {
1971 case 'c':
1972 {
1973 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001974 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001975 n++;
1976 break;
1977 }
1978 case '%':
1979 n++;
1980 break;
1981 case 'i':
1982 case 'd':
1983 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1984 width, precision, *f);
1985 if (longflag)
1986 numprinted = sprintf(numberresult, fmt,
1987 va_arg(count, long));
1988#ifdef HAVE_LONG_LONG
1989 else if (longlongflag)
1990 numprinted = sprintf(numberresult, fmt,
1991 va_arg(count, PY_LONG_LONG));
1992#endif
1993 else if (size_tflag)
1994 numprinted = sprintf(numberresult, fmt,
1995 va_arg(count, Py_ssize_t));
1996 else
1997 numprinted = sprintf(numberresult, fmt,
1998 va_arg(count, int));
1999 n += numprinted;
2000 /* advance by +1 to skip over the '\0' */
2001 numberresult += (numprinted + 1);
2002 assert(*(numberresult - 1) == '\0');
2003 assert(*(numberresult - 2) != '\0');
2004 assert(numprinted >= 0);
2005 assert(numberresult <= numberresults + numbersize);
2006 break;
2007 case 'u':
2008 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2009 width, precision, 'u');
2010 if (longflag)
2011 numprinted = sprintf(numberresult, fmt,
2012 va_arg(count, unsigned long));
2013#ifdef HAVE_LONG_LONG
2014 else if (longlongflag)
2015 numprinted = sprintf(numberresult, fmt,
2016 va_arg(count, unsigned PY_LONG_LONG));
2017#endif
2018 else if (size_tflag)
2019 numprinted = sprintf(numberresult, fmt,
2020 va_arg(count, size_t));
2021 else
2022 numprinted = sprintf(numberresult, fmt,
2023 va_arg(count, unsigned int));
2024 n += numprinted;
2025 numberresult += (numprinted + 1);
2026 assert(*(numberresult - 1) == '\0');
2027 assert(*(numberresult - 2) != '\0');
2028 assert(numprinted >= 0);
2029 assert(numberresult <= numberresults + numbersize);
2030 break;
2031 case 'x':
2032 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2033 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2034 n += numprinted;
2035 numberresult += (numprinted + 1);
2036 assert(*(numberresult - 1) == '\0');
2037 assert(*(numberresult - 2) != '\0');
2038 assert(numprinted >= 0);
2039 assert(numberresult <= numberresults + numbersize);
2040 break;
2041 case 'p':
2042 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2043 /* %p is ill-defined: ensure leading 0x. */
2044 if (numberresult[1] == 'X')
2045 numberresult[1] = 'x';
2046 else if (numberresult[1] != 'x') {
2047 memmove(numberresult + 2, numberresult,
2048 strlen(numberresult) + 1);
2049 numberresult[0] = '0';
2050 numberresult[1] = 'x';
2051 numprinted += 2;
2052 }
2053 n += numprinted;
2054 numberresult += (numprinted + 1);
2055 assert(*(numberresult - 1) == '\0');
2056 assert(*(numberresult - 2) != '\0');
2057 assert(numprinted >= 0);
2058 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002059 break;
2060 case 's':
2061 {
2062 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002063 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002064 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
2065 if (!str)
2066 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002067 /* since PyUnicode_DecodeUTF8 returns already flexible
2068 unicode objects, there is no need to call ready on them */
2069 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002070 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002071 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002072 /* Remember the str and switch to the next slot */
2073 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002074 break;
2075 }
2076 case 'U':
2077 {
2078 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002079 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002080 if (PyUnicode_READY(obj) == -1)
2081 goto fail;
2082 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002083 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002084 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002085 break;
2086 }
2087 case 'V':
2088 {
2089 PyObject *obj = va_arg(count, PyObject *);
2090 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002091 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002092 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002093 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002094 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002095 if (PyUnicode_READY(obj) == -1)
2096 goto fail;
2097 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002098 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002099 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002100 *callresult++ = NULL;
2101 }
2102 else {
2103 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
2104 if (!str_obj)
2105 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002106 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002107 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002108 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002109 *callresult++ = str_obj;
2110 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002111 break;
2112 }
2113 case 'S':
2114 {
2115 PyObject *obj = va_arg(count, PyObject *);
2116 PyObject *str;
2117 assert(obj);
2118 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002119 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002120 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002121 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002122 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002123 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002124 /* Remember the str and switch to the next slot */
2125 *callresult++ = str;
2126 break;
2127 }
2128 case 'R':
2129 {
2130 PyObject *obj = va_arg(count, PyObject *);
2131 PyObject *repr;
2132 assert(obj);
2133 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002134 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002135 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002136 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002137 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002138 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002139 /* Remember the repr and switch to the next slot */
2140 *callresult++ = repr;
2141 break;
2142 }
2143 case 'A':
2144 {
2145 PyObject *obj = va_arg(count, PyObject *);
2146 PyObject *ascii;
2147 assert(obj);
2148 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002149 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002150 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002151 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002152 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002153 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002154 /* Remember the repr and switch to the next slot */
2155 *callresult++ = ascii;
2156 break;
2157 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002158 default:
2159 /* if we stumble upon an unknown
2160 formatting code, copy the rest of
2161 the format string to the output
2162 string. (we cannot just skip the
2163 code, since there's no way to know
2164 what's in the argument list) */
2165 n += strlen(p);
2166 goto expand;
2167 }
2168 } else
2169 n++;
2170 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002171 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002172 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002173 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002174 we don't have to resize the string.
2175 There can be no errors beyond this point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002176 string = (PyUnicodeObject *)PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002177 if (!string)
2178 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002179 kind = PyUnicode_KIND(string);
2180 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002181 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002182 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002183
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002184 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002185 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002186 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002187
2188 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002189 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2190 /* checking for == because the last argument could be a empty
2191 string, which causes i to point to end, the assert at the end of
2192 the loop */
2193 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002194
Benjamin Peterson14339b62009-01-31 16:36:08 +00002195 switch (*f) {
2196 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002197 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002198 const int ordinal = va_arg(vargs, int);
2199 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002200 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002201 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002202 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002203 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002204 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002205 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002206 case 'p':
2207 /* unused, since we already have the result */
2208 if (*f == 'p')
2209 (void) va_arg(vargs, void *);
2210 else
2211 (void) va_arg(vargs, int);
2212 /* extract the result from numberresults and append. */
2213 for (; *numberresult; ++i, ++numberresult)
2214 PyUnicode_WRITE(kind, data, i, *numberresult);
2215 /* skip over the separating '\0' */
2216 assert(*numberresult == '\0');
2217 numberresult++;
2218 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002219 break;
2220 case 's':
2221 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002222 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002223 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002224 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002225 size = PyUnicode_GET_LENGTH(*callresult);
2226 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002227 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2228 *callresult, 0,
2229 size) < 0)
2230 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002231 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002232 /* We're done with the unicode()/repr() => forget it */
2233 Py_DECREF(*callresult);
2234 /* switch to next unicode()/repr() result */
2235 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002236 break;
2237 }
2238 case 'U':
2239 {
2240 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002241 Py_ssize_t size;
2242 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2243 size = PyUnicode_GET_LENGTH(obj);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002244 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2245 obj, 0,
2246 size) < 0)
2247 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002248 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002249 break;
2250 }
2251 case 'V':
2252 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002253 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002254 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002255 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002256 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002257 size = PyUnicode_GET_LENGTH(obj);
2258 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002259 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2260 obj, 0,
2261 size) < 0)
2262 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002263 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002264 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002265 size = PyUnicode_GET_LENGTH(*callresult);
2266 assert(PyUnicode_KIND(*callresult) <=
2267 PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002268 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2269 *callresult,
2270 0, size) < 0)
2271 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002272 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002273 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002274 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002275 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002276 break;
2277 }
2278 case 'S':
2279 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002280 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002281 {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002282 /* unused, since we already have the result */
2283 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002284 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002285 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2286 *callresult, 0,
2287 PyUnicode_GET_LENGTH(*callresult)) < 0)
2288 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002289 i += PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002290 /* We're done with the unicode()/repr() => forget it */
2291 Py_DECREF(*callresult);
2292 /* switch to next unicode()/repr() result */
2293 ++callresult;
2294 break;
2295 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002296 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002297 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002298 break;
2299 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002300 for (; *p; ++p, ++i)
2301 PyUnicode_WRITE(kind, data, i, *p);
2302 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002303 goto end;
2304 }
Victor Stinner1205f272010-09-11 00:54:47 +00002305 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002306 else {
2307 assert(i < PyUnicode_GET_LENGTH(string));
2308 PyUnicode_WRITE(kind, data, i++, *f);
2309 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002310 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002311 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002312
Benjamin Peterson29060642009-01-31 22:14:21 +00002313 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002314 if (callresults)
2315 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002316 if (numberresults)
2317 PyObject_Free(numberresults);
2318 return (PyObject *)string;
Benjamin Peterson29060642009-01-31 22:14:21 +00002319 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002320 if (callresults) {
2321 PyObject **callresult2 = callresults;
2322 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002323 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002324 ++callresult2;
2325 }
2326 PyObject_Free(callresults);
2327 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002328 if (numberresults)
2329 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002330 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002331}
2332
Walter Dörwaldd2034312007-05-18 16:29:38 +00002333PyObject *
2334PyUnicode_FromFormat(const char *format, ...)
2335{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002336 PyObject* ret;
2337 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002338
2339#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002340 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002341#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002342 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002343#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002344 ret = PyUnicode_FromFormatV(format, vargs);
2345 va_end(vargs);
2346 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002347}
2348
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002349#ifdef HAVE_WCHAR_H
2350
Victor Stinner5593d8a2010-10-02 11:11:27 +00002351/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2352 convert a Unicode object to a wide character string.
2353
Victor Stinnerd88d9832011-09-06 02:00:05 +02002354 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002355 character) required to convert the unicode object. Ignore size argument.
2356
Victor Stinnerd88d9832011-09-06 02:00:05 +02002357 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002358 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002359 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002360static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00002361unicode_aswidechar(PyUnicodeObject *unicode,
2362 wchar_t *w,
2363 Py_ssize_t size)
2364{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002365 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002366 const wchar_t *wstr;
2367
2368 wstr = PyUnicode_AsUnicodeAndSize((PyObject *)unicode, &res);
2369 if (wstr == NULL)
2370 return -1;
2371
Victor Stinner5593d8a2010-10-02 11:11:27 +00002372 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002373 if (size > res)
2374 size = res + 1;
2375 else
2376 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002377 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002378 return res;
2379 }
2380 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002381 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002382}
2383
2384Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002385PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002386 wchar_t *w,
2387 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002388{
2389 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002390 PyErr_BadInternalCall();
2391 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002392 }
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002393 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002394}
2395
Victor Stinner137c34c2010-09-29 10:25:54 +00002396wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002397PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002398 Py_ssize_t *size)
2399{
2400 wchar_t* buffer;
2401 Py_ssize_t buflen;
2402
2403 if (unicode == NULL) {
2404 PyErr_BadInternalCall();
2405 return NULL;
2406 }
2407
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002408 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002409 if (buflen == -1)
2410 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002411 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002412 PyErr_NoMemory();
2413 return NULL;
2414 }
2415
Victor Stinner137c34c2010-09-29 10:25:54 +00002416 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2417 if (buffer == NULL) {
2418 PyErr_NoMemory();
2419 return NULL;
2420 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002421 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002422 if (buflen == -1)
2423 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002424 if (size != NULL)
2425 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002426 return buffer;
2427}
2428
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002429#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002430
Alexander Belopolsky40018472011-02-26 01:02:56 +00002431PyObject *
2432PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002433{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002434 PyObject *v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002435 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002436 PyErr_SetString(PyExc_ValueError,
2437 "chr() arg not in range(0x110000)");
2438 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002439 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002440
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002441 if (ordinal < 256)
2442 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002443
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002444 v = PyUnicode_New(1, ordinal);
2445 if (v == NULL)
2446 return NULL;
2447 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
2448 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002449}
2450
Alexander Belopolsky40018472011-02-26 01:02:56 +00002451PyObject *
2452PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002453{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002454 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002455 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002456 if (PyUnicode_CheckExact(obj)) {
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002457 if (PyUnicode_READY(obj))
2458 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002459 Py_INCREF(obj);
2460 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002461 }
2462 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002463 /* For a Unicode subtype that's not a Unicode object,
2464 return a true Unicode object with the same data. */
Victor Stinner2219e0a2011-10-01 01:16:59 +02002465 return PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002466 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002467 PyErr_Format(PyExc_TypeError,
2468 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002469 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002470 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002471}
2472
Alexander Belopolsky40018472011-02-26 01:02:56 +00002473PyObject *
2474PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002475 const char *encoding,
2476 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002477{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002478 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002479 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002480
Guido van Rossumd57fd912000-03-10 22:53:23 +00002481 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002482 PyErr_BadInternalCall();
2483 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002484 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002485
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002486 /* Decoding bytes objects is the most common case and should be fast */
2487 if (PyBytes_Check(obj)) {
2488 if (PyBytes_GET_SIZE(obj) == 0) {
2489 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002490 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002491 }
2492 else {
2493 v = PyUnicode_Decode(
2494 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2495 encoding, errors);
2496 }
2497 return v;
2498 }
2499
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002500 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002501 PyErr_SetString(PyExc_TypeError,
2502 "decoding str is not supported");
2503 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002504 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002505
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002506 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2507 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2508 PyErr_Format(PyExc_TypeError,
2509 "coercing to str: need bytes, bytearray "
2510 "or buffer-like object, %.80s found",
2511 Py_TYPE(obj)->tp_name);
2512 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002513 }
Tim Petersced69f82003-09-16 20:30:58 +00002514
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002515 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002516 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002517 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002518 }
Tim Petersced69f82003-09-16 20:30:58 +00002519 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002520 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002521
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002522 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002523 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002524}
2525
/* Write a normalized copy of `encoding` into `lower`: upper case letters
   become lower case and '_' becomes '-', so that e.g. UTF_8 is recognized.
   `lower_len` is the capacity of `lower`, terminating '\0' included.

   Return 1 on success, 0 if the name does not fit (encoding is longer than
   lower_len-1); in the latter case `lower` is left unterminated. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src = encoding;
    char *dst = lower;
    /* Last slot of the output buffer, reserved for the terminator. */
    char *const last = &lower[lower_len - 1];

    for (; *src != '\0'; src++) {
        char c;

        if (dst == last)
            return 0;

        c = *src;
        if (Py_ISUPPER(c))
            c = Py_TOLOWER(c);
        else if (c == '_')
            c = '-';
        *dst++ = c;
    }
    *dst = '\0';
    return 1;
}
2558
Alexander Belopolsky40018472011-02-26 01:02:56 +00002559PyObject *
2560PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002561 Py_ssize_t size,
2562 const char *encoding,
2563 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002564{
2565 PyObject *buffer = NULL, *unicode;
2566 Py_buffer info;
2567 char lower[11]; /* Enough for any encoding shortcut */
2568
2569 if (encoding == NULL)
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002570 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00002571
2572 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002573 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002574 if ((strcmp(lower, "utf-8") == 0) ||
2575 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002576 return PyUnicode_DecodeUTF8(s, size, errors);
2577 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002578 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002579 (strcmp(lower, "iso-8859-1") == 0))
2580 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002581#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002582 else if (strcmp(lower, "mbcs") == 0)
2583 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002584#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002585 else if (strcmp(lower, "ascii") == 0)
2586 return PyUnicode_DecodeASCII(s, size, errors);
2587 else if (strcmp(lower, "utf-16") == 0)
2588 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2589 else if (strcmp(lower, "utf-32") == 0)
2590 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2591 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002592
2593 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002594 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002595 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002596 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002597 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002598 if (buffer == NULL)
2599 goto onError;
2600 unicode = PyCodec_Decode(buffer, encoding, errors);
2601 if (unicode == NULL)
2602 goto onError;
2603 if (!PyUnicode_Check(unicode)) {
2604 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002605 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002606 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002607 Py_DECREF(unicode);
2608 goto onError;
2609 }
2610 Py_DECREF(buffer);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02002611 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002612 Py_DECREF(unicode);
2613 return NULL;
2614 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002615 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002616
Benjamin Peterson29060642009-01-31 22:14:21 +00002617 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002618 Py_XDECREF(buffer);
2619 return NULL;
2620}
2621
Alexander Belopolsky40018472011-02-26 01:02:56 +00002622PyObject *
2623PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002624 const char *encoding,
2625 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002626{
2627 PyObject *v;
2628
2629 if (!PyUnicode_Check(unicode)) {
2630 PyErr_BadArgument();
2631 goto onError;
2632 }
2633
2634 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002635 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002636
2637 /* Decode via the codec registry */
2638 v = PyCodec_Decode(unicode, encoding, errors);
2639 if (v == NULL)
2640 goto onError;
2641 return v;
2642
Benjamin Peterson29060642009-01-31 22:14:21 +00002643 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002644 return NULL;
2645}
2646
Alexander Belopolsky40018472011-02-26 01:02:56 +00002647PyObject *
2648PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002649 const char *encoding,
2650 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002651{
2652 PyObject *v;
2653
2654 if (!PyUnicode_Check(unicode)) {
2655 PyErr_BadArgument();
2656 goto onError;
2657 }
2658
2659 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002660 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002661
2662 /* Decode via the codec registry */
2663 v = PyCodec_Decode(unicode, encoding, errors);
2664 if (v == NULL)
2665 goto onError;
2666 if (!PyUnicode_Check(v)) {
2667 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002668 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002669 Py_TYPE(v)->tp_name);
2670 Py_DECREF(v);
2671 goto onError;
2672 }
2673 return v;
2674
Benjamin Peterson29060642009-01-31 22:14:21 +00002675 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002676 return NULL;
2677}
2678
Alexander Belopolsky40018472011-02-26 01:02:56 +00002679PyObject *
2680PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002681 Py_ssize_t size,
2682 const char *encoding,
2683 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002684{
2685 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002686
Guido van Rossumd57fd912000-03-10 22:53:23 +00002687 unicode = PyUnicode_FromUnicode(s, size);
2688 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002689 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002690 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
2691 Py_DECREF(unicode);
2692 return v;
2693}
2694
Alexander Belopolsky40018472011-02-26 01:02:56 +00002695PyObject *
2696PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002697 const char *encoding,
2698 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002699{
2700 PyObject *v;
2701
2702 if (!PyUnicode_Check(unicode)) {
2703 PyErr_BadArgument();
2704 goto onError;
2705 }
2706
2707 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002708 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002709
2710 /* Encode via the codec registry */
2711 v = PyCodec_Encode(unicode, encoding, errors);
2712 if (v == NULL)
2713 goto onError;
2714 return v;
2715
Benjamin Peterson29060642009-01-31 22:14:21 +00002716 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002717 return NULL;
2718}
2719
Victor Stinnerad158722010-10-27 00:25:46 +00002720PyObject *
2721PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00002722{
Victor Stinner99b95382011-07-04 14:23:54 +02002723#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002724 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2725 PyUnicode_GET_SIZE(unicode),
2726 NULL);
2727#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002728 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00002729#else
Victor Stinner793b5312011-04-27 00:24:21 +02002730 PyInterpreterState *interp = PyThreadState_GET()->interp;
2731 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2732 cannot use it to encode and decode filenames before it is loaded. Load
2733 the Python codec requires to encode at least its own filename. Use the C
2734 version of the locale codec until the codec registry is initialized and
2735 the Python codec is loaded.
2736
2737 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2738 cannot only rely on it: check also interp->fscodec_initialized for
2739 subinterpreters. */
2740 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00002741 return PyUnicode_AsEncodedString(unicode,
2742 Py_FileSystemDefaultEncoding,
2743 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00002744 }
2745 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002746 /* locale encoding with surrogateescape */
2747 wchar_t *wchar;
2748 char *bytes;
2749 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00002750 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002751
2752 wchar = PyUnicode_AsWideCharString(unicode, NULL);
2753 if (wchar == NULL)
2754 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002755 bytes = _Py_wchar2char(wchar, &error_pos);
2756 if (bytes == NULL) {
2757 if (error_pos != (size_t)-1) {
2758 char *errmsg = strerror(errno);
2759 PyObject *exc = NULL;
2760 if (errmsg == NULL)
2761 errmsg = "Py_wchar2char() failed";
2762 raise_encode_exception(&exc,
2763 "filesystemencoding",
2764 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
2765 error_pos, error_pos+1,
2766 errmsg);
2767 Py_XDECREF(exc);
2768 }
2769 else
2770 PyErr_NoMemory();
2771 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002772 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002773 }
2774 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002775
2776 bytes_obj = PyBytes_FromString(bytes);
2777 PyMem_Free(bytes);
2778 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00002779 }
Victor Stinnerad158722010-10-27 00:25:46 +00002780#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00002781}
2782
Alexander Belopolsky40018472011-02-26 01:02:56 +00002783PyObject *
2784PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002785 const char *encoding,
2786 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002787{
2788 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00002789 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00002790
Guido van Rossumd57fd912000-03-10 22:53:23 +00002791 if (!PyUnicode_Check(unicode)) {
2792 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002793 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002794 }
Fred Drakee4315f52000-05-09 19:53:39 +00002795
Victor Stinner2f283c22011-03-02 01:21:46 +00002796 if (encoding == NULL) {
2797 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002798 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002799 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002800 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner2f283c22011-03-02 01:21:46 +00002801 }
Fred Drakee4315f52000-05-09 19:53:39 +00002802
2803 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002804 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002805 if ((strcmp(lower, "utf-8") == 0) ||
2806 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00002807 {
Victor Stinner2f283c22011-03-02 01:21:46 +00002808 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002809 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002810 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002811 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00002812 }
Victor Stinner37296e82010-06-10 13:36:23 +00002813 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002814 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002815 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002816 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002817#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002818 else if (strcmp(lower, "mbcs") == 0)
2819 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2820 PyUnicode_GET_SIZE(unicode),
2821 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002822#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002823 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002824 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00002825 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002826
2827 /* Encode via the codec registry */
2828 v = PyCodec_Encode(unicode, encoding, errors);
2829 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002830 return NULL;
2831
2832 /* The normal path */
2833 if (PyBytes_Check(v))
2834 return v;
2835
2836 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002837 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002838 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002839 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002840
2841 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
2842 "encoder %s returned bytearray instead of bytes",
2843 encoding);
2844 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002845 Py_DECREF(v);
2846 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002847 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002848
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002849 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
2850 Py_DECREF(v);
2851 return b;
2852 }
2853
2854 PyErr_Format(PyExc_TypeError,
2855 "encoder did not return a bytes object (type=%.400s)",
2856 Py_TYPE(v)->tp_name);
2857 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002858 return NULL;
2859}
2860
Alexander Belopolsky40018472011-02-26 01:02:56 +00002861PyObject *
2862PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002863 const char *encoding,
2864 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002865{
2866 PyObject *v;
2867
2868 if (!PyUnicode_Check(unicode)) {
2869 PyErr_BadArgument();
2870 goto onError;
2871 }
2872
2873 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002874 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002875
2876 /* Encode via the codec registry */
2877 v = PyCodec_Encode(unicode, encoding, errors);
2878 if (v == NULL)
2879 goto onError;
2880 if (!PyUnicode_Check(v)) {
2881 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002882 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002883 Py_TYPE(v)->tp_name);
2884 Py_DECREF(v);
2885 goto onError;
2886 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002887 return v;
Tim Petersced69f82003-09-16 20:30:58 +00002888
Benjamin Peterson29060642009-01-31 22:14:21 +00002889 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002890 return NULL;
2891}
2892
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002893PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00002894PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002895 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00002896 return PyUnicode_DecodeFSDefaultAndSize(s, size);
2897}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002898
Christian Heimes5894ba72007-11-04 11:43:14 +00002899PyObject*
2900PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
2901{
Victor Stinner99b95382011-07-04 14:23:54 +02002902#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002903 return PyUnicode_DecodeMBCS(s, size, NULL);
2904#elif defined(__APPLE__)
2905 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
2906#else
Victor Stinner793b5312011-04-27 00:24:21 +02002907 PyInterpreterState *interp = PyThreadState_GET()->interp;
2908 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2909 cannot use it to encode and decode filenames before it is loaded. Load
2910 the Python codec requires to encode at least its own filename. Use the C
2911 version of the locale codec until the codec registry is initialized and
2912 the Python codec is loaded.
2913
2914 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2915 cannot only rely on it: check also interp->fscodec_initialized for
2916 subinterpreters. */
2917 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002918 return PyUnicode_Decode(s, size,
2919 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00002920 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002921 }
2922 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002923 /* locale encoding with surrogateescape */
2924 wchar_t *wchar;
2925 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00002926 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002927
2928 if (s[size] != '\0' || size != strlen(s)) {
2929 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
2930 return NULL;
2931 }
2932
Victor Stinner168e1172010-10-16 23:16:16 +00002933 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002934 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00002935 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002936
Victor Stinner168e1172010-10-16 23:16:16 +00002937 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002938 PyMem_Free(wchar);
2939 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002940 }
Victor Stinnerad158722010-10-27 00:25:46 +00002941#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002942}
2943
Martin v. Löwis011e8422009-05-05 04:43:17 +00002944
2945int
2946PyUnicode_FSConverter(PyObject* arg, void* addr)
2947{
2948 PyObject *output = NULL;
2949 Py_ssize_t size;
2950 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00002951 if (arg == NULL) {
2952 Py_DECREF(*(PyObject**)addr);
2953 return 1;
2954 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00002955 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00002956 output = arg;
2957 Py_INCREF(output);
2958 }
2959 else {
2960 arg = PyUnicode_FromObject(arg);
2961 if (!arg)
2962 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00002963 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00002964 Py_DECREF(arg);
2965 if (!output)
2966 return 0;
2967 if (!PyBytes_Check(output)) {
2968 Py_DECREF(output);
2969 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
2970 return 0;
2971 }
2972 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00002973 size = PyBytes_GET_SIZE(output);
2974 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00002975 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05002976 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00002977 Py_DECREF(output);
2978 return 0;
2979 }
2980 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00002981 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00002982}
2983
2984
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002985int
2986PyUnicode_FSDecoder(PyObject* arg, void* addr)
2987{
2988 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002989 if (arg == NULL) {
2990 Py_DECREF(*(PyObject**)addr);
2991 return 1;
2992 }
2993 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002994 if (PyUnicode_READY(arg))
2995 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002996 output = arg;
2997 Py_INCREF(output);
2998 }
2999 else {
3000 arg = PyBytes_FromObject(arg);
3001 if (!arg)
3002 return 0;
3003 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3004 PyBytes_GET_SIZE(arg));
3005 Py_DECREF(arg);
3006 if (!output)
3007 return 0;
3008 if (!PyUnicode_Check(output)) {
3009 Py_DECREF(output);
3010 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3011 return 0;
3012 }
3013 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003014 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
3015 PyUnicode_GET_LENGTH(output), 0, 1)) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003016 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3017 Py_DECREF(output);
3018 return 0;
3019 }
3020 *(PyObject**)addr = output;
3021 return Py_CLEANUP_SUPPORTED;
3022}
3023
3024
Martin v. Löwis5b222132007-06-10 09:51:05 +00003025char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003026PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003027{
Christian Heimesf3863112007-11-22 07:46:41 +00003028 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003029 PyUnicodeObject *u = (PyUnicodeObject *)unicode;
3030
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003031 if (!PyUnicode_Check(unicode)) {
3032 PyErr_BadArgument();
3033 return NULL;
3034 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003035 if (PyUnicode_READY(u) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003036 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003037
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003038 if (PyUnicode_UTF8(unicode) == NULL) {
3039 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003040 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3041 if (bytes == NULL)
3042 return NULL;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003043 _PyUnicode_UTF8(u) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3044 if (_PyUnicode_UTF8(u) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003045 Py_DECREF(bytes);
3046 return NULL;
3047 }
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003048 _PyUnicode_UTF8_LENGTH(u) = PyBytes_GET_SIZE(bytes);
3049 Py_MEMCPY(_PyUnicode_UTF8(u), PyBytes_AS_STRING(bytes), _PyUnicode_UTF8_LENGTH(u) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003050 Py_DECREF(bytes);
3051 }
3052
3053 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003054 *psize = PyUnicode_UTF8_LENGTH(unicode);
3055 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003056}
3057
3058char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003059PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003060{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003061 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3062}
3063
3064#ifdef Py_DEBUG
3065int unicode_as_unicode_calls = 0;
3066#endif
3067
3068
3069Py_UNICODE *
3070PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3071{
3072 PyUnicodeObject *u;
3073 const unsigned char *one_byte;
3074#if SIZEOF_WCHAR_T == 4
3075 const Py_UCS2 *two_bytes;
3076#else
3077 const Py_UCS4 *four_bytes;
3078 const Py_UCS4 *ucs4_end;
3079 Py_ssize_t num_surrogates;
3080#endif
3081 wchar_t *w;
3082 wchar_t *wchar_end;
3083
3084 if (!PyUnicode_Check(unicode)) {
3085 PyErr_BadArgument();
3086 return NULL;
3087 }
3088 u = (PyUnicodeObject*)unicode;
3089 if (_PyUnicode_WSTR(u) == NULL) {
3090 /* Non-ASCII compact unicode object */
3091 assert(_PyUnicode_KIND(u) != 0);
3092 assert(PyUnicode_IS_READY(u));
3093
3094#ifdef Py_DEBUG
3095 ++unicode_as_unicode_calls;
3096#endif
3097
3098 if (PyUnicode_KIND(u) == PyUnicode_4BYTE_KIND) {
3099#if SIZEOF_WCHAR_T == 2
3100 four_bytes = PyUnicode_4BYTE_DATA(u);
3101 ucs4_end = four_bytes + _PyUnicode_LENGTH(u);
3102 num_surrogates = 0;
3103
3104 for (; four_bytes < ucs4_end; ++four_bytes) {
3105 if (*four_bytes > 0xFFFF)
3106 ++num_surrogates;
3107 }
3108
3109 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(
3110 sizeof(wchar_t) * (_PyUnicode_LENGTH(u) + 1 + num_surrogates));
3111 if (!_PyUnicode_WSTR(u)) {
3112 PyErr_NoMemory();
3113 return NULL;
3114 }
3115 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u) + num_surrogates;
3116
3117 w = _PyUnicode_WSTR(u);
3118 wchar_end = w + _PyUnicode_WSTR_LENGTH(u);
3119 four_bytes = PyUnicode_4BYTE_DATA(u);
3120 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3121 if (*four_bytes > 0xFFFF) {
3122 /* encode surrogate pair in this case */
3123 *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10);
3124 *w = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF);
3125 }
3126 else
3127 *w = *four_bytes;
3128
3129 if (w > wchar_end) {
3130 assert(0 && "Miscalculated string end");
3131 }
3132 }
3133 *w = 0;
3134#else
3135 /* sizeof(wchar_t) == 4 */
3136 Py_FatalError("Impossible unicode object state, wstr and str "
3137 "should share memory already.");
3138 return NULL;
3139#endif
3140 }
3141 else {
3142 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3143 (_PyUnicode_LENGTH(u) + 1));
3144 if (!_PyUnicode_WSTR(u)) {
3145 PyErr_NoMemory();
3146 return NULL;
3147 }
3148 if (!PyUnicode_IS_COMPACT_ASCII(u))
3149 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u);
3150 w = _PyUnicode_WSTR(u);
3151 wchar_end = w + _PyUnicode_LENGTH(u);
3152
3153 if (PyUnicode_KIND(u) == PyUnicode_1BYTE_KIND) {
3154 one_byte = PyUnicode_1BYTE_DATA(u);
3155 for (; w < wchar_end; ++one_byte, ++w)
3156 *w = *one_byte;
3157 /* null-terminate the wstr */
3158 *w = 0;
3159 }
3160 else if (PyUnicode_KIND(u) == PyUnicode_2BYTE_KIND) {
3161#if SIZEOF_WCHAR_T == 4
3162 two_bytes = PyUnicode_2BYTE_DATA(u);
3163 for (; w < wchar_end; ++two_bytes, ++w)
3164 *w = *two_bytes;
3165 /* null-terminate the wstr */
3166 *w = 0;
3167#else
3168 /* sizeof(wchar_t) == 2 */
3169 PyObject_FREE(_PyUnicode_WSTR(u));
3170 _PyUnicode_WSTR(u) = NULL;
3171 Py_FatalError("Impossible unicode object state, wstr "
3172 "and str should share memory already.");
3173 return NULL;
3174#endif
3175 }
3176 else {
3177 assert(0 && "This should never happen.");
3178 }
3179 }
3180 }
3181 if (size != NULL)
3182 *size = PyUnicode_WSTR_LENGTH(u);
3183 return _PyUnicode_WSTR(u);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003184}
3185
Alexander Belopolsky40018472011-02-26 01:02:56 +00003186Py_UNICODE *
3187PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003188{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003189 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003190}
3191
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003192
Alexander Belopolsky40018472011-02-26 01:02:56 +00003193Py_ssize_t
3194PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003195{
3196 if (!PyUnicode_Check(unicode)) {
3197 PyErr_BadArgument();
3198 goto onError;
3199 }
3200 return PyUnicode_GET_SIZE(unicode);
3201
Benjamin Peterson29060642009-01-31 22:14:21 +00003202 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003203 return -1;
3204}
3205
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003206Py_ssize_t
3207PyUnicode_GetLength(PyObject *unicode)
3208{
Victor Stinner5a706cf2011-10-02 00:36:53 +02003209 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003210 PyErr_BadArgument();
3211 return -1;
3212 }
3213
3214 return PyUnicode_GET_LENGTH(unicode);
3215}
3216
3217Py_UCS4
3218PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3219{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003220 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3221 PyErr_BadArgument();
3222 return (Py_UCS4)-1;
3223 }
3224 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3225 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003226 return (Py_UCS4)-1;
3227 }
3228 return PyUnicode_READ_CHAR(unicode, index);
3229}
3230
3231int
3232PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3233{
3234 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003235 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003236 return -1;
3237 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02003238 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3239 PyErr_SetString(PyExc_IndexError, "string index out of range");
3240 return -1;
3241 }
3242 if (_PyUnicode_Dirty(unicode))
3243 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003244 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3245 index, ch);
3246 return 0;
3247}
3248
/* Return the name of the default encoding.  The value is the fixed,
   statically allocated string "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3254
Victor Stinner554f3f02010-06-16 23:33:54 +00003255/* create or adjust a UnicodeDecodeError */
3256static void
3257make_decode_exception(PyObject **exceptionObject,
3258 const char *encoding,
3259 const char *input, Py_ssize_t length,
3260 Py_ssize_t startpos, Py_ssize_t endpos,
3261 const char *reason)
3262{
3263 if (*exceptionObject == NULL) {
3264 *exceptionObject = PyUnicodeDecodeError_Create(
3265 encoding, input, length, startpos, endpos, reason);
3266 }
3267 else {
3268 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3269 goto onError;
3270 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3271 goto onError;
3272 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3273 goto onError;
3274 }
3275 return;
3276
3277onError:
3278 Py_DECREF(*exceptionObject);
3279 *exceptionObject = NULL;
3280}
3281
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003282/* error handling callback helper:
3283 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00003284 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003285 and adjust various state variables.
3286 return 0 on success, -1 on error
3287*/
3288
Alexander Belopolsky40018472011-02-26 01:02:56 +00003289static int
3290unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003291 const char *encoding, const char *reason,
3292 const char **input, const char **inend, Py_ssize_t *startinpos,
3293 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
3294 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003295{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003296 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003297
3298 PyObject *restuple = NULL;
3299 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003300 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003301 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003302 Py_ssize_t requiredsize;
3303 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003304 const Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003305 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003306 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003307 int res = -1;
3308
3309 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003310 *errorHandler = PyCodec_LookupError(errors);
3311 if (*errorHandler == NULL)
3312 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003313 }
3314
Victor Stinner554f3f02010-06-16 23:33:54 +00003315 make_decode_exception(exceptionObject,
3316 encoding,
3317 *input, *inend - *input,
3318 *startinpos, *endinpos,
3319 reason);
3320 if (*exceptionObject == NULL)
3321 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003322
3323 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3324 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003325 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003326 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00003327 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00003328 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003329 }
3330 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003331 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003332
3333 /* Copy back the bytes variables, which might have been modified by the
3334 callback */
3335 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3336 if (!inputobj)
3337 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00003338 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003339 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00003340 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003341 *input = PyBytes_AS_STRING(inputobj);
3342 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003343 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00003344 /* we can DECREF safely, as the exception has another reference,
3345 so the object won't go away. */
3346 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003347
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003348 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003349 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003350 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003351 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3352 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003353 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003354
3355 /* need more space? (at least enough for what we
3356 have+the replacement+the rest of the string (starting
3357 at the new input position), so we won't have to check space
3358 when there are no errors in the rest of the string) */
3359 repptr = PyUnicode_AS_UNICODE(repunicode);
3360 repsize = PyUnicode_GET_SIZE(repunicode);
3361 requiredsize = *outpos + repsize + insize-newpos;
3362 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003363 if (requiredsize<2*outsize)
3364 requiredsize = 2*outsize;
Victor Stinnerfe226c02011-10-03 03:52:20 +02003365 if (PyUnicode_Resize((PyObject**)output, requiredsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003366 goto onError;
3367 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003368 }
3369 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003370 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003371 Py_UNICODE_COPY(*outptr, repptr, repsize);
3372 *outptr += repsize;
3373 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003374
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003375 /* we made it! */
3376 res = 0;
3377
Benjamin Peterson29060642009-01-31 22:14:21 +00003378 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003379 Py_XDECREF(restuple);
3380 return res;
3381}
3382
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three simple macros defining base-64.
   Note: all macros below evaluate their character argument more than once,
   so do not pass expressions with side effects. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')

/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

/* Lookup table only: it is indexed by ASCII code and never written to. */
static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * The range test on c comes first so that utf7_category is never indexed
 * out of bounds.  directO and directWS are parenthesized so that compound
 * flag expressions (e.g. "a || b") combine correctly with the && tests. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003463
Alexander Belopolsky40018472011-02-26 01:02:56 +00003464PyObject *
3465PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003466 Py_ssize_t size,
3467 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003468{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003469 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3470}
3471
Antoine Pitrou244651a2009-05-04 18:56:13 +00003472/* The decoder. The only state we preserve is our read position,
3473 * i.e. how many characters we have consumed. So if we end in the
3474 * middle of a shift sequence we have to back off the read position
3475 * and the output to the beginning of the sequence, otherwise we lose
3476 * all the shift state (seen bits, number of bits seen, high
3477 * surrogate). */
3478
Alexander Belopolsky40018472011-02-26 01:02:56 +00003479PyObject *
3480PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003481 Py_ssize_t size,
3482 const char *errors,
3483 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003484{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003485 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003486 Py_ssize_t startinpos;
3487 Py_ssize_t endinpos;
3488 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003489 const char *e;
3490 PyUnicodeObject *unicode;
3491 Py_UNICODE *p;
3492 const char *errmsg = "";
3493 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003494 Py_UNICODE *shiftOutStart;
3495 unsigned int base64bits = 0;
3496 unsigned long base64buffer = 0;
3497 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003498 PyObject *errorHandler = NULL;
3499 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003500
3501 unicode = _PyUnicode_New(size);
3502 if (!unicode)
3503 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003504 if (size == 0) {
3505 if (consumed)
3506 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003507 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003508 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003509
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003510 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitrou244651a2009-05-04 18:56:13 +00003511 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003512 e = s + size;
3513
3514 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003515 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00003516 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00003517 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003518
Antoine Pitrou244651a2009-05-04 18:56:13 +00003519 if (inShift) { /* in a base-64 section */
3520 if (IS_BASE64(ch)) { /* consume a base-64 character */
3521 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3522 base64bits += 6;
3523 s++;
3524 if (base64bits >= 16) {
3525 /* we have enough bits for a UTF-16 value */
3526 Py_UNICODE outCh = (Py_UNICODE)
3527 (base64buffer >> (base64bits-16));
3528 base64bits -= 16;
3529 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3530 if (surrogate) {
3531 /* expecting a second surrogate */
3532 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3533#ifdef Py_UNICODE_WIDE
3534 *p++ = (((surrogate & 0x3FF)<<10)
3535 | (outCh & 0x3FF)) + 0x10000;
3536#else
3537 *p++ = surrogate;
3538 *p++ = outCh;
3539#endif
3540 surrogate = 0;
3541 }
3542 else {
3543 surrogate = 0;
3544 errmsg = "second surrogate missing";
3545 goto utf7Error;
3546 }
3547 }
3548 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
3549 /* first surrogate */
3550 surrogate = outCh;
3551 }
3552 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3553 errmsg = "unexpected second surrogate";
3554 goto utf7Error;
3555 }
3556 else {
3557 *p++ = outCh;
3558 }
3559 }
3560 }
3561 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003562 inShift = 0;
3563 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003564 if (surrogate) {
3565 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00003566 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003567 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003568 if (base64bits > 0) { /* left-over bits */
3569 if (base64bits >= 6) {
3570 /* We've seen at least one base-64 character */
3571 errmsg = "partial character in shift sequence";
3572 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003573 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003574 else {
3575 /* Some bits remain; they should be zero */
3576 if (base64buffer != 0) {
3577 errmsg = "non-zero padding bits in shift sequence";
3578 goto utf7Error;
3579 }
3580 }
3581 }
3582 if (ch != '-') {
3583 /* '-' is absorbed; other terminating
3584 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003585 *p++ = ch;
3586 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003587 }
3588 }
3589 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003590 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003591 s++; /* consume '+' */
3592 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003593 s++;
3594 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00003595 }
3596 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003597 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003598 shiftOutStart = p;
3599 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003600 }
3601 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003602 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003603 *p++ = ch;
3604 s++;
3605 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003606 else {
3607 startinpos = s-starts;
3608 s++;
3609 errmsg = "unexpected special character";
3610 goto utf7Error;
3611 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003612 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003613utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003614 outpos = p-PyUnicode_AS_UNICODE(unicode);
3615 endinpos = s-starts;
3616 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003617 errors, &errorHandler,
3618 "utf7", errmsg,
3619 &starts, &e, &startinpos, &endinpos, &exc, &s,
3620 &unicode, &outpos, &p))
3621 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003622 }
3623
Antoine Pitrou244651a2009-05-04 18:56:13 +00003624 /* end of string */
3625
3626 if (inShift && !consumed) { /* in shift sequence, no more to follow */
3627 /* if we're in an inconsistent state, that's an error */
3628 if (surrogate ||
3629 (base64bits >= 6) ||
3630 (base64bits > 0 && base64buffer != 0)) {
3631 outpos = p-PyUnicode_AS_UNICODE(unicode);
3632 endinpos = size;
3633 if (unicode_decode_call_errorhandler(
3634 errors, &errorHandler,
3635 "utf7", "unterminated shift sequence",
3636 &starts, &e, &startinpos, &endinpos, &exc, &s,
3637 &unicode, &outpos, &p))
3638 goto onError;
3639 if (s < e)
3640 goto restart;
3641 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003642 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003643
3644 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003645 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003646 if (inShift) {
3647 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003648 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003649 }
3650 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003651 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003652 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003653 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003654
Victor Stinnerfe226c02011-10-03 03:52:20 +02003655 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003656 goto onError;
3657
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003658 Py_XDECREF(errorHandler);
3659 Py_XDECREF(exc);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02003660 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003661 Py_DECREF(unicode);
3662 return NULL;
3663 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003664 return (PyObject *)unicode;
3665
Benjamin Peterson29060642009-01-31 22:14:21 +00003666 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003667 Py_XDECREF(errorHandler);
3668 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003669 Py_DECREF(unicode);
3670 return NULL;
3671}
3672
3673
Alexander Belopolsky40018472011-02-26 01:02:56 +00003674PyObject *
3675PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003676 Py_ssize_t size,
3677 int base64SetO,
3678 int base64WhiteSpace,
3679 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003680{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003681 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003682 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003683 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003684 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003685 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003686 unsigned int base64bits = 0;
3687 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003688 char * out;
3689 char * start;
3690
3691 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003692 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003693
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003694 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003695 return PyErr_NoMemory();
3696
Antoine Pitrou244651a2009-05-04 18:56:13 +00003697 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003698 if (v == NULL)
3699 return NULL;
3700
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003701 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003702 for (;i < size; ++i) {
3703 Py_UNICODE ch = s[i];
3704
Antoine Pitrou244651a2009-05-04 18:56:13 +00003705 if (inShift) {
3706 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3707 /* shifting out */
3708 if (base64bits) { /* output remaining bits */
3709 *out++ = TO_BASE64(base64buffer << (6-base64bits));
3710 base64buffer = 0;
3711 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003712 }
3713 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003714 /* Characters not in the BASE64 set implicitly unshift the sequence
3715 so no '-' is required, except if the character is itself a '-' */
3716 if (IS_BASE64(ch) || ch == '-') {
3717 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003718 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003719 *out++ = (char) ch;
3720 }
3721 else {
3722 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00003723 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003724 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003725 else { /* not in a shift sequence */
3726 if (ch == '+') {
3727 *out++ = '+';
3728 *out++ = '-';
3729 }
3730 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3731 *out++ = (char) ch;
3732 }
3733 else {
3734 *out++ = '+';
3735 inShift = 1;
3736 goto encode_char;
3737 }
3738 }
3739 continue;
3740encode_char:
3741#ifdef Py_UNICODE_WIDE
3742 if (ch >= 0x10000) {
3743 /* code first surrogate */
3744 base64bits += 16;
3745 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
3746 while (base64bits >= 6) {
3747 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3748 base64bits -= 6;
3749 }
3750 /* prepare second surrogate */
3751 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
3752 }
3753#endif
3754 base64bits += 16;
3755 base64buffer = (base64buffer << 16) | ch;
3756 while (base64bits >= 6) {
3757 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3758 base64bits -= 6;
3759 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00003760 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003761 if (base64bits)
3762 *out++= TO_BASE64(base64buffer << (6-base64bits) );
3763 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003764 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003765 if (_PyBytes_Resize(&v, out - start) < 0)
3766 return NULL;
3767 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003768}
3769
/* Drop the UTF-7 helper macros so that they do not leak into the codecs
   defined below. */
#undef IS_BASE64
#undef FROM_BASE64
#undef TO_BASE64
#undef DECODE_DIRECT
#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003775
Guido van Rossumd57fd912000-03-10 22:53:23 +00003776/* --- UTF-8 Codec -------------------------------------------------------- */
3777
/* Lookup table indexed by the first byte of a UTF-8 sequence. */
static
char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  Zero means
       illegal prefix.  See RFC 3629 for details */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10-1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 20-2F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 30-3F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 40-4F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 50-5F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 60-6F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 90-9F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* A0-AF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
3799
/* Decode size bytes of UTF-8 from s.  This simply forwards to
   PyUnicode_DecodeUTF8Stateful() with a NULL `consumed` pointer and
   returns whatever that call returns. */
PyObject *
PyUnicode_DecodeUTF8(const char *s,
                     Py_ssize_t size,
                     const char *errors)
{
    return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
}
3807
Antoine Pitrouab868312009-01-10 15:40:25 +00003808/* Mask to check or force alignment of a pointer to C 'long' boundaries */
3809#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
3810
3811/* Mask to quickly check whether a C 'long' contains a
3812 non-ASCII, UTF8-encoded char. */
3813#if (SIZEOF_LONG == 8)
3814# define ASCII_CHAR_MASK 0x8080808080808080L
3815#elif (SIZEOF_LONG == 4)
3816# define ASCII_CHAR_MASK 0x80808080L
3817#else
3818# error C 'long' size should be either 4 or 8!
3819#endif
3820
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003821/* Scans a UTF-8 string and returns the maximum character to be expected,
3822 the size of the decoded unicode string and if any major errors were
3823 encountered.
3824
3825 This function does check basic UTF-8 sanity, it does however NOT CHECK
3826 if the string contains surrogates, and if all continuation bytes are
3827 within the correct ranges, these checks are performed in
3828 PyUnicode_DecodeUTF8Stateful.
3829
3830 If it sets has_errors to 1, it means the value of unicode_size and max_char
3831 will be bogus and you should not rely on useful information in them.
3832 */
3833static Py_UCS4
3834utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
3835 Py_ssize_t *unicode_size, Py_ssize_t* consumed,
3836 int *has_errors)
3837{
3838 Py_ssize_t n;
3839 Py_ssize_t char_count = 0;
3840 Py_UCS4 max_char = 127, new_max;
3841 Py_UCS4 upper_bound;
3842 const unsigned char *p = (const unsigned char *)s;
3843 const unsigned char *end = p + string_size;
3844 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
3845 int err = 0;
3846
3847 for (; p < end && !err; ++p, ++char_count) {
3848 /* Only check value if it's not a ASCII char... */
3849 if (*p < 0x80) {
3850 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
3851 an explanation. */
3852 if (!((size_t) p & LONG_PTR_MASK)) {
3853 /* Help register allocation */
3854 register const unsigned char *_p = p;
3855 while (_p < aligned_end) {
3856 unsigned long value = *(unsigned long *) _p;
3857 if (value & ASCII_CHAR_MASK)
3858 break;
3859 _p += SIZEOF_LONG;
3860 char_count += SIZEOF_LONG;
3861 }
3862 p = _p;
3863 if (p == end)
3864 break;
3865 }
3866 }
3867 if (*p >= 0x80) {
3868 n = utf8_code_length[*p];
3869 new_max = max_char;
3870 switch (n) {
3871 /* invalid start byte */
3872 case 0:
3873 err = 1;
3874 break;
3875 case 2:
3876 /* Code points between 0x00FF and 0x07FF inclusive.
3877 Approximate the upper bound of the code point,
3878 if this flips over 255 we can be sure it will be more
3879 than 255 and the string will need 2 bytes per code coint,
3880 if it stays under or equal to 255, we can be sure 1 byte
3881 is enough.
3882 ((*p & 0b00011111) << 6) | 0b00111111 */
3883 upper_bound = ((*p & 0x1F) << 6) | 0x3F;
3884 if (max_char < upper_bound)
3885 new_max = upper_bound;
3886 /* Ensure we track at least that we left ASCII space. */
3887 if (new_max < 128)
3888 new_max = 128;
3889 break;
3890 case 3:
3891 /* Between 0x0FFF and 0xFFFF inclusive, so values are
3892 always > 255 and <= 65535 and will always need 2 bytes. */
3893 if (max_char < 65535)
3894 new_max = 65535;
3895 break;
3896 case 4:
3897 /* Code point will be above 0xFFFF for sure in this case. */
3898 new_max = 65537;
3899 break;
3900 /* Internal error, this should be caught by the first if */
3901 case 1:
3902 default:
3903 assert(0 && "Impossible case in utf8_max_char_and_size");
3904 err = 1;
3905 }
3906 /* Instead of number of overall bytes for this code point,
3907 n containts the number of following bytes: */
3908 --n;
3909 /* Check if the follow up chars are all valid continuation bytes */
3910 if (n >= 1) {
3911 const unsigned char *cont;
3912 if ((p + n) >= end) {
3913 if (consumed == 0)
3914 /* incomplete data, non-incremental decoding */
3915 err = 1;
3916 break;
3917 }
3918 for (cont = p + 1; cont < (p + n); ++cont) {
3919 if ((*cont & 0xc0) != 0x80) {
3920 err = 1;
3921 break;
3922 }
3923 }
3924 p += n;
3925 }
3926 else
3927 err = 1;
3928 max_char = new_max;
3929 }
3930 }
3931
3932 if (unicode_size)
3933 *unicode_size = char_count;
3934 if (has_errors)
3935 *has_errors = err;
3936 return max_char;
3937}
3938
/* Similar to PyUnicode_WRITE but can also write into wstr field
   of the legacy unicode representation.

   Stores `value` at position `index` of `buf`, treating `buf` as an array
   whose element type is selected by `kind`: Py_UNICODE for
   PyUnicode_WCHAR_KIND, unsigned char for PyUnicode_1BYTE_KIND, Py_UCS2 for
   PyUnicode_2BYTE_KIND and Py_UCS4 for anything else.  Only one branch runs,
   so `index` and `value` are evaluated exactly once and may carry side
   effects such as i++. */
#define WRITE_FLEXIBLE_OR_WSTR(kind, buf, index, value) \
    do { \
        const int k_ = (kind); \
        if (k_ == PyUnicode_WCHAR_KIND) \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \
        else if (k_ == PyUnicode_1BYTE_KIND) \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
        else if (k_ == PyUnicode_2BYTE_KIND) \
            ((Py_UCS2 *)(buf))[(index)] = (Py_UCS2)(value); \
        else \
            ((Py_UCS4 *)(buf))[(index)] = (Py_UCS4)(value); \
    } while (0)
3953
Alexander Belopolsky40018472011-02-26 01:02:56 +00003954PyObject *
3955PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003956 Py_ssize_t size,
3957 const char *errors,
3958 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00003959{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003960 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003961 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00003962 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003963 Py_ssize_t startinpos;
3964 Py_ssize_t endinpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00003965 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003966 PyUnicodeObject *unicode;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003967 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003968 PyObject *errorHandler = NULL;
3969 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003970 Py_UCS4 maxchar = 0;
3971 Py_ssize_t unicode_size;
3972 Py_ssize_t i;
3973 int kind;
3974 void *data;
3975 int has_errors;
3976 Py_UNICODE *error_outptr;
3977#if SIZEOF_WCHAR_T == 2
3978 Py_ssize_t wchar_offset = 0;
3979#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003980
Walter Dörwald69652032004-09-07 20:24:22 +00003981 if (size == 0) {
3982 if (consumed)
3983 *consumed = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003984 return (PyObject *)PyUnicode_New(0, 0);
Walter Dörwald69652032004-09-07 20:24:22 +00003985 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003986 maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size,
3987 consumed, &has_errors);
3988 if (has_errors) {
3989 unicode = _PyUnicode_New(size);
3990 if (!unicode)
3991 return NULL;
3992 kind = PyUnicode_WCHAR_KIND;
3993 data = PyUnicode_AS_UNICODE(unicode);
3994 assert(data != NULL);
3995 }
3996 else {
3997 unicode = (PyUnicodeObject *)PyUnicode_New(unicode_size, maxchar);
3998 if (!unicode)
3999 return NULL;
4000 /* When the string is ASCII only, just use memcpy and return.
4001 unicode_size may be != size if there is an incomplete UTF-8
4002 sequence at the end of the ASCII block. */
4003 if (maxchar < 128 && size == unicode_size) {
4004 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
4005 return (PyObject *)unicode;
4006 }
4007 kind = PyUnicode_KIND(unicode);
4008 data = PyUnicode_DATA(unicode);
4009 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004010 /* Unpack UTF-8 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004011 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004012 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00004013 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004014
4015 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004016 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004017
4018 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00004019 /* Fast path for runs of ASCII characters. Given that common UTF-8
4020 input will consist of an overwhelming majority of ASCII
4021 characters, we try to optimize for this case by checking
4022 as many characters as a C 'long' can contain.
4023 First, check if we can do an aligned read, as most CPUs have
4024 a penalty for unaligned reads.
4025 */
4026 if (!((size_t) s & LONG_PTR_MASK)) {
4027 /* Help register allocation */
4028 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004029 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004030 while (_s < aligned_end) {
4031 /* Read a whole long at a time (either 4 or 8 bytes),
4032 and do a fast unrolled copy if it only contains ASCII
4033 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004034 unsigned long value = *(unsigned long *) _s;
4035 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00004036 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004037 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+0, _s[0]);
4038 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+1, _s[1]);
4039 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+2, _s[2]);
4040 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004041#if (SIZEOF_LONG == 8)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004042 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+4, _s[4]);
4043 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+5, _s[5]);
4044 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+6, _s[6]);
4045 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004046#endif
4047 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004048 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00004049 }
4050 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004051 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004052 if (s == e)
4053 break;
4054 ch = (unsigned char)*s;
4055 }
4056 }
4057
4058 if (ch < 0x80) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004059 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004060 s++;
4061 continue;
4062 }
4063
4064 n = utf8_code_length[ch];
4065
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004066 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004067 if (consumed)
4068 break;
4069 else {
4070 errmsg = "unexpected end of data";
4071 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004072 endinpos = startinpos+1;
4073 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
4074 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004075 goto utf8Error;
4076 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004077 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004078
4079 switch (n) {
4080
4081 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00004082 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004083 startinpos = s-starts;
4084 endinpos = startinpos+1;
4085 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004086
4087 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004088 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00004089 startinpos = s-starts;
4090 endinpos = startinpos+1;
4091 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004092
4093 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004094 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00004095 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004096 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004097 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00004098 goto utf8Error;
4099 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004100 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004101 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004102 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004103 break;
4104
4105 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00004106 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4107 will result in surrogates in range d800-dfff. Surrogates are
4108 not valid UTF-8 so they are rejected.
4109 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4110 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00004111 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004112 (s[2] & 0xc0) != 0x80 ||
4113 ((unsigned char)s[0] == 0xE0 &&
4114 (unsigned char)s[1] < 0xA0) ||
4115 ((unsigned char)s[0] == 0xED &&
4116 (unsigned char)s[1] > 0x9F)) {
4117 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004118 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004119 endinpos = startinpos + 1;
4120
4121 /* if s[1] first two bits are 1 and 0, then the invalid
4122 continuation byte is s[2], so increment endinpos by 1,
4123 if not, s[1] is invalid and endinpos doesn't need to
4124 be incremented. */
4125 if ((s[1] & 0xC0) == 0x80)
4126 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004127 goto utf8Error;
4128 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004129 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004130 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004131 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004132 break;
4133
4134 case 4:
4135 if ((s[1] & 0xc0) != 0x80 ||
4136 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004137 (s[3] & 0xc0) != 0x80 ||
4138 ((unsigned char)s[0] == 0xF0 &&
4139 (unsigned char)s[1] < 0x90) ||
4140 ((unsigned char)s[0] == 0xF4 &&
4141 (unsigned char)s[1] > 0x8F)) {
4142 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004143 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004144 endinpos = startinpos + 1;
4145 if ((s[1] & 0xC0) == 0x80) {
4146 endinpos++;
4147 if ((s[2] & 0xC0) == 0x80)
4148 endinpos++;
4149 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004150 goto utf8Error;
4151 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004152 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00004153 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4154 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4155
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004156 /* If the string is flexible or we have native UCS-4, write
4157 directly.. */
4158 if (sizeof(Py_UNICODE) > 2 || kind != PyUnicode_WCHAR_KIND)
4159 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Tim Petersced69f82003-09-16 20:30:58 +00004160
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004161 else {
4162 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00004163
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004164 /* translate from 10000..10FFFF to 0..FFFF */
4165 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00004166
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004167 /* high surrogate = top 10 bits added to D800 */
4168 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4169 (Py_UNICODE)(0xD800 + (ch >> 10)));
4170
4171 /* low surrogate = bottom 10 bits added to DC00 */
4172 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4173 (Py_UNICODE)(0xDC00 + (ch & 0x03FF)));
4174 }
4175#if SIZEOF_WCHAR_T == 2
4176 wchar_offset++;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004177#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00004178 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004179 }
4180 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00004181 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004182
Benjamin Peterson29060642009-01-31 22:14:21 +00004183 utf8Error:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004184 /* If this is not yet a resizable string, make it one.. */
4185 if (kind != PyUnicode_WCHAR_KIND) {
4186 const Py_UNICODE *u;
4187 PyUnicodeObject *new_unicode = _PyUnicode_New(size);
4188 if (!new_unicode)
4189 goto onError;
4190 u = PyUnicode_AsUnicode((PyObject *)unicode);
4191 if (!u)
4192 goto onError;
4193#if SIZEOF_WCHAR_T == 2
4194 i += wchar_offset;
4195#endif
4196 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(new_unicode), u, i);
4197 Py_DECREF(unicode);
4198 unicode = new_unicode;
4199 kind = 0;
4200 data = PyUnicode_AS_UNICODE(new_unicode);
4201 assert(data != NULL);
4202 }
4203 error_outptr = PyUnicode_AS_UNICODE(unicode) + i;
Benjamin Peterson29060642009-01-31 22:14:21 +00004204 if (unicode_decode_call_errorhandler(
4205 errors, &errorHandler,
4206 "utf8", errmsg,
4207 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004208 &unicode, &i, &error_outptr))
Benjamin Peterson29060642009-01-31 22:14:21 +00004209 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004210 /* Update data because unicode_decode_call_errorhandler might have
4211 re-created or resized the unicode object. */
4212 data = PyUnicode_AS_UNICODE(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00004213 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004214 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004215 /* Ensure the unicode_size calculation above was correct: */
4216 assert(kind == PyUnicode_WCHAR_KIND || i == unicode_size);
4217
Walter Dörwald69652032004-09-07 20:24:22 +00004218 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004219 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004220
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004221 /* Adjust length and ready string when it contained errors and
4222 is of the old resizable kind. */
4223 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004224 if (PyUnicode_Resize((PyObject**)&unicode, i) < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004225 goto onError;
4226 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004227
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004228 Py_XDECREF(errorHandler);
4229 Py_XDECREF(exc);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004230 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004231 Py_DECREF(unicode);
4232 return NULL;
4233 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004234 return (PyObject *)unicode;
4235
Benjamin Peterson29060642009-01-31 22:14:21 +00004236 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004237 Py_XDECREF(errorHandler);
4238 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004239 Py_DECREF(unicode);
4240 return NULL;
4241}
4242
/* The helper macro is only needed by PyUnicode_DecodeUTF8Stateful above. */
#undef WRITE_FLEXIBLE_OR_WSTR
Antoine Pitrouab868312009-01-10 15:40:25 +00004244
#ifdef __APPLE__

/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   Decodes size bytes from s into a freshly PyMem_Malloc()ed,
   NUL-terminated wchar_t buffer.  Every byte that does not start a
   well-formed sequence is emitted on its own as 0xDC00 + byte.
   Returns NULL if the buffer cannot be allocated. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    const char *end;
    wchar_t *buffer, *out;
    int seqlen;

    /* Note: size will always be longer than the resulting Unicode
       character count */
    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
        PyErr_NoMemory();
        return NULL;
    }
    buffer = PyMem_Malloc((size + 1) * sizeof(wchar_t));
    if (!buffer)
        return NULL;

    /* Unpack UTF-8 encoded data */
    out = buffer;
    for (end = s + size; s < end; ) {
        /* first byte of the sequence; kept intact for the escape path */
        Py_UCS4 lead = (unsigned char)*s;
        /* decoded code point, only meaningful once well_formed is set */
        Py_UCS4 cp = 0;
        int well_formed = 0;

        if (lead < 0x80) {
            *out++ = (wchar_t)lead;
            s++;
            continue;
        }

        seqlen = utf8_code_length[lead];

        /* A sequence running past the end of the input is never decoded. */
        if (!(s + seqlen > end)) {
            switch (seqlen) {
            case 0:
            case 1:
                /* invalid start byte */
                break;

            case 2:
                if ((s[1] & 0xc0) != 0x80)
                    break;
                cp = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
                assert ((cp > 0x007F) && (cp <= 0x07FF));
                well_formed = 1;
                break;

            case 3:
                /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
                   will result in surrogates in range d800-dfff. Surrogates are
                   not valid UTF-8 so they are rejected.
                   See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
                   (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
                if ((s[1] & 0xc0) != 0x80 ||
                    (s[2] & 0xc0) != 0x80 ||
                    ((unsigned char)s[0] == 0xE0 &&
                     (unsigned char)s[1] < 0xA0) ||
                    ((unsigned char)s[0] == 0xED &&
                     (unsigned char)s[1] > 0x9F))
                    break;
                cp = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
                assert ((cp > 0x07FF) && (cp <= 0xFFFF));
                well_formed = 1;
                break;

            case 4:
                if ((s[1] & 0xc0) != 0x80 ||
                    (s[2] & 0xc0) != 0x80 ||
                    (s[3] & 0xc0) != 0x80 ||
                    ((unsigned char)s[0] == 0xF0 &&
                     (unsigned char)s[1] < 0x90) ||
                    ((unsigned char)s[0] == 0xF4 &&
                     (unsigned char)s[1] > 0x8F))
                    break;
                cp = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
                     ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
                assert ((cp > 0xFFFF) && (cp <= 0x10ffff));
                well_formed = 1;
                break;
            }
        }

        if (!well_formed) {
            /* surrogateescape: emit the offending byte and resync on the
               very next byte */
            *out++ = 0xDC00 + lead;
            s++;
            continue;
        }

        if (seqlen == 4) {
#if SIZEOF_WCHAR_T == 4
            *out++ = (wchar_t)cp;
#else
            /* compute and append the two surrogates: */

            /* translate from 10000..10FFFF to 0..FFFF */
            cp -= 0x10000;

            /* high surrogate = top 10 bits added to D800 */
            *out++ = (wchar_t)(0xD800 + (cp >> 10));

            /* low surrogate = bottom 10 bits added to DC00 */
            *out++ = (wchar_t)(0xDC00 + (cp & 0x03FF));
#endif
        }
        else {
            *out++ = (wchar_t)cp;
        }
        s += seqlen;
    }
    *out = L'\0';
    return buffer;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004359
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004360/* Primary internal function which creates utf8 encoded bytes objects.
4361
4362 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004363 and allocate exactly as much space needed at the end. Else allocate the
4364 maximum possible needed (4 result bytes per Unicode character), and return
4365 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004366*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004367PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004368_PyUnicode_AsUTF8String(PyObject *obj, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004369{
Tim Peters602f7402002-04-27 18:03:26 +00004370#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00004371
Guido van Rossum98297ee2007-11-06 21:34:58 +00004372 Py_ssize_t i; /* index into s of next input byte */
4373 PyObject *result; /* result string object */
4374 char *p; /* next free byte in output buffer */
4375 Py_ssize_t nallocated; /* number of result bytes allocated */
4376 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00004377 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004378 PyObject *errorHandler = NULL;
4379 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004380 int kind;
4381 void *data;
4382 Py_ssize_t size;
4383 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
4384#if SIZEOF_WCHAR_T == 2
4385 Py_ssize_t wchar_offset = 0;
4386#endif
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004387
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004388 if (!PyUnicode_Check(unicode)) {
4389 PyErr_BadArgument();
4390 return NULL;
4391 }
4392
4393 if (PyUnicode_READY(unicode) == -1)
4394 return NULL;
4395
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004396 if (PyUnicode_UTF8(unicode))
4397 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4398 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004399
4400 kind = PyUnicode_KIND(unicode);
4401 data = PyUnicode_DATA(unicode);
4402 size = PyUnicode_GET_LENGTH(unicode);
4403
Tim Peters602f7402002-04-27 18:03:26 +00004404 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004405
Tim Peters602f7402002-04-27 18:03:26 +00004406 if (size <= MAX_SHORT_UNICHARS) {
4407 /* Write into the stack buffer; nallocated can't overflow.
4408 * At the end, we'll allocate exactly as much heap space as it
4409 * turns out we need.
4410 */
4411 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004412 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00004413 p = stackbuf;
4414 }
4415 else {
4416 /* Overallocate on the heap, and give the excess back at the end. */
4417 nallocated = size * 4;
4418 if (nallocated / 4 != size) /* overflow! */
4419 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00004420 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004421 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00004422 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00004423 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004424 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004425
Tim Peters602f7402002-04-27 18:03:26 +00004426 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004427 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004428
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004429 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00004430 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004431 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004432
Guido van Rossumd57fd912000-03-10 22:53:23 +00004433 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00004434 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00004435 *p++ = (char)(0xc0 | (ch >> 6));
4436 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00004437 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004438 Py_ssize_t newpos;
4439 PyObject *rep;
4440 Py_ssize_t repsize, k, startpos;
4441 startpos = i-1;
4442#if SIZEOF_WCHAR_T == 2
4443 startpos += wchar_offset;
Victor Stinner445a6232010-04-22 20:01:57 +00004444#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004445 rep = unicode_encode_call_errorhandler(
4446 errors, &errorHandler, "utf-8", "surrogates not allowed",
4447 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
4448 &exc, startpos, startpos+1, &newpos);
4449 if (!rep)
4450 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004451
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004452 if (PyBytes_Check(rep))
4453 repsize = PyBytes_GET_SIZE(rep);
4454 else
4455 repsize = PyUnicode_GET_SIZE(rep);
4456
4457 if (repsize > 4) {
4458 Py_ssize_t offset;
4459
4460 if (result == NULL)
4461 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004462 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004463 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004464
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004465 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4466 /* integer overflow */
4467 PyErr_NoMemory();
4468 goto error;
4469 }
4470 nallocated += repsize - 4;
4471 if (result != NULL) {
4472 if (_PyBytes_Resize(&result, nallocated) < 0)
4473 goto error;
4474 } else {
4475 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004476 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004477 goto error;
4478 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4479 }
4480 p = PyBytes_AS_STRING(result) + offset;
4481 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004482
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004483 if (PyBytes_Check(rep)) {
4484 char *prep = PyBytes_AS_STRING(rep);
4485 for(k = repsize; k > 0; k--)
4486 *p++ = *prep++;
4487 } else /* rep is unicode */ {
4488 const Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
4489 Py_UNICODE c;
4490
4491 for(k=0; k<repsize; k++) {
4492 c = prep[k];
4493 if (0x80 <= c) {
4494 raise_encode_exception(&exc, "utf-8",
4495 PyUnicode_AS_UNICODE(unicode),
4496 size, i-1, i,
4497 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004498 goto error;
4499 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004500 *p++ = (char)prep[k];
Victor Stinner31be90b2010-04-22 19:38:16 +00004501 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004502 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004503 Py_DECREF(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004504 } else if (ch < 0x10000) {
4505 *p++ = (char)(0xe0 | (ch >> 12));
4506 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4507 *p++ = (char)(0x80 | (ch & 0x3f));
4508 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00004509 /* Encode UCS4 Unicode ordinals */
4510 *p++ = (char)(0xf0 | (ch >> 18));
4511 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4512 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4513 *p++ = (char)(0x80 | (ch & 0x3f));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004514#if SIZEOF_WCHAR_T == 2
4515 wchar_offset++;
4516#endif
Tim Peters602f7402002-04-27 18:03:26 +00004517 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004518 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004519
Guido van Rossum98297ee2007-11-06 21:34:58 +00004520 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004521 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004522 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004523 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004524 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004525 }
4526 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004527 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004528 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004529 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004530 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004531 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004532
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004533 Py_XDECREF(errorHandler);
4534 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004535 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004536 error:
4537 Py_XDECREF(errorHandler);
4538 Py_XDECREF(exc);
4539 Py_XDECREF(result);
4540 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004541
Tim Peters602f7402002-04-27 18:03:26 +00004542#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004543}
4544
Alexander Belopolsky40018472011-02-26 01:02:56 +00004545PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004546PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4547 Py_ssize_t size,
4548 const char *errors)
4549{
4550 PyObject *v, *unicode;
4551
4552 unicode = PyUnicode_FromUnicode(s, size);
4553 if (unicode == NULL)
4554 return NULL;
4555 v = _PyUnicode_AsUTF8String(unicode, errors);
4556 Py_DECREF(unicode);
4557 return v;
4558}
4559
4560PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004561PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004562{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004563 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004564}
4565
Walter Dörwald41980ca2007-08-16 21:55:45 +00004566/* --- UTF-32 Codec ------------------------------------------------------- */
4567
4568PyObject *
4569PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004570 Py_ssize_t size,
4571 const char *errors,
4572 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004573{
4574 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4575}
4576
4577PyObject *
4578PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004579 Py_ssize_t size,
4580 const char *errors,
4581 int *byteorder,
4582 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004583{
4584 const char *starts = s;
4585 Py_ssize_t startinpos;
4586 Py_ssize_t endinpos;
4587 Py_ssize_t outpos;
4588 PyUnicodeObject *unicode;
4589 Py_UNICODE *p;
4590#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004591 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004592 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004593#else
4594 const int pairs = 0;
4595#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00004596 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004597 int bo = 0; /* assume native ordering by default */
4598 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004599 /* Offsets from q for retrieving bytes in the right order. */
4600#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4601 int iorder[] = {0, 1, 2, 3};
4602#else
4603 int iorder[] = {3, 2, 1, 0};
4604#endif
4605 PyObject *errorHandler = NULL;
4606 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004607
Walter Dörwald41980ca2007-08-16 21:55:45 +00004608 q = (unsigned char *)s;
4609 e = q + size;
4610
4611 if (byteorder)
4612 bo = *byteorder;
4613
4614 /* Check for BOM marks (U+FEFF) in the input and adjust current
4615 byte order setting accordingly. In native mode, the leading BOM
4616 mark is skipped, in all other modes, it is copied to the output
4617 stream as-is (giving a ZWNBSP character). */
4618 if (bo == 0) {
4619 if (size >= 4) {
4620 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004621 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004622#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004623 if (bom == 0x0000FEFF) {
4624 q += 4;
4625 bo = -1;
4626 }
4627 else if (bom == 0xFFFE0000) {
4628 q += 4;
4629 bo = 1;
4630 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004631#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004632 if (bom == 0x0000FEFF) {
4633 q += 4;
4634 bo = 1;
4635 }
4636 else if (bom == 0xFFFE0000) {
4637 q += 4;
4638 bo = -1;
4639 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004640#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004641 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004642 }
4643
4644 if (bo == -1) {
4645 /* force LE */
4646 iorder[0] = 0;
4647 iorder[1] = 1;
4648 iorder[2] = 2;
4649 iorder[3] = 3;
4650 }
4651 else if (bo == 1) {
4652 /* force BE */
4653 iorder[0] = 3;
4654 iorder[1] = 2;
4655 iorder[2] = 1;
4656 iorder[3] = 0;
4657 }
4658
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004659 /* On narrow builds we split characters outside the BMP into two
4660 codepoints => count how much extra space we need. */
4661#ifndef Py_UNICODE_WIDE
4662 for (qq = q; qq < e; qq += 4)
4663 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
4664 pairs++;
4665#endif
4666
4667 /* This might be one to much, because of a BOM */
4668 unicode = _PyUnicode_New((size+3)/4+pairs);
4669 if (!unicode)
4670 return NULL;
4671 if (size == 0)
4672 return (PyObject *)unicode;
4673
4674 /* Unpack UTF-32 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004675 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004676
Walter Dörwald41980ca2007-08-16 21:55:45 +00004677 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004678 Py_UCS4 ch;
4679 /* remaining bytes at the end? (size should be divisible by 4) */
4680 if (e-q<4) {
4681 if (consumed)
4682 break;
4683 errmsg = "truncated data";
4684 startinpos = ((const char *)q)-starts;
4685 endinpos = ((const char *)e)-starts;
4686 goto utf32Error;
4687 /* The remaining input chars are ignored if the callback
4688 chooses to skip the input */
4689 }
4690 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4691 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004692
Benjamin Peterson29060642009-01-31 22:14:21 +00004693 if (ch >= 0x110000)
4694 {
4695 errmsg = "codepoint not in range(0x110000)";
4696 startinpos = ((const char *)q)-starts;
4697 endinpos = startinpos+4;
4698 goto utf32Error;
4699 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004700#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004701 if (ch >= 0x10000)
4702 {
4703 *p++ = 0xD800 | ((ch-0x10000) >> 10);
4704 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
4705 }
4706 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00004707#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004708 *p++ = ch;
4709 q += 4;
4710 continue;
4711 utf32Error:
4712 outpos = p-PyUnicode_AS_UNICODE(unicode);
4713 if (unicode_decode_call_errorhandler(
4714 errors, &errorHandler,
4715 "utf32", errmsg,
4716 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
4717 &unicode, &outpos, &p))
4718 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004719 }
4720
4721 if (byteorder)
4722 *byteorder = bo;
4723
4724 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004725 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004726
4727 /* Adjust length */
Victor Stinnerfe226c02011-10-03 03:52:20 +02004728 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004729 goto onError;
4730
4731 Py_XDECREF(errorHandler);
4732 Py_XDECREF(exc);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004733 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004734 Py_DECREF(unicode);
4735 return NULL;
4736 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004737 return (PyObject *)unicode;
4738
Benjamin Peterson29060642009-01-31 22:14:21 +00004739 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00004740 Py_DECREF(unicode);
4741 Py_XDECREF(errorHandler);
4742 Py_XDECREF(exc);
4743 return NULL;
4744}
4745
4746PyObject *
4747PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004748 Py_ssize_t size,
4749 const char *errors,
4750 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004751{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004752 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004753 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004754 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004755#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004756 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004757#else
4758 const int pairs = 0;
4759#endif
4760 /* Offsets from p for storing byte pairs in the right order. */
4761#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4762 int iorder[] = {0, 1, 2, 3};
4763#else
4764 int iorder[] = {3, 2, 1, 0};
4765#endif
4766
Benjamin Peterson29060642009-01-31 22:14:21 +00004767#define STORECHAR(CH) \
4768 do { \
4769 p[iorder[3]] = ((CH) >> 24) & 0xff; \
4770 p[iorder[2]] = ((CH) >> 16) & 0xff; \
4771 p[iorder[1]] = ((CH) >> 8) & 0xff; \
4772 p[iorder[0]] = (CH) & 0xff; \
4773 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00004774 } while(0)
4775
4776 /* In narrow builds we can output surrogate pairs as one codepoint,
4777 so we need less space. */
4778#ifndef Py_UNICODE_WIDE
4779 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00004780 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
4781 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
4782 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004783#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004784 nsize = (size - pairs + (byteorder == 0));
4785 bytesize = nsize * 4;
4786 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004787 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004788 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004789 if (v == NULL)
4790 return NULL;
4791
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004792 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004793 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004794 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004795 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004796 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004797
4798 if (byteorder == -1) {
4799 /* force LE */
4800 iorder[0] = 0;
4801 iorder[1] = 1;
4802 iorder[2] = 2;
4803 iorder[3] = 3;
4804 }
4805 else if (byteorder == 1) {
4806 /* force BE */
4807 iorder[0] = 3;
4808 iorder[1] = 2;
4809 iorder[2] = 1;
4810 iorder[3] = 0;
4811 }
4812
4813 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004814 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004815#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004816 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
4817 Py_UCS4 ch2 = *s;
4818 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
4819 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
4820 s++;
4821 size--;
4822 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004823 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004824#endif
4825 STORECHAR(ch);
4826 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004827
4828 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004829 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004830#undef STORECHAR
4831}
4832
Alexander Belopolsky40018472011-02-26 01:02:56 +00004833PyObject *
4834PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004835{
4836 if (!PyUnicode_Check(unicode)) {
4837 PyErr_BadArgument();
4838 return NULL;
4839 }
4840 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004841 PyUnicode_GET_SIZE(unicode),
4842 NULL,
4843 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004844}
4845
Guido van Rossumd57fd912000-03-10 22:53:23 +00004846/* --- UTF-16 Codec ------------------------------------------------------- */
4847
Tim Peters772747b2001-08-09 22:21:55 +00004848PyObject *
4849PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004850 Py_ssize_t size,
4851 const char *errors,
4852 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004853{
Walter Dörwald69652032004-09-07 20:24:22 +00004854 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
4855}
4856
Antoine Pitrouab868312009-01-10 15:40:25 +00004857/* Two masks for fast checking of whether a C 'long' may contain
4858 UTF16-encoded surrogate characters. This is an efficient heuristic,
4859 assuming that non-surrogate characters with a code point >= 0x8000 are
4860 rare in most input.
4861 FAST_CHAR_MASK is used when the input is in native byte ordering,
4862 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00004863*/
Antoine Pitrouab868312009-01-10 15:40:25 +00004864#if (SIZEOF_LONG == 8)
4865# define FAST_CHAR_MASK 0x8000800080008000L
4866# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
4867#elif (SIZEOF_LONG == 4)
4868# define FAST_CHAR_MASK 0x80008000L
4869# define SWAPPED_FAST_CHAR_MASK 0x00800080L
4870#else
4871# error C 'long' size should be either 4 or 8!
4872#endif
4873
Walter Dörwald69652032004-09-07 20:24:22 +00004874PyObject *
4875PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004876 Py_ssize_t size,
4877 const char *errors,
4878 int *byteorder,
4879 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00004880{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004881 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004882 Py_ssize_t startinpos;
4883 Py_ssize_t endinpos;
4884 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004885 PyUnicodeObject *unicode;
4886 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004887 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00004888 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00004889 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004890 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00004891 /* Offsets from q for retrieving byte pairs in the right order. */
4892#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4893 int ihi = 1, ilo = 0;
4894#else
4895 int ihi = 0, ilo = 1;
4896#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004897 PyObject *errorHandler = NULL;
4898 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004899
4900 /* Note: size will always be longer than the resulting Unicode
4901 character count */
4902 unicode = _PyUnicode_New(size);
4903 if (!unicode)
4904 return NULL;
4905 if (size == 0)
4906 return (PyObject *)unicode;
4907
4908 /* Unpack UTF-16 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004909 p = PyUnicode_AS_UNICODE(unicode);
Tim Peters772747b2001-08-09 22:21:55 +00004910 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00004911 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004912
4913 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00004914 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004915
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004916 /* Check for BOM marks (U+FEFF) in the input and adjust current
4917 byte order setting accordingly. In native mode, the leading BOM
4918 mark is skipped, in all other modes, it is copied to the output
4919 stream as-is (giving a ZWNBSP character). */
4920 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00004921 if (size >= 2) {
4922 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004923#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004924 if (bom == 0xFEFF) {
4925 q += 2;
4926 bo = -1;
4927 }
4928 else if (bom == 0xFFFE) {
4929 q += 2;
4930 bo = 1;
4931 }
Tim Petersced69f82003-09-16 20:30:58 +00004932#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004933 if (bom == 0xFEFF) {
4934 q += 2;
4935 bo = 1;
4936 }
4937 else if (bom == 0xFFFE) {
4938 q += 2;
4939 bo = -1;
4940 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004941#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004942 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004943 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004944
Tim Peters772747b2001-08-09 22:21:55 +00004945 if (bo == -1) {
4946 /* force LE */
4947 ihi = 1;
4948 ilo = 0;
4949 }
4950 else if (bo == 1) {
4951 /* force BE */
4952 ihi = 0;
4953 ilo = 1;
4954 }
Antoine Pitrouab868312009-01-10 15:40:25 +00004955#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4956 native_ordering = ilo < ihi;
4957#else
4958 native_ordering = ilo > ihi;
4959#endif
Tim Peters772747b2001-08-09 22:21:55 +00004960
Antoine Pitrouab868312009-01-10 15:40:25 +00004961 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00004962 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004963 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00004964 /* First check for possible aligned read of a C 'long'. Unaligned
4965 reads are more expensive, better to defer to another iteration. */
4966 if (!((size_t) q & LONG_PTR_MASK)) {
4967 /* Fast path for runs of non-surrogate chars. */
4968 register const unsigned char *_q = q;
4969 Py_UNICODE *_p = p;
4970 if (native_ordering) {
4971 /* Native ordering is simple: as long as the input cannot
4972 possibly contain a surrogate char, do an unrolled copy
4973 of several 16-bit code points to the target object.
4974 The non-surrogate check is done on several input bytes
4975 at a time (as many as a C 'long' can contain). */
4976 while (_q < aligned_end) {
4977 unsigned long data = * (unsigned long *) _q;
4978 if (data & FAST_CHAR_MASK)
4979 break;
4980 _p[0] = ((unsigned short *) _q)[0];
4981 _p[1] = ((unsigned short *) _q)[1];
4982#if (SIZEOF_LONG == 8)
4983 _p[2] = ((unsigned short *) _q)[2];
4984 _p[3] = ((unsigned short *) _q)[3];
4985#endif
4986 _q += SIZEOF_LONG;
4987 _p += SIZEOF_LONG / 2;
4988 }
4989 }
4990 else {
4991 /* Byteswapped ordering is similar, but we must decompose
4992 the copy bytewise, and take care of zero'ing out the
4993 upper bytes if the target object is in 32-bit units
4994 (that is, in UCS-4 builds). */
4995 while (_q < aligned_end) {
4996 unsigned long data = * (unsigned long *) _q;
4997 if (data & SWAPPED_FAST_CHAR_MASK)
4998 break;
4999 /* Zero upper bytes in UCS-4 builds */
5000#if (Py_UNICODE_SIZE > 2)
5001 _p[0] = 0;
5002 _p[1] = 0;
5003#if (SIZEOF_LONG == 8)
5004 _p[2] = 0;
5005 _p[3] = 0;
5006#endif
5007#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00005008 /* Issue #4916; UCS-4 builds on big endian machines must
5009 fill the two last bytes of each 4-byte unit. */
5010#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
5011# define OFF 2
5012#else
5013# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00005014#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00005015 ((unsigned char *) _p)[OFF + 1] = _q[0];
5016 ((unsigned char *) _p)[OFF + 0] = _q[1];
5017 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
5018 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
5019#if (SIZEOF_LONG == 8)
5020 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
5021 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
5022 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
5023 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
5024#endif
5025#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00005026 _q += SIZEOF_LONG;
5027 _p += SIZEOF_LONG / 2;
5028 }
5029 }
5030 p = _p;
5031 q = _q;
5032 if (q >= e)
5033 break;
5034 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005035 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005036
Benjamin Peterson14339b62009-01-31 16:36:08 +00005037 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00005038
5039 if (ch < 0xD800 || ch > 0xDFFF) {
5040 *p++ = ch;
5041 continue;
5042 }
5043
5044 /* UTF-16 code pair: */
5045 if (q > e) {
5046 errmsg = "unexpected end of data";
5047 startinpos = (((const char *)q) - 2) - starts;
5048 endinpos = ((const char *)e) + 1 - starts;
5049 goto utf16Error;
5050 }
5051 if (0xD800 <= ch && ch <= 0xDBFF) {
5052 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
5053 q += 2;
5054 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00005055#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005056 *p++ = ch;
5057 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005058#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005059 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005060#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005061 continue;
5062 }
5063 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005064 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00005065 startinpos = (((const char *)q)-4)-starts;
5066 endinpos = startinpos+2;
5067 goto utf16Error;
5068 }
5069
Benjamin Peterson14339b62009-01-31 16:36:08 +00005070 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005071 errmsg = "illegal encoding";
5072 startinpos = (((const char *)q)-2)-starts;
5073 endinpos = startinpos+2;
5074 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005075
Benjamin Peterson29060642009-01-31 22:14:21 +00005076 utf16Error:
5077 outpos = p - PyUnicode_AS_UNICODE(unicode);
5078 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005079 errors,
5080 &errorHandler,
5081 "utf16", errmsg,
5082 &starts,
5083 (const char **)&e,
5084 &startinpos,
5085 &endinpos,
5086 &exc,
5087 (const char **)&q,
5088 &unicode,
5089 &outpos,
5090 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00005091 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005092 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005093 /* remaining byte at the end? (size should be even) */
5094 if (e == q) {
5095 if (!consumed) {
5096 errmsg = "truncated data";
5097 startinpos = ((const char *)q) - starts;
5098 endinpos = ((const char *)e) + 1 - starts;
5099 outpos = p - PyUnicode_AS_UNICODE(unicode);
5100 if (unicode_decode_call_errorhandler(
5101 errors,
5102 &errorHandler,
5103 "utf16", errmsg,
5104 &starts,
5105 (const char **)&e,
5106 &startinpos,
5107 &endinpos,
5108 &exc,
5109 (const char **)&q,
5110 &unicode,
5111 &outpos,
5112 &p))
5113 goto onError;
5114 /* The remaining input chars are ignored if the callback
5115 chooses to skip the input */
5116 }
5117 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005118
5119 if (byteorder)
5120 *byteorder = bo;
5121
Walter Dörwald69652032004-09-07 20:24:22 +00005122 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005123 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005124
Guido van Rossumd57fd912000-03-10 22:53:23 +00005125 /* Adjust length */
Victor Stinnerfe226c02011-10-03 03:52:20 +02005126 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005127 goto onError;
5128
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005129 Py_XDECREF(errorHandler);
5130 Py_XDECREF(exc);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005131 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005132 Py_DECREF(unicode);
5133 return NULL;
5134 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005135 return (PyObject *)unicode;
5136
Benjamin Peterson29060642009-01-31 22:14:21 +00005137 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005138 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005139 Py_XDECREF(errorHandler);
5140 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005141 return NULL;
5142}
5143
Antoine Pitrouab868312009-01-10 15:40:25 +00005144#undef FAST_CHAR_MASK
5145#undef SWAPPED_FAST_CHAR_MASK
5146
Tim Peters772747b2001-08-09 22:21:55 +00005147PyObject *
5148PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005149 Py_ssize_t size,
5150 const char *errors,
5151 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005152{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005153 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00005154 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005155 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005156#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005157 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005158#else
5159 const int pairs = 0;
5160#endif
Tim Peters772747b2001-08-09 22:21:55 +00005161 /* Offsets from p for storing byte pairs in the right order. */
5162#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5163 int ihi = 1, ilo = 0;
5164#else
5165 int ihi = 0, ilo = 1;
5166#endif
5167
Benjamin Peterson29060642009-01-31 22:14:21 +00005168#define STORECHAR(CH) \
5169 do { \
5170 p[ihi] = ((CH) >> 8) & 0xff; \
5171 p[ilo] = (CH) & 0xff; \
5172 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00005173 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005174
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005175#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005176 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00005177 if (s[i] >= 0x10000)
5178 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005179#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005180 /* 2 * (size + pairs + (byteorder == 0)) */
5181 if (size > PY_SSIZE_T_MAX ||
5182 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005183 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005184 nsize = size + pairs + (byteorder == 0);
5185 bytesize = nsize * 2;
5186 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005187 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005188 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005189 if (v == NULL)
5190 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005191
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005192 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005193 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005194 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005195 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005196 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005197
5198 if (byteorder == -1) {
5199 /* force LE */
5200 ihi = 1;
5201 ilo = 0;
5202 }
5203 else if (byteorder == 1) {
5204 /* force BE */
5205 ihi = 0;
5206 ilo = 1;
5207 }
5208
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005209 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005210 Py_UNICODE ch = *s++;
5211 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005212#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005213 if (ch >= 0x10000) {
5214 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
5215 ch = 0xD800 | ((ch-0x10000) >> 10);
5216 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005217#endif
Tim Peters772747b2001-08-09 22:21:55 +00005218 STORECHAR(ch);
5219 if (ch2)
5220 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005221 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005222
5223 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005224 return v;
Tim Peters772747b2001-08-09 22:21:55 +00005225#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005226}
5227
Alexander Belopolsky40018472011-02-26 01:02:56 +00005228PyObject *
5229PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005230{
5231 if (!PyUnicode_Check(unicode)) {
5232 PyErr_BadArgument();
5233 return NULL;
5234 }
5235 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005236 PyUnicode_GET_SIZE(unicode),
5237 NULL,
5238 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005239}
5240
5241/* --- Unicode Escape Codec ----------------------------------------------- */
5242
/* Helper function for PyUnicode_DecodeUnicodeEscape: determines
   whether all the escapes in the string leave it a valid ASCII string.

   s     -- the escaped input bytes
   size  -- number of bytes at s

   Returns -1 if size is negative, if a non-ASCII byte is present, if the
   input ends right after a backslash, or if an escape is found that may
   take the string out of ASCII range (octal, \x, \u, \U, \N).
   Otherwise returns the length of the buffer required to hold the
   decoded string.
 */
Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *p = (const unsigned char *)s;
    const unsigned char *end;
    Py_ssize_t length = 0;

    /* Reject a negative size before forming `p + size`: computing a
       pointer in front of the buffer is undefined behaviour. */
    if (size < 0)
        return -1;
    end = p + size;

    for (; p < end; ++p) {
        if (*p > 127) {
            /* Non-ASCII */
            return -1;
        }
        else if (*p != '\\') {
            /* Normal character */
            ++length;
        }
        else {
            /* Backslash-escape, check next char */
            ++p;
            /* Escape sequence reaches till end of string or
               non-ASCII follow-up. */
            if (p >= end || *p > 127)
                return -1;
            switch (*p) {
            case '\n':
                /* backslash + \n result in zero characters */
                break;
            case '\\': case '\'': case '\"':
            case 'b': case 'f': case 't':
            case 'n': case 'r': case 'v': case 'a':
                /* single-character escapes decode to one character */
                ++length;
                break;
            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7':
            case 'x': case 'u': case 'U': case 'N':
                /* these do not guarantee ASCII characters */
                return -1;
            default:
                /* unknown escape: count the backslash + the other
                   character */
                length += 2;
            }
        }
    }
    return length;
}
5297
5298/* Similar to PyUnicode_WRITE but either write into wstr field
5299 or treat string as ASCII. */
5300#define WRITE_ASCII_OR_WSTR(kind, buf, index, value) \
5301 do { \
5302 if ((kind) != PyUnicode_WCHAR_KIND) \
5303 ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
5304 else \
5305 ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \
5306 } while (0)
5307
5308#define WRITE_WSTR(buf, index, value) \
5309 assert(kind == PyUnicode_WCHAR_KIND), \
5310 ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value)
5311
5312
Fredrik Lundh06d12682001-01-24 07:59:11 +00005313static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005314
Alexander Belopolsky40018472011-02-26 01:02:56 +00005315PyObject *
5316PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005317 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005318 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005319{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005320 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005321 Py_ssize_t startinpos;
5322 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005323 int j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005324 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005325 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005326 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005327 char* message;
5328 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005329 PyObject *errorHandler = NULL;
5330 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005331 Py_ssize_t ascii_length;
5332 Py_ssize_t i;
5333 int kind;
5334 void *data;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005335
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005336 ascii_length = length_of_escaped_ascii_string(s, size);
5337
5338 /* After length_of_escaped_ascii_string() there are two alternatives,
5339 either the string is pure ASCII with named escapes like \n, etc.
5340 and we determined it's exact size (common case)
5341 or it contains \x, \u, ... escape sequences. then we create a
5342 legacy wchar string and resize it at the end of this function. */
5343 if (ascii_length >= 0) {
5344 v = (PyUnicodeObject *)PyUnicode_New(ascii_length, 127);
5345 if (!v)
5346 goto onError;
5347 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
5348 kind = PyUnicode_1BYTE_KIND;
5349 data = PyUnicode_DATA(v);
5350 }
5351 else {
5352 /* Escaped strings will always be longer than the resulting
5353 Unicode string, so we start with size here and then reduce the
5354 length after conversion to the true value.
5355 (but if the error callback returns a long replacement string
5356 we'll have to allocate more space) */
5357 v = _PyUnicode_New(size);
5358 if (!v)
5359 goto onError;
5360 kind = PyUnicode_WCHAR_KIND;
5361 data = PyUnicode_AS_UNICODE(v);
5362 }
5363
Guido van Rossumd57fd912000-03-10 22:53:23 +00005364 if (size == 0)
5365 return (PyObject *)v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005366 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005367 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005368
Guido van Rossumd57fd912000-03-10 22:53:23 +00005369 while (s < end) {
5370 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005371 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005372 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005373
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005374 if (kind == PyUnicode_WCHAR_KIND) {
5375 assert(i < _PyUnicode_WSTR_LENGTH(v));
5376 }
5377 else {
5378 /* The only case in which i == ascii_length is a backslash
5379 followed by a newline. */
5380 assert(i <= ascii_length);
5381 }
5382
Guido van Rossumd57fd912000-03-10 22:53:23 +00005383 /* Non-escape characters are interpreted as Unicode ordinals */
5384 if (*s != '\\') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005385 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char) *s++);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005386 continue;
5387 }
5388
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005389 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005390 /* \ - Escapes */
5391 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005392 c = *s++;
5393 if (s > end)
5394 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005395
5396 if (kind == PyUnicode_WCHAR_KIND) {
5397 assert(i < _PyUnicode_WSTR_LENGTH(v));
5398 }
5399 else {
5400 /* The only case in which i == ascii_length is a backslash
5401 followed by a newline. */
5402 assert(i < ascii_length || (i == ascii_length && c == '\n'));
5403 }
5404
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005405 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005406
Benjamin Peterson29060642009-01-31 22:14:21 +00005407 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005408 case '\n': break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005409 case '\\': WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); break;
5410 case '\'': WRITE_ASCII_OR_WSTR(kind, data, i++, '\''); break;
5411 case '\"': WRITE_ASCII_OR_WSTR(kind, data, i++, '\"'); break;
5412 case 'b': WRITE_ASCII_OR_WSTR(kind, data, i++, '\b'); break;
5413 /* FF */
5414 case 'f': WRITE_ASCII_OR_WSTR(kind, data, i++, '\014'); break;
5415 case 't': WRITE_ASCII_OR_WSTR(kind, data, i++, '\t'); break;
5416 case 'n': WRITE_ASCII_OR_WSTR(kind, data, i++, '\n'); break;
5417 case 'r': WRITE_ASCII_OR_WSTR(kind, data, i++, '\r'); break;
5418 /* VT */
5419 case 'v': WRITE_ASCII_OR_WSTR(kind, data, i++, '\013'); break;
5420 /* BEL, not classic C */
5421 case 'a': WRITE_ASCII_OR_WSTR(kind, data, i++, '\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005422
Benjamin Peterson29060642009-01-31 22:14:21 +00005423 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005424 case '0': case '1': case '2': case '3':
5425 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005426 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005427 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005428 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005429 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005430 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005431 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005432 WRITE_WSTR(data, i++, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005433 break;
5434
Benjamin Peterson29060642009-01-31 22:14:21 +00005435 /* hex escapes */
5436 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005437 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005438 digits = 2;
5439 message = "truncated \\xXX escape";
5440 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005441
Benjamin Peterson29060642009-01-31 22:14:21 +00005442 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005443 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005444 digits = 4;
5445 message = "truncated \\uXXXX escape";
5446 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005447
Benjamin Peterson29060642009-01-31 22:14:21 +00005448 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005449 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005450 digits = 8;
5451 message = "truncated \\UXXXXXXXX escape";
5452 hexescape:
5453 chr = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005454 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005455 if (s+digits>end) {
5456 endinpos = size;
5457 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005458 errors, &errorHandler,
5459 "unicodeescape", "end of string in escape sequence",
5460 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005461 &v, &i, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005462 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005463 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005464 goto nextByte;
5465 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005466 for (j = 0; j < digits; ++j) {
5467 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005468 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005469 endinpos = (s+j+1)-starts;
5470 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005471 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005472 errors, &errorHandler,
5473 "unicodeescape", message,
5474 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005475 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005476 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005477 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005478 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005479 }
5480 chr = (chr<<4) & ~0xF;
5481 if (c >= '0' && c <= '9')
5482 chr += c - '0';
5483 else if (c >= 'a' && c <= 'f')
5484 chr += 10 + c - 'a';
5485 else
5486 chr += 10 + c - 'A';
5487 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005488 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005489 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005490 /* _decoding_error will have already written into the
5491 target buffer. */
5492 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005493 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005494 /* when we get here, chr is a 32-bit unicode character */
5495 if (chr <= 0xffff)
5496 /* UCS-2 character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005497 WRITE_WSTR(data, i++, chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005498 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005499 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00005500 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00005501#ifdef Py_UNICODE_WIDE
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005502 WRITE_WSTR(data, i++, chr);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005503#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00005504 chr -= 0x10000L;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005505 WRITE_WSTR(data, i++, 0xD800 + (Py_UNICODE) (chr >> 10));
5506 WRITE_WSTR(data, i++, 0xDC00 + (Py_UNICODE) (chr & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005507#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00005508 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005509 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005510 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005511 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005512 errors, &errorHandler,
5513 "unicodeescape", "illegal Unicode character",
5514 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005515 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005516 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005517 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005518 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005519 break;
5520
Benjamin Peterson29060642009-01-31 22:14:21 +00005521 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005522 case 'N':
5523 message = "malformed \\N character escape";
5524 if (ucnhash_CAPI == NULL) {
5525 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005526 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5527 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005528 if (ucnhash_CAPI == NULL)
5529 goto ucnhashError;
5530 }
5531 if (*s == '{') {
5532 const char *start = s+1;
5533 /* look for the closing brace */
5534 while (*s != '}' && s < end)
5535 s++;
5536 if (s > start && s < end && *s == '}') {
5537 /* found a name. look it up in the unicode database */
5538 message = "unknown Unicode character name";
5539 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005540 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
5541 &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005542 goto store;
5543 }
5544 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005545 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005546 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005547 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005548 errors, &errorHandler,
5549 "unicodeescape", message,
5550 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005551 &v, &i, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005552 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005553 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005554 break;
5555
5556 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005557 if (s > end) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005558 assert(kind == PyUnicode_WCHAR_KIND);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005559 message = "\\ at end of string";
5560 s--;
5561 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005562 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005563 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005564 errors, &errorHandler,
5565 "unicodeescape", message,
5566 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005567 &v, &i, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00005568 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005569 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005570 }
5571 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005572 WRITE_ASCII_OR_WSTR(kind, data, i++, '\\');
5573 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005574 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005575 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005576 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005577 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005578 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005579 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005580 /* Ensure the length prediction worked in case of ASCII strings */
5581 assert(kind == PyUnicode_WCHAR_KIND || i == ascii_length);
5582
Victor Stinnerfe226c02011-10-03 03:52:20 +02005583 if (kind == PyUnicode_WCHAR_KIND)
5584 {
5585 if (PyUnicode_Resize((PyObject**)&v, i) < 0)
5586 goto onError;
Victor Stinnerfe226c02011-10-03 03:52:20 +02005587 }
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005588 Py_XDECREF(errorHandler);
5589 Py_XDECREF(exc);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005590 if (_PyUnicode_READY_REPLACE(&v)) {
5591 Py_DECREF(v);
5592 return NULL;
5593 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005594 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00005595
Benjamin Peterson29060642009-01-31 22:14:21 +00005596 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005597 PyErr_SetString(
5598 PyExc_UnicodeError,
5599 "\\N escapes not supported (can't load unicodedata module)"
5600 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005601 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005602 Py_XDECREF(errorHandler);
5603 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005604 return NULL;
5605
Benjamin Peterson29060642009-01-31 22:14:21 +00005606 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005607 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005608 Py_XDECREF(errorHandler);
5609 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005610 return NULL;
5611}
5612
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005613#undef WRITE_ASCII_OR_WSTR
5614#undef WRITE_WSTR
5615
Guido van Rossumd57fd912000-03-10 22:53:23 +00005616/* Return a Unicode-Escape string version of the Unicode object.
5617
5618 If quotes is true, the string is enclosed in u"" or u'' quotes as
5619 appropriate.
5620
5621*/
5622
Walter Dörwald79e913e2007-05-12 11:08:06 +00005623static const char *hexdigits = "0123456789abcdef";
5624
Alexander Belopolsky40018472011-02-26 01:02:56 +00005625PyObject *
5626PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005627 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005628{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005629 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005630 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005631
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005632#ifdef Py_UNICODE_WIDE
5633 const Py_ssize_t expandsize = 10;
5634#else
5635 const Py_ssize_t expandsize = 6;
5636#endif
5637
Thomas Wouters89f507f2006-12-13 04:49:30 +00005638 /* XXX(nnorwitz): rather than over-allocating, it would be
5639 better to choose a different scheme. Perhaps scan the
5640 first N-chars of the string and allocate based on that size.
5641 */
5642 /* Initial allocation is based on the longest-possible unichr
5643 escape.
5644
5645 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5646 unichr, so in this case it's the longest unichr escape. In
5647 narrow (UTF-16) builds this is five chars per source unichr
5648 since there are two unichrs in the surrogate pair, so in narrow
5649 (UTF-16) builds it's not the longest unichr escape.
5650
5651 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5652 so in the narrow (UTF-16) build case it's the longest unichr
5653 escape.
5654 */
5655
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005656 if (size == 0)
5657 return PyBytes_FromStringAndSize(NULL, 0);
5658
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005659 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005660 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005661
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005662 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005663 2
5664 + expandsize*size
5665 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005666 if (repr == NULL)
5667 return NULL;
5668
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005669 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005670
Guido van Rossumd57fd912000-03-10 22:53:23 +00005671 while (size-- > 0) {
5672 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005673
Walter Dörwald79e913e2007-05-12 11:08:06 +00005674 /* Escape backslashes */
5675 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005676 *p++ = '\\';
5677 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005678 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005679 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005680
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00005681#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005682 /* Map 21-bit characters to '\U00xxxxxx' */
5683 else if (ch >= 0x10000) {
5684 *p++ = '\\';
5685 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005686 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
5687 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
5688 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
5689 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
5690 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
5691 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
5692 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
5693 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005694 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005695 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005696#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005697 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5698 else if (ch >= 0xD800 && ch < 0xDC00) {
5699 Py_UNICODE ch2;
5700 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00005701
Benjamin Peterson29060642009-01-31 22:14:21 +00005702 ch2 = *s++;
5703 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005704 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005705 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5706 *p++ = '\\';
5707 *p++ = 'U';
5708 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
5709 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
5710 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
5711 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
5712 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
5713 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
5714 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
5715 *p++ = hexdigits[ucs & 0x0000000F];
5716 continue;
5717 }
5718 /* Fall through: isolated surrogates are copied as-is */
5719 s--;
5720 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005721 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005722#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005723
Guido van Rossumd57fd912000-03-10 22:53:23 +00005724 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005725 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005726 *p++ = '\\';
5727 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005728 *p++ = hexdigits[(ch >> 12) & 0x000F];
5729 *p++ = hexdigits[(ch >> 8) & 0x000F];
5730 *p++ = hexdigits[(ch >> 4) & 0x000F];
5731 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005732 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005733
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005734 /* Map special whitespace to '\t', \n', '\r' */
5735 else if (ch == '\t') {
5736 *p++ = '\\';
5737 *p++ = 't';
5738 }
5739 else if (ch == '\n') {
5740 *p++ = '\\';
5741 *p++ = 'n';
5742 }
5743 else if (ch == '\r') {
5744 *p++ = '\\';
5745 *p++ = 'r';
5746 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005747
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005748 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005749 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005750 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005751 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005752 *p++ = hexdigits[(ch >> 4) & 0x000F];
5753 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005754 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005755
Guido van Rossumd57fd912000-03-10 22:53:23 +00005756 /* Copy everything else as-is */
5757 else
5758 *p++ = (char) ch;
5759 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005760
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005761 assert(p - PyBytes_AS_STRING(repr) > 0);
5762 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5763 return NULL;
5764 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005765}
5766
Alexander Belopolsky40018472011-02-26 01:02:56 +00005767PyObject *
5768PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005769{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005770 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005771 if (!PyUnicode_Check(unicode)) {
5772 PyErr_BadArgument();
5773 return NULL;
5774 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00005775 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5776 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005777 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005778}
5779
5780/* --- Raw Unicode Escape Codec ------------------------------------------- */
5781
Alexander Belopolsky40018472011-02-26 01:02:56 +00005782PyObject *
5783PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005784 Py_ssize_t size,
5785 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005786{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005787 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005788 Py_ssize_t startinpos;
5789 Py_ssize_t endinpos;
5790 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005791 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005792 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005793 const char *end;
5794 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005795 PyObject *errorHandler = NULL;
5796 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005797
Guido van Rossumd57fd912000-03-10 22:53:23 +00005798 /* Escaped strings will always be longer than the resulting
5799 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005800 length after conversion to the true value. (But decoding error
5801 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005802 v = _PyUnicode_New(size);
5803 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005804 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005805 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005806 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005807 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005808 end = s + size;
5809 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005810 unsigned char c;
5811 Py_UCS4 x;
5812 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005813 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005814
Benjamin Peterson29060642009-01-31 22:14:21 +00005815 /* Non-escape characters are interpreted as Unicode ordinals */
5816 if (*s != '\\') {
5817 *p++ = (unsigned char)*s++;
5818 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005819 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005820 startinpos = s-starts;
5821
5822 /* \u-escapes are only interpreted iff the number of leading
5823 backslashes if odd */
5824 bs = s;
5825 for (;s < end;) {
5826 if (*s != '\\')
5827 break;
5828 *p++ = (unsigned char)*s++;
5829 }
5830 if (((s - bs) & 1) == 0 ||
5831 s >= end ||
5832 (*s != 'u' && *s != 'U')) {
5833 continue;
5834 }
5835 p--;
5836 count = *s=='u' ? 4 : 8;
5837 s++;
5838
5839 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
5840 outpos = p-PyUnicode_AS_UNICODE(v);
5841 for (x = 0, i = 0; i < count; ++i, ++s) {
5842 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00005843 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005844 endinpos = s-starts;
5845 if (unicode_decode_call_errorhandler(
5846 errors, &errorHandler,
5847 "rawunicodeescape", "truncated \\uXXXX",
5848 &starts, &end, &startinpos, &endinpos, &exc, &s,
5849 &v, &outpos, &p))
5850 goto onError;
5851 goto nextByte;
5852 }
5853 x = (x<<4) & ~0xF;
5854 if (c >= '0' && c <= '9')
5855 x += c - '0';
5856 else if (c >= 'a' && c <= 'f')
5857 x += 10 + c - 'a';
5858 else
5859 x += 10 + c - 'A';
5860 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00005861 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00005862 /* UCS-2 character */
5863 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005864 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005865 /* UCS-4 character. Either store directly, or as
5866 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00005867#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005868 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005869#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005870 x -= 0x10000L;
5871 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
5872 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00005873#endif
5874 } else {
5875 endinpos = s-starts;
5876 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005877 if (unicode_decode_call_errorhandler(
5878 errors, &errorHandler,
5879 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00005880 &starts, &end, &startinpos, &endinpos, &exc, &s,
5881 &v, &outpos, &p))
5882 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005883 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005884 nextByte:
5885 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005886 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02005887 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005888 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005889 Py_XDECREF(errorHandler);
5890 Py_XDECREF(exc);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005891 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005892 Py_DECREF(v);
5893 return NULL;
5894 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005895 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00005896
Benjamin Peterson29060642009-01-31 22:14:21 +00005897 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005898 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005899 Py_XDECREF(errorHandler);
5900 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005901 return NULL;
5902}
5903
Alexander Belopolsky40018472011-02-26 01:02:56 +00005904PyObject *
5905PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005906 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005907{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005908 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005909 char *p;
5910 char *q;
5911
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005912#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005913 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005914#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005915 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005916#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00005917
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005918 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005919 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00005920
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005921 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005922 if (repr == NULL)
5923 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005924 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005925 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005926
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005927 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005928 while (size-- > 0) {
5929 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005930#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005931 /* Map 32-bit characters to '\Uxxxxxxxx' */
5932 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005933 *p++ = '\\';
5934 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00005935 *p++ = hexdigits[(ch >> 28) & 0xf];
5936 *p++ = hexdigits[(ch >> 24) & 0xf];
5937 *p++ = hexdigits[(ch >> 20) & 0xf];
5938 *p++ = hexdigits[(ch >> 16) & 0xf];
5939 *p++ = hexdigits[(ch >> 12) & 0xf];
5940 *p++ = hexdigits[(ch >> 8) & 0xf];
5941 *p++ = hexdigits[(ch >> 4) & 0xf];
5942 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00005943 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005944 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00005945#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005946 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5947 if (ch >= 0xD800 && ch < 0xDC00) {
5948 Py_UNICODE ch2;
5949 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005950
Benjamin Peterson29060642009-01-31 22:14:21 +00005951 ch2 = *s++;
5952 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005953 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005954 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5955 *p++ = '\\';
5956 *p++ = 'U';
5957 *p++ = hexdigits[(ucs >> 28) & 0xf];
5958 *p++ = hexdigits[(ucs >> 24) & 0xf];
5959 *p++ = hexdigits[(ucs >> 20) & 0xf];
5960 *p++ = hexdigits[(ucs >> 16) & 0xf];
5961 *p++ = hexdigits[(ucs >> 12) & 0xf];
5962 *p++ = hexdigits[(ucs >> 8) & 0xf];
5963 *p++ = hexdigits[(ucs >> 4) & 0xf];
5964 *p++ = hexdigits[ucs & 0xf];
5965 continue;
5966 }
5967 /* Fall through: isolated surrogates are copied as-is */
5968 s--;
5969 size++;
5970 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005971#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005972 /* Map 16-bit characters to '\uxxxx' */
5973 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005974 *p++ = '\\';
5975 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00005976 *p++ = hexdigits[(ch >> 12) & 0xf];
5977 *p++ = hexdigits[(ch >> 8) & 0xf];
5978 *p++ = hexdigits[(ch >> 4) & 0xf];
5979 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005980 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005981 /* Copy everything else as-is */
5982 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00005983 *p++ = (char) ch;
5984 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005985 size = p - q;
5986
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005987 assert(size > 0);
5988 if (_PyBytes_Resize(&repr, size) < 0)
5989 return NULL;
5990 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005991}
5992
Alexander Belopolsky40018472011-02-26 01:02:56 +00005993PyObject *
5994PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005995{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005996 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005997 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00005998 PyErr_BadArgument();
5999 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006000 }
Walter Dörwald711005d2007-05-12 12:03:26 +00006001 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
6002 PyUnicode_GET_SIZE(unicode));
6003
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006004 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006005}
6006
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006007/* --- Unicode Internal Codec ------------------------------------------- */
6008
Alexander Belopolsky40018472011-02-26 01:02:56 +00006009PyObject *
6010_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006011 Py_ssize_t size,
6012 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006013{
6014 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006015 Py_ssize_t startinpos;
6016 Py_ssize_t endinpos;
6017 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006018 PyUnicodeObject *v;
6019 Py_UNICODE *p;
6020 const char *end;
6021 const char *reason;
6022 PyObject *errorHandler = NULL;
6023 PyObject *exc = NULL;
6024
Neal Norwitzd43069c2006-01-08 01:12:10 +00006025#ifdef Py_UNICODE_WIDE
6026 Py_UNICODE unimax = PyUnicode_GetMax();
6027#endif
6028
Thomas Wouters89f507f2006-12-13 04:49:30 +00006029 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006030 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
6031 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006032 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006033 /* Intentionally PyUnicode_GET_SIZE instead of PyUnicode_GET_LENGTH
6034 as string was created with the old API. */
6035 if (PyUnicode_GET_SIZE(v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006036 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006037 p = PyUnicode_AS_UNICODE(v);
6038 end = s + size;
6039
6040 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006041 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006042 /* We have to sanity check the raw data, otherwise doom looms for
6043 some malformed UCS-4 data. */
6044 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006045#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006046 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006047#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006048 end-s < Py_UNICODE_SIZE
6049 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006050 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006051 startinpos = s - starts;
6052 if (end-s < Py_UNICODE_SIZE) {
6053 endinpos = end-starts;
6054 reason = "truncated input";
6055 }
6056 else {
6057 endinpos = s - starts + Py_UNICODE_SIZE;
6058 reason = "illegal code point (> 0x10FFFF)";
6059 }
6060 outpos = p - PyUnicode_AS_UNICODE(v);
6061 if (unicode_decode_call_errorhandler(
6062 errors, &errorHandler,
6063 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006064 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00006065 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006066 goto onError;
6067 }
6068 }
6069 else {
6070 p++;
6071 s += Py_UNICODE_SIZE;
6072 }
6073 }
6074
Victor Stinnerfe226c02011-10-03 03:52:20 +02006075 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006076 goto onError;
6077 Py_XDECREF(errorHandler);
6078 Py_XDECREF(exc);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006079 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006080 Py_DECREF(v);
6081 return NULL;
6082 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006083 return (PyObject *)v;
6084
Benjamin Peterson29060642009-01-31 22:14:21 +00006085 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006086 Py_XDECREF(v);
6087 Py_XDECREF(errorHandler);
6088 Py_XDECREF(exc);
6089 return NULL;
6090}
6091
Guido van Rossumd57fd912000-03-10 22:53:23 +00006092/* --- Latin-1 Codec ------------------------------------------------------ */
6093
Alexander Belopolsky40018472011-02-26 01:02:56 +00006094PyObject *
6095PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006096 Py_ssize_t size,
6097 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006098{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006099 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006100 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006101}
6102
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006103/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006104static void
6105make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006106 const char *encoding,
6107 const Py_UNICODE *unicode, Py_ssize_t size,
6108 Py_ssize_t startpos, Py_ssize_t endpos,
6109 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006110{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006111 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006112 *exceptionObject = PyUnicodeEncodeError_Create(
6113 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006114 }
6115 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006116 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6117 goto onError;
6118 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6119 goto onError;
6120 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6121 goto onError;
6122 return;
6123 onError:
6124 Py_DECREF(*exceptionObject);
6125 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006126 }
6127}
6128
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006129/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006130static void
6131raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006132 const char *encoding,
6133 const Py_UNICODE *unicode, Py_ssize_t size,
6134 Py_ssize_t startpos, Py_ssize_t endpos,
6135 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006136{
6137 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006138 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006139 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006140 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006141}
6142
6143/* error handling callback helper:
6144 build arguments, call the callback and check the arguments,
6145 put the result into newpos and return the replacement string, which
6146 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006147static PyObject *
6148unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006149 PyObject **errorHandler,
6150 const char *encoding, const char *reason,
6151 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
6152 Py_ssize_t startpos, Py_ssize_t endpos,
6153 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006154{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006155 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006156
6157 PyObject *restuple;
6158 PyObject *resunicode;
6159
6160 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006161 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006162 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006163 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006164 }
6165
6166 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006167 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006168 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006169 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006170
6171 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006172 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006173 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006174 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006175 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006176 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006177 Py_DECREF(restuple);
6178 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006179 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006180 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006181 &resunicode, newpos)) {
6182 Py_DECREF(restuple);
6183 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006184 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006185 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6186 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6187 Py_DECREF(restuple);
6188 return NULL;
6189 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006190 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006191 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006192 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006193 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6194 Py_DECREF(restuple);
6195 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006196 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006197 Py_INCREF(resunicode);
6198 Py_DECREF(restuple);
6199 return resunicode;
6200}
6201
/* Shared encoder behind the Latin-1 and ASCII codecs.

   p      -- input code units
   size   -- number of code units in p
   errors -- error handler name (NULL is treated like "strict")
   limit  -- first ordinal that can NOT be encoded; 256 selects the
             "latin-1" name and messages, anything else selects "ascii"

   Code units below `limit` are copied to the output as single bytes.
   Each run of unencodable code units is handled according to `errors`:
   "strict" raises, "replace" writes one '?' per code unit, "ignore"
   drops the run, "xmlcharrefreplace" writes "&#N;" references, and any
   other name is dispatched to the registered error callback.

   Returns a new bytes object, or NULL with an exception set. */
static PyObject *
unicode_encode_ucs1(const Py_UNICODE *p,
                    Py_ssize_t size,
                    const char *errors,
                    int limit)
{
    /* output object */
    PyObject *res;
    /* pointers to the beginning and end+1 of input */
    const Py_UNICODE *startp = p;
    const Py_UNICODE *endp = p + size;
    /* pointer to the beginning of the unencodable characters */
    /* const Py_UNICODE *badp = NULL; */
    /* pointer into the output */
    char *str;
    /* currently allocated size of the output object */
    Py_ssize_t ressize;
    const char *encoding = (limit == 256) ? "latin-1" : "ascii";
    const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    /* the following variable is used for caching string comparisons
     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
    int known_errorHandler = -1;

    /* allocate enough for a simple encoding without
       replacements, if we need more, we'll resize */
    if (size == 0)
        return PyBytes_FromStringAndSize(NULL, 0);
    res = PyBytes_FromStringAndSize(NULL, size);
    if (res == NULL)
        return NULL;
    str = PyBytes_AS_STRING(res);
    ressize = size;

    while (p<endp) {
        Py_UNICODE c = *p;

        /* can we encode this? */
        if (c<limit) {
            /* no overflow check, because we know that the space is enough */
            *str++ = (char)c;
            ++p;
        }
        else {
            /* input index of the first unencodable character of this run */
            Py_ssize_t unicodepos = p-startp;
            Py_ssize_t requiredsize;
            PyObject *repunicode;
            Py_ssize_t repsize;
            Py_ssize_t newpos;
            Py_ssize_t respos;
            Py_UNICODE *uni2;
            /* startpos for collecting unencodable chars */
            const Py_UNICODE *collstart = p;
            const Py_UNICODE *collend = p;
            /* find all unencodable characters */
            while ((collend < endp) && ((*collend)>=limit))
                ++collend;
            /* cache callback name lookup (if not done yet, i.e. it's the first error) */
            if (known_errorHandler==-1) {
                if ((errors==NULL) || (!strcmp(errors, "strict")))
                    known_errorHandler = 1;
                else if (!strcmp(errors, "replace"))
                    known_errorHandler = 2;
                else if (!strcmp(errors, "ignore"))
                    known_errorHandler = 3;
                else if (!strcmp(errors, "xmlcharrefreplace"))
                    known_errorHandler = 4;
                else
                    known_errorHandler = 0;
            }
            switch (known_errorHandler) {
            case 1: /* strict */
                raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
                goto onError;
            case 2: /* replace */
                /* one '?' per unencodable character, then continue into
                   the "ignore" case to skip past the run */
                while (collstart++<collend)
                    *str++ = '?'; /* fall through */
            case 3: /* ignore */
                p = collend;
                break;
            case 4: /* xmlcharrefreplace */
                respos = str - PyBytes_AS_STRING(res);
                /* determine replacement size (temporarily (mis)uses p);
                   each reference is "&#" + decimal digits + ";" */
                for (p = collstart, repsize = 0; p < collend; ++p) {
                    if (*p<10)
                        repsize += 2+1+1;
                    else if (*p<100)
                        repsize += 2+2+1;
                    else if (*p<1000)
                        repsize += 2+3+1;
                    else if (*p<10000)
                        repsize += 2+4+1;
#ifndef Py_UNICODE_WIDE
                    else
                        repsize += 2+5+1;
#else
                    else if (*p<100000)
                        repsize += 2+5+1;
                    else if (*p<1000000)
                        repsize += 2+6+1;
                    else
                        repsize += 2+7+1;
#endif
                }
                /* what is already written + the references + one byte for
                   every input character after the run */
                requiredsize = respos+repsize+(endp-collend);
                if (requiredsize > ressize) {
                    /* grow to at least twice the current size */
                    if (requiredsize<2*ressize)
                        requiredsize = 2*ressize;
                    if (_PyBytes_Resize(&res, requiredsize))
                        goto onError;
                    /* re-derive the write pointer from the saved offset
                       after the resize */
                    str = PyBytes_AS_STRING(res) + respos;
                    ressize = requiredsize;
                }
                /* generate replacement (temporarily (mis)uses p) */
                for (p = collstart; p < collend; ++p) {
                    str += sprintf(str, "&#%d;", (int)*p);
                }
                p = collend;
                break;
            default:
                /* any other handler name: call the registered callback */
                repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
                                                              encoding, reason, startp, size, &exc,
                                                              collstart-startp, collend-startp, &newpos);
                if (repunicode == NULL)
                    goto onError;
                if (PyBytes_Check(repunicode)) {
                    /* Directly copy bytes result to output. */
                    repsize = PyBytes_Size(repunicode);
                    if (repsize > 1) {
                        /* Make room for all additional bytes. */
                        respos = str - PyBytes_AS_STRING(res);
                        if (_PyBytes_Resize(&res, ressize+repsize-1)) {
                            Py_DECREF(repunicode);
                            goto onError;
                        }
                        str = PyBytes_AS_STRING(res) + respos;
                        ressize += repsize-1;
                    }
                    memcpy(str, PyBytes_AsString(repunicode), repsize);
                    str += repsize;
                    /* resume at the input position chosen by the handler */
                    p = startp + newpos;
                    Py_DECREF(repunicode);
                    break;
                }
                /* need more space? (at least enough for what we
                   have+the replacement+the rest of the string, so
                   we won't have to check space for encodable characters) */
                respos = str - PyBytes_AS_STRING(res);
                repsize = PyUnicode_GET_SIZE(repunicode);
                requiredsize = respos+repsize+(endp-collend);
                if (requiredsize > ressize) {
                    if (requiredsize<2*ressize)
                        requiredsize = 2*ressize;
                    if (_PyBytes_Resize(&res, requiredsize)) {
                        Py_DECREF(repunicode);
                        goto onError;
                    }
                    str = PyBytes_AS_STRING(res) + respos;
                    ressize = requiredsize;
                }
                /* check if there is anything unencodable in the replacement
                   and copy it to the output */
                for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
                    c = *uni2;
                    if (c >= limit) {
                        /* the error is reported against the original
                           input position, not the replacement text */
                        raise_encode_exception(&exc, encoding, startp, size,
                                               unicodepos, unicodepos+1, reason);
                        Py_DECREF(repunicode);
                        goto onError;
                    }
                    *str = (char)c;
                }
                /* resume at the input position chosen by the handler */
                p = startp + newpos;
                Py_DECREF(repunicode);
            }
        }
    }
    /* Resize if we allocated too much */
    size = str - PyBytes_AS_STRING(res);
    if (size < ressize) { /* If this fails res will be NULL */
        assert(size >= 0);
        if (_PyBytes_Resize(&res, size) < 0)
            goto onError;
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return res;

  onError:
    Py_XDECREF(res);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return NULL;
}
6398
Alexander Belopolsky40018472011-02-26 01:02:56 +00006399PyObject *
6400PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006401 Py_ssize_t size,
6402 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006403{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006404 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006405}
6406
Alexander Belopolsky40018472011-02-26 01:02:56 +00006407PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006408_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006409{
6410 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006411 PyErr_BadArgument();
6412 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006413 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006414 if (PyUnicode_READY(unicode) == -1)
6415 return NULL;
6416 /* Fast path: if it is a one-byte string, construct
6417 bytes object directly. */
6418 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6419 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6420 PyUnicode_GET_LENGTH(unicode));
6421 /* Non-Latin-1 characters present. Defer to above function to
6422 raise the exception. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006423 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006424 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006425 errors);
6426}
6427
6428PyObject*
6429PyUnicode_AsLatin1String(PyObject *unicode)
6430{
6431 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006432}
6433
6434/* --- 7-bit ASCII Codec -------------------------------------------------- */
6435
Alexander Belopolsky40018472011-02-26 01:02:56 +00006436PyObject *
6437PyUnicode_DecodeASCII(const char *s,
6438 Py_ssize_t size,
6439 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006440{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006441 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006442 PyUnicodeObject *v;
6443 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006444 Py_ssize_t startinpos;
6445 Py_ssize_t endinpos;
6446 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006447 const char *e;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006448 unsigned char* d;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006449 PyObject *errorHandler = NULL;
6450 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006451 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00006452
Guido van Rossumd57fd912000-03-10 22:53:23 +00006453 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006454 if (size == 1 && *(unsigned char*)s < 128)
6455 return PyUnicode_FromOrdinal(*(unsigned char*)s);
6456
6457 /* Fast path. Assume the input actually *is* ASCII, and allocate
6458 a single-block Unicode object with that assumption. If there is
6459 an error, drop the object and start over. */
6460 v = (PyUnicodeObject*)PyUnicode_New(size, 127);
6461 if (v == NULL)
6462 goto onError;
6463 d = PyUnicode_1BYTE_DATA(v);
6464 for (i = 0; i < size; i++) {
6465 unsigned char ch = ((unsigned char*)s)[i];
6466 if (ch < 128)
6467 d[i] = ch;
6468 else
6469 break;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006470 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006471 if (i == size)
6472 return (PyObject*)v;
6473 Py_DECREF(v); /* start over */
Tim Petersced69f82003-09-16 20:30:58 +00006474
Guido van Rossumd57fd912000-03-10 22:53:23 +00006475 v = _PyUnicode_New(size);
6476 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006477 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006478 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006479 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006480 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006481 e = s + size;
6482 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006483 register unsigned char c = (unsigned char)*s;
6484 if (c < 128) {
6485 *p++ = c;
6486 ++s;
6487 }
6488 else {
6489 startinpos = s-starts;
6490 endinpos = startinpos + 1;
6491 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
6492 if (unicode_decode_call_errorhandler(
6493 errors, &errorHandler,
6494 "ascii", "ordinal not in range(128)",
6495 &starts, &e, &startinpos, &endinpos, &exc, &s,
6496 &v, &outpos, &p))
6497 goto onError;
6498 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006499 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00006500 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Victor Stinnerfe226c02011-10-03 03:52:20 +02006501 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006502 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006503 Py_XDECREF(errorHandler);
6504 Py_XDECREF(exc);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006505 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006506 Py_DECREF(v);
6507 return NULL;
6508 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006509 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006510
Benjamin Peterson29060642009-01-31 22:14:21 +00006511 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006512 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006513 Py_XDECREF(errorHandler);
6514 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006515 return NULL;
6516}
6517
Alexander Belopolsky40018472011-02-26 01:02:56 +00006518PyObject *
6519PyUnicode_EncodeASCII(const Py_UNICODE *p,
6520 Py_ssize_t size,
6521 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006522{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006523 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006524}
6525
Alexander Belopolsky40018472011-02-26 01:02:56 +00006526PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006527_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006528{
6529 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006530 PyErr_BadArgument();
6531 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006532 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006533 if (PyUnicode_READY(unicode) == -1)
6534 return NULL;
6535 /* Fast path: if it is an ASCII-only string, construct bytes object
6536 directly. Else defer to above function to raise the exception. */
6537 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6538 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6539 PyUnicode_GET_LENGTH(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006540 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006541 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006542 errors);
6543}
6544
6545PyObject *
6546PyUnicode_AsASCIIString(PyObject *unicode)
6547{
6548 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006549}
6550
Victor Stinner99b95382011-07-04 14:23:54 +02006551#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006552
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006553/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006554
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006555#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006556#define NEED_RETRY
6557#endif
6558
6559/* XXX This code is limited to "true" double-byte encodings, as
6560 a) it assumes an incomplete character consists of a single byte, and
6561 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00006562 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006563
/* Return nonzero when the byte at s[offset] should be treated as a DBCS
   lead byte, 0 otherwise.

   A byte that IsDBCSLeadByte() accepts is only reported as a lead byte
   if, going by the character that CharPrev() finds before it, it does
   not look like the trail byte of that preceding character. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *here = s + offset;
    const char *before;

    if (!IsDBCSLeadByte(*here))
        return 0;

    before = CharPrev(s, here);
    return (before == here) || !IsDBCSLeadByte(*before) || (here - before == 2);
}
6575
6576/*
6577 * Decode MBCS string into unicode object. If 'final' is set, converts
6578 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
6579 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006580static int
6581decode_mbcs(PyUnicodeObject **v,
6582 const char *s, /* MBCS string */
6583 int size, /* sizeof MBCS string */
6584 int final,
6585 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006586{
6587 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00006588 Py_ssize_t n;
6589 DWORD usize;
6590 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006591
6592 assert(size >= 0);
6593
Victor Stinner554f3f02010-06-16 23:33:54 +00006594 /* check and handle 'errors' arg */
6595 if (errors==NULL || strcmp(errors, "strict")==0)
6596 flags = MB_ERR_INVALID_CHARS;
6597 else if (strcmp(errors, "ignore")==0)
6598 flags = 0;
6599 else {
6600 PyErr_Format(PyExc_ValueError,
6601 "mbcs encoding does not support errors='%s'",
6602 errors);
6603 return -1;
6604 }
6605
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006606 /* Skip trailing lead-byte unless 'final' is set */
6607 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006608 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006609
6610 /* First get the size of the result */
6611 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006612 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
6613 if (usize==0)
6614 goto mbcs_decode_error;
6615 } else
6616 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006617
6618 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006619 /* Create unicode object */
6620 *v = _PyUnicode_New(usize);
6621 if (*v == NULL)
6622 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006623 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006624 }
6625 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006626 /* Extend unicode object */
6627 n = PyUnicode_GET_SIZE(*v);
Victor Stinner2fd82272011-10-03 04:06:05 +02006628 if (PyUnicode_Resize((PyObject**)v, n + usize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006629 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006630 }
6631
6632 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00006633 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006634 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006635 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
6636 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00006637 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006638 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006639 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00006640
6641mbcs_decode_error:
6642 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
6643 we raise a UnicodeDecodeError - else it is a 'generic'
6644 windows error
6645 */
6646 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
6647 /* Ideally, we should get reason from FormatMessage - this
6648 is the Windows 2000 English version of the message
6649 */
6650 PyObject *exc = NULL;
6651 const char *reason = "No mapping for the Unicode character exists "
6652 "in the target multi-byte code page.";
6653 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
6654 if (exc != NULL) {
6655 PyCodec_StrictErrors(exc);
6656 Py_DECREF(exc);
6657 }
6658 } else {
6659 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6660 }
6661 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006662}
6663
Alexander Belopolsky40018472011-02-26 01:02:56 +00006664PyObject *
6665PyUnicode_DecodeMBCSStateful(const char *s,
6666 Py_ssize_t size,
6667 const char *errors,
6668 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006669{
6670 PyUnicodeObject *v = NULL;
6671 int done;
6672
6673 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006674 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006675
6676#ifdef NEED_RETRY
6677 retry:
6678 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006679 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006680 else
6681#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006682 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006683
6684 if (done < 0) {
6685 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006686 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006687 }
6688
6689 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006690 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006691
6692#ifdef NEED_RETRY
6693 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006694 s += done;
6695 size -= done;
6696 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006697 }
6698#endif
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006699 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006700 Py_DECREF(v);
6701 return NULL;
6702 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006703 return (PyObject *)v;
6704}
6705
Alexander Belopolsky40018472011-02-26 01:02:56 +00006706PyObject *
6707PyUnicode_DecodeMBCS(const char *s,
6708 Py_ssize_t size,
6709 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006710{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006711 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
6712}
6713
6714/*
6715 * Convert unicode into string object (MBCS).
6716 * Returns 0 if succeed, -1 otherwise.
6717 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006718static int
6719encode_mbcs(PyObject **repr,
6720 const Py_UNICODE *p, /* unicode */
6721 int size, /* size of unicode */
6722 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006723{
Victor Stinner554f3f02010-06-16 23:33:54 +00006724 BOOL usedDefaultChar = FALSE;
6725 BOOL *pusedDefaultChar;
6726 int mbcssize;
6727 Py_ssize_t n;
6728 PyObject *exc = NULL;
6729 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006730
6731 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006732
Victor Stinner554f3f02010-06-16 23:33:54 +00006733 /* check and handle 'errors' arg */
6734 if (errors==NULL || strcmp(errors, "strict")==0) {
6735 flags = WC_NO_BEST_FIT_CHARS;
6736 pusedDefaultChar = &usedDefaultChar;
6737 } else if (strcmp(errors, "replace")==0) {
6738 flags = 0;
6739 pusedDefaultChar = NULL;
6740 } else {
6741 PyErr_Format(PyExc_ValueError,
6742 "mbcs encoding does not support errors='%s'",
6743 errors);
6744 return -1;
6745 }
6746
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006747 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006748 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006749 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
6750 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00006751 if (mbcssize == 0) {
6752 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6753 return -1;
6754 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006755 /* If we used a default char, then we failed! */
6756 if (pusedDefaultChar && *pusedDefaultChar)
6757 goto mbcs_encode_error;
6758 } else {
6759 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006760 }
6761
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006762 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006763 /* Create string object */
6764 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
6765 if (*repr == NULL)
6766 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006767 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006768 }
6769 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006770 /* Extend string object */
6771 n = PyBytes_Size(*repr);
6772 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
6773 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006774 }
6775
6776 /* Do the conversion */
6777 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006778 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006779 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
6780 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006781 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6782 return -1;
6783 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006784 if (pusedDefaultChar && *pusedDefaultChar)
6785 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006786 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006787 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00006788
6789mbcs_encode_error:
6790 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
6791 Py_XDECREF(exc);
6792 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006793}
6794
Alexander Belopolsky40018472011-02-26 01:02:56 +00006795PyObject *
6796PyUnicode_EncodeMBCS(const Py_UNICODE *p,
6797 Py_ssize_t size,
6798 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006799{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006800 PyObject *repr = NULL;
6801 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00006802
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006803#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00006804 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006805 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006806 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006807 else
6808#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006809 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006810
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006811 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006812 Py_XDECREF(repr);
6813 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006814 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006815
6816#ifdef NEED_RETRY
6817 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006818 p += INT_MAX;
6819 size -= INT_MAX;
6820 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006821 }
6822#endif
6823
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006824 return repr;
6825}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006826
Alexander Belopolsky40018472011-02-26 01:02:56 +00006827PyObject *
6828PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006829{
6830 if (!PyUnicode_Check(unicode)) {
6831 PyErr_BadArgument();
6832 return NULL;
6833 }
6834 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006835 PyUnicode_GET_SIZE(unicode),
6836 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006837}
6838
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006839#undef NEED_RETRY
6840
Victor Stinner99b95382011-07-04 14:23:54 +02006841#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006842
Guido van Rossumd57fd912000-03-10 22:53:23 +00006843/* --- Character Mapping Codec -------------------------------------------- */
6844
Alexander Belopolsky40018472011-02-26 01:02:56 +00006845PyObject *
6846PyUnicode_DecodeCharmap(const char *s,
6847 Py_ssize_t size,
6848 PyObject *mapping,
6849 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006850{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006851 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006852 Py_ssize_t startinpos;
6853 Py_ssize_t endinpos;
6854 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006855 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006856 PyUnicodeObject *v;
6857 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006858 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006859 PyObject *errorHandler = NULL;
6860 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006861 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006862 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006863
Guido van Rossumd57fd912000-03-10 22:53:23 +00006864 /* Default to Latin-1 */
6865 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006866 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006867
6868 v = _PyUnicode_New(size);
6869 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006870 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006871 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006872 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006873 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006874 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006875 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006876 mapstring = PyUnicode_AS_UNICODE(mapping);
6877 maplen = PyUnicode_GET_SIZE(mapping);
6878 while (s < e) {
6879 unsigned char ch = *s;
6880 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006881
Benjamin Peterson29060642009-01-31 22:14:21 +00006882 if (ch < maplen)
6883 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006884
Benjamin Peterson29060642009-01-31 22:14:21 +00006885 if (x == 0xfffe) {
6886 /* undefined mapping */
6887 outpos = p-PyUnicode_AS_UNICODE(v);
6888 startinpos = s-starts;
6889 endinpos = startinpos+1;
6890 if (unicode_decode_call_errorhandler(
6891 errors, &errorHandler,
6892 "charmap", "character maps to <undefined>",
6893 &starts, &e, &startinpos, &endinpos, &exc, &s,
6894 &v, &outpos, &p)) {
6895 goto onError;
6896 }
6897 continue;
6898 }
6899 *p++ = x;
6900 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006901 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006902 }
6903 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006904 while (s < e) {
6905 unsigned char ch = *s;
6906 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006907
Benjamin Peterson29060642009-01-31 22:14:21 +00006908 /* Get mapping (char ordinal -> integer, Unicode char or None) */
6909 w = PyLong_FromLong((long)ch);
6910 if (w == NULL)
6911 goto onError;
6912 x = PyObject_GetItem(mapping, w);
6913 Py_DECREF(w);
6914 if (x == NULL) {
6915 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
6916 /* No mapping found means: mapping is undefined. */
6917 PyErr_Clear();
6918 x = Py_None;
6919 Py_INCREF(x);
6920 } else
6921 goto onError;
6922 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006923
Benjamin Peterson29060642009-01-31 22:14:21 +00006924 /* Apply mapping */
6925 if (PyLong_Check(x)) {
6926 long value = PyLong_AS_LONG(x);
6927 if (value < 0 || value > 65535) {
6928 PyErr_SetString(PyExc_TypeError,
6929 "character mapping must be in range(65536)");
6930 Py_DECREF(x);
6931 goto onError;
6932 }
6933 *p++ = (Py_UNICODE)value;
6934 }
6935 else if (x == Py_None) {
6936 /* undefined mapping */
6937 outpos = p-PyUnicode_AS_UNICODE(v);
6938 startinpos = s-starts;
6939 endinpos = startinpos+1;
6940 if (unicode_decode_call_errorhandler(
6941 errors, &errorHandler,
6942 "charmap", "character maps to <undefined>",
6943 &starts, &e, &startinpos, &endinpos, &exc, &s,
6944 &v, &outpos, &p)) {
6945 Py_DECREF(x);
6946 goto onError;
6947 }
6948 Py_DECREF(x);
6949 continue;
6950 }
6951 else if (PyUnicode_Check(x)) {
6952 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006953
Benjamin Peterson29060642009-01-31 22:14:21 +00006954 if (targetsize == 1)
6955 /* 1-1 mapping */
6956 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006957
Benjamin Peterson29060642009-01-31 22:14:21 +00006958 else if (targetsize > 1) {
6959 /* 1-n mapping */
6960 if (targetsize > extrachars) {
6961 /* resize first */
6962 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
6963 Py_ssize_t needed = (targetsize - extrachars) + \
6964 (targetsize << 2);
6965 extrachars += needed;
6966 /* XXX overflow detection missing */
Victor Stinnerfe226c02011-10-03 03:52:20 +02006967 if (PyUnicode_Resize((PyObject**)&v,
Benjamin Peterson29060642009-01-31 22:14:21 +00006968 PyUnicode_GET_SIZE(v) + needed) < 0) {
6969 Py_DECREF(x);
6970 goto onError;
6971 }
6972 p = PyUnicode_AS_UNICODE(v) + oldpos;
6973 }
6974 Py_UNICODE_COPY(p,
6975 PyUnicode_AS_UNICODE(x),
6976 targetsize);
6977 p += targetsize;
6978 extrachars -= targetsize;
6979 }
6980 /* 1-0 mapping: skip the character */
6981 }
6982 else {
6983 /* wrong return value */
6984 PyErr_SetString(PyExc_TypeError,
6985 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006986 Py_DECREF(x);
6987 goto onError;
6988 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006989 Py_DECREF(x);
6990 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006991 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006992 }
6993 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Victor Stinnerfe226c02011-10-03 03:52:20 +02006994 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006995 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006996 Py_XDECREF(errorHandler);
6997 Py_XDECREF(exc);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006998 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006999 Py_DECREF(v);
7000 return NULL;
7001 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007002 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00007003
Benjamin Peterson29060642009-01-31 22:14:21 +00007004 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007005 Py_XDECREF(errorHandler);
7006 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007007 Py_XDECREF(v);
7008 return NULL;
7009}
7010
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007011/* Charmap encoding: the lookup table */
7012
Alexander Belopolsky40018472011-02-26 01:02:56 +00007013struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007014 PyObject_HEAD
7015 unsigned char level1[32];
7016 int count2, count3;
7017 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007018};
7019
7020static PyObject*
7021encoding_map_size(PyObject *obj, PyObject* args)
7022{
7023 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007024 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007025 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007026}
7027
7028static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007029 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007030 PyDoc_STR("Return the size (in bytes) of this object") },
7031 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007032};
7033
7034static void
7035encoding_map_dealloc(PyObject* o)
7036{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007037 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007038}
7039
7040static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007041 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007042 "EncodingMap", /*tp_name*/
7043 sizeof(struct encoding_map), /*tp_basicsize*/
7044 0, /*tp_itemsize*/
7045 /* methods */
7046 encoding_map_dealloc, /*tp_dealloc*/
7047 0, /*tp_print*/
7048 0, /*tp_getattr*/
7049 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007050 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007051 0, /*tp_repr*/
7052 0, /*tp_as_number*/
7053 0, /*tp_as_sequence*/
7054 0, /*tp_as_mapping*/
7055 0, /*tp_hash*/
7056 0, /*tp_call*/
7057 0, /*tp_str*/
7058 0, /*tp_getattro*/
7059 0, /*tp_setattro*/
7060 0, /*tp_as_buffer*/
7061 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7062 0, /*tp_doc*/
7063 0, /*tp_traverse*/
7064 0, /*tp_clear*/
7065 0, /*tp_richcompare*/
7066 0, /*tp_weaklistoffset*/
7067 0, /*tp_iter*/
7068 0, /*tp_iternext*/
7069 encoding_map_methods, /*tp_methods*/
7070 0, /*tp_members*/
7071 0, /*tp_getset*/
7072 0, /*tp_base*/
7073 0, /*tp_dict*/
7074 0, /*tp_descr_get*/
7075 0, /*tp_descr_set*/
7076 0, /*tp_dictoffset*/
7077 0, /*tp_init*/
7078 0, /*tp_alloc*/
7079 0, /*tp_new*/
7080 0, /*tp_free*/
7081 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007082};
7083
7084PyObject*
7085PyUnicode_BuildEncodingMap(PyObject* string)
7086{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007087 PyObject *result;
7088 struct encoding_map *mresult;
7089 int i;
7090 int need_dict = 0;
7091 unsigned char level1[32];
7092 unsigned char level2[512];
7093 unsigned char *mlevel1, *mlevel2, *mlevel3;
7094 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007095 int kind;
7096 void *data;
7097 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007098
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007099 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007100 PyErr_BadArgument();
7101 return NULL;
7102 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007103 kind = PyUnicode_KIND(string);
7104 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007105 memset(level1, 0xFF, sizeof level1);
7106 memset(level2, 0xFF, sizeof level2);
7107
7108 /* If there isn't a one-to-one mapping of NULL to \0,
7109 or if there are non-BMP characters, we need to use
7110 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007111 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007112 need_dict = 1;
7113 for (i = 1; i < 256; i++) {
7114 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007115 ch = PyUnicode_READ(kind, data, i);
7116 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007117 need_dict = 1;
7118 break;
7119 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007120 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007121 /* unmapped character */
7122 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007123 l1 = ch >> 11;
7124 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007125 if (level1[l1] == 0xFF)
7126 level1[l1] = count2++;
7127 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007128 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007129 }
7130
7131 if (count2 >= 0xFF || count3 >= 0xFF)
7132 need_dict = 1;
7133
7134 if (need_dict) {
7135 PyObject *result = PyDict_New();
7136 PyObject *key, *value;
7137 if (!result)
7138 return NULL;
7139 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007140 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007141 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007142 if (!key || !value)
7143 goto failed1;
7144 if (PyDict_SetItem(result, key, value) == -1)
7145 goto failed1;
7146 Py_DECREF(key);
7147 Py_DECREF(value);
7148 }
7149 return result;
7150 failed1:
7151 Py_XDECREF(key);
7152 Py_XDECREF(value);
7153 Py_DECREF(result);
7154 return NULL;
7155 }
7156
7157 /* Create a three-level trie */
7158 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7159 16*count2 + 128*count3 - 1);
7160 if (!result)
7161 return PyErr_NoMemory();
7162 PyObject_Init(result, &EncodingMapType);
7163 mresult = (struct encoding_map*)result;
7164 mresult->count2 = count2;
7165 mresult->count3 = count3;
7166 mlevel1 = mresult->level1;
7167 mlevel2 = mresult->level23;
7168 mlevel3 = mresult->level23 + 16*count2;
7169 memcpy(mlevel1, level1, 32);
7170 memset(mlevel2, 0xFF, 16*count2);
7171 memset(mlevel3, 0, 128*count3);
7172 count3 = 0;
7173 for (i = 1; i < 256; i++) {
7174 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007175 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007176 /* unmapped character */
7177 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007178 o1 = PyUnicode_READ(kind, data, i)>>11;
7179 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007180 i2 = 16*mlevel1[o1] + o2;
7181 if (mlevel2[i2] == 0xFF)
7182 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007183 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007184 i3 = 128*mlevel2[i2] + o3;
7185 mlevel3[i3] = i;
7186 }
7187 return result;
7188}
7189
7190static int
7191encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
7192{
7193 struct encoding_map *map = (struct encoding_map*)mapping;
7194 int l1 = c>>11;
7195 int l2 = (c>>7) & 0xF;
7196 int l3 = c & 0x7F;
7197 int i;
7198
7199#ifdef Py_UNICODE_WIDE
7200 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007201 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007202 }
7203#endif
7204 if (c == 0)
7205 return 0;
7206 /* level 1*/
7207 i = map->level1[l1];
7208 if (i == 0xFF) {
7209 return -1;
7210 }
7211 /* level 2*/
7212 i = map->level23[16*i+l2];
7213 if (i == 0xFF) {
7214 return -1;
7215 }
7216 /* level 3 */
7217 i = map->level23[16*map->count2 + 128*i + l3];
7218 if (i == 0) {
7219 return -1;
7220 }
7221 return i;
7222}
7223
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007224/* Lookup the character ch in the mapping. If the character
7225 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007226 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007227static PyObject *
7228charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007229{
Christian Heimes217cfd12007-12-02 14:31:20 +00007230 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007231 PyObject *x;
7232
7233 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007234 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007235 x = PyObject_GetItem(mapping, w);
7236 Py_DECREF(w);
7237 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007238 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7239 /* No mapping found means: mapping is undefined. */
7240 PyErr_Clear();
7241 x = Py_None;
7242 Py_INCREF(x);
7243 return x;
7244 } else
7245 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007246 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007247 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007248 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007249 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007250 long value = PyLong_AS_LONG(x);
7251 if (value < 0 || value > 255) {
7252 PyErr_SetString(PyExc_TypeError,
7253 "character mapping must be in range(256)");
7254 Py_DECREF(x);
7255 return NULL;
7256 }
7257 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007258 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007259 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007260 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007261 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007262 /* wrong return value */
7263 PyErr_Format(PyExc_TypeError,
7264 "character mapping must return integer, bytes or None, not %.400s",
7265 x->ob_type->tp_name);
7266 Py_DECREF(x);
7267 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007268 }
7269}
7270
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007271static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00007272charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007273{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007274 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7275 /* exponentially overallocate to minimize reallocations */
7276 if (requiredsize < 2*outsize)
7277 requiredsize = 2*outsize;
7278 if (_PyBytes_Resize(outobj, requiredsize))
7279 return -1;
7280 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007281}
7282
/* Outcome of charmapencode_output() */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the character has no mapping */
    enc_EXCEPTION   /* an exception was raised */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007286/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00007287 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007288 space is available. Return a new reference to the object that
7289 was put in the output buffer, or Py_None, if the mapping was undefined
7290 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00007291 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007292static charmapencode_result
7293charmapencode_output(Py_UNICODE c, PyObject *mapping,
7294 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007295{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007296 PyObject *rep;
7297 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00007298 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007299
Christian Heimes90aa7642007-12-19 02:45:37 +00007300 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007301 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007302 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007303 if (res == -1)
7304 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00007305 if (outsize<requiredsize)
7306 if (charmapencode_resize(outobj, outpos, requiredsize))
7307 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00007308 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007309 outstart[(*outpos)++] = (char)res;
7310 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007311 }
7312
7313 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007314 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007315 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007316 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007317 Py_DECREF(rep);
7318 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007319 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007320 if (PyLong_Check(rep)) {
7321 Py_ssize_t requiredsize = *outpos+1;
7322 if (outsize<requiredsize)
7323 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7324 Py_DECREF(rep);
7325 return enc_EXCEPTION;
7326 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007327 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007328 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007329 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007330 else {
7331 const char *repchars = PyBytes_AS_STRING(rep);
7332 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
7333 Py_ssize_t requiredsize = *outpos+repsize;
7334 if (outsize<requiredsize)
7335 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7336 Py_DECREF(rep);
7337 return enc_EXCEPTION;
7338 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007339 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007340 memcpy(outstart + *outpos, repchars, repsize);
7341 *outpos += repsize;
7342 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007343 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007344 Py_DECREF(rep);
7345 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007346}
7347
7348/* handle an error in PyUnicode_EncodeCharmap
7349 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007350static int
7351charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00007352 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007353 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00007354 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00007355 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007356{
7357 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007358 Py_ssize_t repsize;
7359 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007360 Py_UNICODE *uni2;
7361 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007362 Py_ssize_t collstartpos = *inpos;
7363 Py_ssize_t collendpos = *inpos+1;
7364 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007365 char *encoding = "charmap";
7366 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007367 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007368
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007369 /* find all unencodable characters */
7370 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007371 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00007372 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007373 int res = encoding_map_lookup(p[collendpos], mapping);
7374 if (res != -1)
7375 break;
7376 ++collendpos;
7377 continue;
7378 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007379
Benjamin Peterson29060642009-01-31 22:14:21 +00007380 rep = charmapencode_lookup(p[collendpos], mapping);
7381 if (rep==NULL)
7382 return -1;
7383 else if (rep!=Py_None) {
7384 Py_DECREF(rep);
7385 break;
7386 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007387 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00007388 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007389 }
7390 /* cache callback name lookup
7391 * (if not done yet, i.e. it's the first error) */
7392 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007393 if ((errors==NULL) || (!strcmp(errors, "strict")))
7394 *known_errorHandler = 1;
7395 else if (!strcmp(errors, "replace"))
7396 *known_errorHandler = 2;
7397 else if (!strcmp(errors, "ignore"))
7398 *known_errorHandler = 3;
7399 else if (!strcmp(errors, "xmlcharrefreplace"))
7400 *known_errorHandler = 4;
7401 else
7402 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007403 }
7404 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007405 case 1: /* strict */
7406 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7407 return -1;
7408 case 2: /* replace */
7409 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007410 x = charmapencode_output('?', mapping, res, respos);
7411 if (x==enc_EXCEPTION) {
7412 return -1;
7413 }
7414 else if (x==enc_FAILED) {
7415 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7416 return -1;
7417 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007418 }
7419 /* fall through */
7420 case 3: /* ignore */
7421 *inpos = collendpos;
7422 break;
7423 case 4: /* xmlcharrefreplace */
7424 /* generate replacement (temporarily (mis)uses p) */
7425 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007426 char buffer[2+29+1+1];
7427 char *cp;
7428 sprintf(buffer, "&#%d;", (int)p[collpos]);
7429 for (cp = buffer; *cp; ++cp) {
7430 x = charmapencode_output(*cp, mapping, res, respos);
7431 if (x==enc_EXCEPTION)
7432 return -1;
7433 else if (x==enc_FAILED) {
7434 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7435 return -1;
7436 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007437 }
7438 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007439 *inpos = collendpos;
7440 break;
7441 default:
7442 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00007443 encoding, reason, p, size, exceptionObject,
7444 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007445 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007446 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00007447 if (PyBytes_Check(repunicode)) {
7448 /* Directly copy bytes result to output. */
7449 Py_ssize_t outsize = PyBytes_Size(*res);
7450 Py_ssize_t requiredsize;
7451 repsize = PyBytes_Size(repunicode);
7452 requiredsize = *respos + repsize;
7453 if (requiredsize > outsize)
7454 /* Make room for all additional bytes. */
7455 if (charmapencode_resize(res, respos, requiredsize)) {
7456 Py_DECREF(repunicode);
7457 return -1;
7458 }
7459 memcpy(PyBytes_AsString(*res) + *respos,
7460 PyBytes_AsString(repunicode), repsize);
7461 *respos += repsize;
7462 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007463 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00007464 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007465 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007466 /* generate replacement */
7467 repsize = PyUnicode_GET_SIZE(repunicode);
7468 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007469 x = charmapencode_output(*uni2, mapping, res, respos);
7470 if (x==enc_EXCEPTION) {
7471 return -1;
7472 }
7473 else if (x==enc_FAILED) {
7474 Py_DECREF(repunicode);
7475 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7476 return -1;
7477 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007478 }
7479 *inpos = newpos;
7480 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007481 }
7482 return 0;
7483}
7484
Alexander Belopolsky40018472011-02-26 01:02:56 +00007485PyObject *
7486PyUnicode_EncodeCharmap(const Py_UNICODE *p,
7487 Py_ssize_t size,
7488 PyObject *mapping,
7489 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007490{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007491 /* output object */
7492 PyObject *res = NULL;
7493 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007494 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007495 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007496 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007497 PyObject *errorHandler = NULL;
7498 PyObject *exc = NULL;
7499 /* the following variable is used for caching string comparisons
7500 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7501 * 3=ignore, 4=xmlcharrefreplace */
7502 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007503
7504 /* Default to Latin-1 */
7505 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007506 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007507
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007508 /* allocate enough for a simple encoding without
7509 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00007510 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007511 if (res == NULL)
7512 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00007513 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007514 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007515
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007516 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007517 /* try to encode it */
7518 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
7519 if (x==enc_EXCEPTION) /* error */
7520 goto onError;
7521 if (x==enc_FAILED) { /* unencodable character */
7522 if (charmap_encoding_error(p, size, &inpos, mapping,
7523 &exc,
7524 &known_errorHandler, &errorHandler, errors,
7525 &res, &respos)) {
7526 goto onError;
7527 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007528 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007529 else
7530 /* done with this character => adjust input position */
7531 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007532 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007533
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007534 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00007535 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007536 if (_PyBytes_Resize(&res, respos) < 0)
7537 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00007538
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007539 Py_XDECREF(exc);
7540 Py_XDECREF(errorHandler);
7541 return res;
7542
Benjamin Peterson29060642009-01-31 22:14:21 +00007543 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007544 Py_XDECREF(res);
7545 Py_XDECREF(exc);
7546 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007547 return NULL;
7548}
7549
Alexander Belopolsky40018472011-02-26 01:02:56 +00007550PyObject *
7551PyUnicode_AsCharmapString(PyObject *unicode,
7552 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007553{
7554 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007555 PyErr_BadArgument();
7556 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007557 }
7558 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00007559 PyUnicode_GET_SIZE(unicode),
7560 mapping,
7561 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007562}
7563
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007564/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007565static void
7566make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007567 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007568 Py_ssize_t startpos, Py_ssize_t endpos,
7569 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007570{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007571 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007572 *exceptionObject = _PyUnicodeTranslateError_Create(
7573 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007574 }
7575 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007576 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
7577 goto onError;
7578 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
7579 goto onError;
7580 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
7581 goto onError;
7582 return;
7583 onError:
7584 Py_DECREF(*exceptionObject);
7585 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007586 }
7587}
7588
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007589/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007590static void
7591raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007592 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007593 Py_ssize_t startpos, Py_ssize_t endpos,
7594 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007595{
7596 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007597 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007598 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007599 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007600}
7601
7602/* error handling callback helper:
7603 build arguments, call the callback and check the arguments,
7604 put the result into newpos and return the replacement string, which
7605 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007606static PyObject *
7607unicode_translate_call_errorhandler(const char *errors,
7608 PyObject **errorHandler,
7609 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007610 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007611 Py_ssize_t startpos, Py_ssize_t endpos,
7612 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007613{
Benjamin Peterson142957c2008-07-04 19:55:29 +00007614 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007615
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007616 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007617 PyObject *restuple;
7618 PyObject *resunicode;
7619
7620 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007621 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007622 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007623 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007624 }
7625
7626 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007627 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007628 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007629 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007630
7631 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00007632 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007633 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007634 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007635 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00007636 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00007637 Py_DECREF(restuple);
7638 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007639 }
7640 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00007641 &resunicode, &i_newpos)) {
7642 Py_DECREF(restuple);
7643 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007644 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007645 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007646 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007647 else
7648 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007649 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007650 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7651 Py_DECREF(restuple);
7652 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00007653 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007654 Py_INCREF(resunicode);
7655 Py_DECREF(restuple);
7656 return resunicode;
7657}
7658
7659/* Lookup the character ch in the mapping and put the result in result,
7660 which must be decrefed by the caller.
7661 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007662static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007663charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007664{
Christian Heimes217cfd12007-12-02 14:31:20 +00007665 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007666 PyObject *x;
7667
7668 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007669 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007670 x = PyObject_GetItem(mapping, w);
7671 Py_DECREF(w);
7672 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007673 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7674 /* No mapping found means: use 1:1 mapping. */
7675 PyErr_Clear();
7676 *result = NULL;
7677 return 0;
7678 } else
7679 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007680 }
7681 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007682 *result = x;
7683 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007684 }
Christian Heimes217cfd12007-12-02 14:31:20 +00007685 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007686 long value = PyLong_AS_LONG(x);
7687 long max = PyUnicode_GetMax();
7688 if (value < 0 || value > max) {
7689 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00007690 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00007691 Py_DECREF(x);
7692 return -1;
7693 }
7694 *result = x;
7695 return 0;
7696 }
7697 else if (PyUnicode_Check(x)) {
7698 *result = x;
7699 return 0;
7700 }
7701 else {
7702 /* wrong return value */
7703 PyErr_SetString(PyExc_TypeError,
7704 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007705 Py_DECREF(x);
7706 return -1;
7707 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007708}
7709/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00007710 if not reallocate and adjust various state variables.
7711 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007712static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007713charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00007714 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007715{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007716 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00007717 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007718 /* exponentially overallocate to minimize reallocations */
7719 if (requiredsize < 2 * oldsize)
7720 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007721 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
7722 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007723 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007724 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007725 }
7726 return 0;
7727}
7728/* lookup the character, put the result in the output string and adjust
7729 various state variables. Return a new reference to the object that
7730 was put in the output buffer in *result, or Py_None, if the mapping was
7731 undefined (in which case no character was written).
7732 The called must decref result.
7733 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007734static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007735charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
7736 PyObject *mapping, Py_UCS4 **output,
7737 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007738 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007739{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007740 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
7741 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00007742 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007743 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007744 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007745 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007746 }
7747 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007748 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00007749 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007750 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007751 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007752 }
7753 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007754 Py_ssize_t repsize;
7755 if (PyUnicode_READY(*res) == -1)
7756 return -1;
7757 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00007758 if (repsize==1) {
7759 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007760 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00007761 }
7762 else if (repsize!=0) {
7763 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007764 Py_ssize_t requiredsize = *opos +
7765 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00007766 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007767 Py_ssize_t i;
7768 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007769 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007770 for(i = 0; i < repsize; i++)
7771 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00007772 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007773 }
7774 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007775 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007776 return 0;
7777}
7778
Alexander Belopolsky40018472011-02-26 01:02:56 +00007779PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007780_PyUnicode_TranslateCharmap(PyObject *input,
7781 PyObject *mapping,
7782 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007783{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007784 /* input object */
7785 char *idata;
7786 Py_ssize_t size, i;
7787 int kind;
7788 /* output buffer */
7789 Py_UCS4 *output = NULL;
7790 Py_ssize_t osize;
7791 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007792 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007793 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007794 char *reason = "character maps to <undefined>";
7795 PyObject *errorHandler = NULL;
7796 PyObject *exc = NULL;
7797 /* the following variable is used for caching string comparisons
7798 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7799 * 3=ignore, 4=xmlcharrefreplace */
7800 int known_errorHandler = -1;
7801
Guido van Rossumd57fd912000-03-10 22:53:23 +00007802 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007803 PyErr_BadArgument();
7804 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007805 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007806
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007807 if (PyUnicode_READY(input) == -1)
7808 return NULL;
7809 idata = (char*)PyUnicode_DATA(input);
7810 kind = PyUnicode_KIND(input);
7811 size = PyUnicode_GET_LENGTH(input);
7812 i = 0;
7813
7814 if (size == 0) {
7815 Py_INCREF(input);
7816 return input;
7817 }
7818
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007819 /* allocate enough for a simple 1:1 translation without
7820 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007821 osize = size;
7822 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
7823 opos = 0;
7824 if (output == NULL) {
7825 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00007826 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007827 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007828
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007829 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007830 /* try to encode it */
7831 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007832 if (charmaptranslate_output(input, i, mapping,
7833 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007834 Py_XDECREF(x);
7835 goto onError;
7836 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007837 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00007838 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007839 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00007840 else { /* untranslatable character */
7841 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
7842 Py_ssize_t repsize;
7843 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007844 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00007845 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007846 Py_ssize_t collstart = i;
7847 Py_ssize_t collend = i+1;
7848 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007849
Benjamin Peterson29060642009-01-31 22:14:21 +00007850 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007851 while (collend < size) {
7852 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007853 goto onError;
7854 Py_XDECREF(x);
7855 if (x!=Py_None)
7856 break;
7857 ++collend;
7858 }
7859 /* cache callback name lookup
7860 * (if not done yet, i.e. it's the first error) */
7861 if (known_errorHandler==-1) {
7862 if ((errors==NULL) || (!strcmp(errors, "strict")))
7863 known_errorHandler = 1;
7864 else if (!strcmp(errors, "replace"))
7865 known_errorHandler = 2;
7866 else if (!strcmp(errors, "ignore"))
7867 known_errorHandler = 3;
7868 else if (!strcmp(errors, "xmlcharrefreplace"))
7869 known_errorHandler = 4;
7870 else
7871 known_errorHandler = 0;
7872 }
7873 switch (known_errorHandler) {
7874 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007875 raise_translate_exception(&exc, input, collstart,
7876 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007877 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007878 case 2: /* replace */
7879 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007880 for (coll = collstart; coll<collend; coll++)
7881 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00007882 /* fall through */
7883 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007884 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007885 break;
7886 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007887 /* generate replacement (temporarily (mis)uses i) */
7888 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007889 char buffer[2+29+1+1];
7890 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007891 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
7892 if (charmaptranslate_makespace(&output, &osize,
7893 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00007894 goto onError;
7895 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007896 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00007897 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007898 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007899 break;
7900 default:
7901 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007902 reason, input, &exc,
7903 collstart, collend, &newpos);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02007904 if (repunicode == NULL || _PyUnicode_READY_REPLACE(&repunicode))
Benjamin Peterson29060642009-01-31 22:14:21 +00007905 goto onError;
7906 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007907 repsize = PyUnicode_GET_LENGTH(repunicode);
7908 if (charmaptranslate_makespace(&output, &osize,
7909 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007910 Py_DECREF(repunicode);
7911 goto onError;
7912 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007913 for (uni2 = 0; repsize-->0; ++uni2)
7914 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
7915 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00007916 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007917 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007918 }
7919 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007920 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
7921 if (!res)
7922 goto onError;
7923 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007924 Py_XDECREF(exc);
7925 Py_XDECREF(errorHandler);
7926 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007927
Benjamin Peterson29060642009-01-31 22:14:21 +00007928 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007929 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007930 Py_XDECREF(exc);
7931 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007932 return NULL;
7933}
7934
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007935/* Deprecated. Use PyUnicode_Translate instead. */
7936PyObject *
7937PyUnicode_TranslateCharmap(const Py_UNICODE *p,
7938 Py_ssize_t size,
7939 PyObject *mapping,
7940 const char *errors)
7941{
7942 PyObject *unicode = PyUnicode_FromUnicode(p, size);
7943 if (!unicode)
7944 return NULL;
7945 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
7946}
7947
Alexander Belopolsky40018472011-02-26 01:02:56 +00007948PyObject *
7949PyUnicode_Translate(PyObject *str,
7950 PyObject *mapping,
7951 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007952{
7953 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007954
Guido van Rossumd57fd912000-03-10 22:53:23 +00007955 str = PyUnicode_FromObject(str);
7956 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007957 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007958 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007959 Py_DECREF(str);
7960 return result;
Tim Petersced69f82003-09-16 20:30:58 +00007961
Benjamin Peterson29060642009-01-31 22:14:21 +00007962 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007963 Py_XDECREF(str);
7964 return NULL;
7965}
Tim Petersced69f82003-09-16 20:30:58 +00007966
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007967static Py_UCS4
7968fix_decimal_and_space_to_ascii(PyUnicodeObject *self)
7969{
7970 /* No need to call PyUnicode_READY(self) because this function is only
7971 called as a callback from fixup() which does it already. */
7972 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
7973 const int kind = PyUnicode_KIND(self);
7974 void *data = PyUnicode_DATA(self);
7975 Py_UCS4 maxchar = 0, ch, fixed;
7976 Py_ssize_t i;
7977
7978 for (i = 0; i < len; ++i) {
7979 ch = PyUnicode_READ(kind, data, i);
7980 fixed = 0;
7981 if (ch > 127) {
7982 if (Py_UNICODE_ISSPACE(ch))
7983 fixed = ' ';
7984 else {
7985 const int decimal = Py_UNICODE_TODECIMAL(ch);
7986 if (decimal >= 0)
7987 fixed = '0' + decimal;
7988 }
7989 if (fixed != 0) {
7990 if (fixed > maxchar)
7991 maxchar = fixed;
7992 PyUnicode_WRITE(kind, data, i, fixed);
7993 }
7994 else if (ch > maxchar)
7995 maxchar = ch;
7996 }
7997 else if (ch > maxchar)
7998 maxchar = ch;
7999 }
8000
8001 return maxchar;
8002}
8003
8004PyObject *
8005_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8006{
8007 if (!PyUnicode_Check(unicode)) {
8008 PyErr_BadInternalCall();
8009 return NULL;
8010 }
8011 if (PyUnicode_READY(unicode) == -1)
8012 return NULL;
8013 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8014 /* If the string is already ASCII, just return the same string */
8015 Py_INCREF(unicode);
8016 return unicode;
8017 }
8018 return fixup((PyUnicodeObject *)unicode, fix_decimal_and_space_to_ascii);
8019}
8020
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008021PyObject *
8022PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8023 Py_ssize_t length)
8024{
8025 PyObject *result;
8026 Py_UNICODE *p; /* write pointer into result */
8027 Py_ssize_t i;
8028 /* Copy to a new string */
8029 result = (PyObject *)_PyUnicode_New(length);
8030 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
8031 if (result == NULL)
8032 return result;
8033 p = PyUnicode_AS_UNICODE(result);
8034 /* Iterate over code points */
8035 for (i = 0; i < length; i++) {
8036 Py_UNICODE ch =s[i];
8037 if (ch > 127) {
8038 int decimal = Py_UNICODE_TODECIMAL(ch);
8039 if (decimal >= 0)
8040 p[i] = '0' + decimal;
8041 }
8042 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008043 if (PyUnicode_READY((PyUnicodeObject*)result) == -1) {
8044 Py_DECREF(result);
8045 return NULL;
8046 }
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008047 return result;
8048}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008049/* --- Decimal Encoder ---------------------------------------------------- */
8050
Alexander Belopolsky40018472011-02-26 01:02:56 +00008051int
8052PyUnicode_EncodeDecimal(Py_UNICODE *s,
8053 Py_ssize_t length,
8054 char *output,
8055 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008056{
8057 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008058 PyObject *errorHandler = NULL;
8059 PyObject *exc = NULL;
8060 const char *encoding = "decimal";
8061 const char *reason = "invalid decimal Unicode string";
8062 /* the following variable is used for caching string comparisons
8063 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
8064 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008065
8066 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008067 PyErr_BadArgument();
8068 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008069 }
8070
8071 p = s;
8072 end = s + length;
8073 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008074 register Py_UNICODE ch = *p;
8075 int decimal;
8076 PyObject *repunicode;
8077 Py_ssize_t repsize;
8078 Py_ssize_t newpos;
8079 Py_UNICODE *uni2;
8080 Py_UNICODE *collstart;
8081 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00008082
Benjamin Peterson29060642009-01-31 22:14:21 +00008083 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008084 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00008085 ++p;
8086 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008087 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008088 decimal = Py_UNICODE_TODECIMAL(ch);
8089 if (decimal >= 0) {
8090 *output++ = '0' + decimal;
8091 ++p;
8092 continue;
8093 }
8094 if (0 < ch && ch < 256) {
8095 *output++ = (char)ch;
8096 ++p;
8097 continue;
8098 }
8099 /* All other characters are considered unencodable */
8100 collstart = p;
8101 collend = p+1;
8102 while (collend < end) {
8103 if ((0 < *collend && *collend < 256) ||
8104 !Py_UNICODE_ISSPACE(*collend) ||
8105 Py_UNICODE_TODECIMAL(*collend))
8106 break;
8107 }
8108 /* cache callback name lookup
8109 * (if not done yet, i.e. it's the first error) */
8110 if (known_errorHandler==-1) {
8111 if ((errors==NULL) || (!strcmp(errors, "strict")))
8112 known_errorHandler = 1;
8113 else if (!strcmp(errors, "replace"))
8114 known_errorHandler = 2;
8115 else if (!strcmp(errors, "ignore"))
8116 known_errorHandler = 3;
8117 else if (!strcmp(errors, "xmlcharrefreplace"))
8118 known_errorHandler = 4;
8119 else
8120 known_errorHandler = 0;
8121 }
8122 switch (known_errorHandler) {
8123 case 1: /* strict */
8124 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
8125 goto onError;
8126 case 2: /* replace */
8127 for (p = collstart; p < collend; ++p)
8128 *output++ = '?';
8129 /* fall through */
8130 case 3: /* ignore */
8131 p = collend;
8132 break;
8133 case 4: /* xmlcharrefreplace */
8134 /* generate replacement (temporarily (mis)uses p) */
8135 for (p = collstart; p < collend; ++p)
8136 output += sprintf(output, "&#%d;", (int)*p);
8137 p = collend;
8138 break;
8139 default:
8140 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
8141 encoding, reason, s, length, &exc,
8142 collstart-s, collend-s, &newpos);
8143 if (repunicode == NULL)
8144 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008145 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00008146 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008147 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
8148 Py_DECREF(repunicode);
8149 goto onError;
8150 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008151 /* generate replacement */
8152 repsize = PyUnicode_GET_SIZE(repunicode);
8153 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
8154 Py_UNICODE ch = *uni2;
8155 if (Py_UNICODE_ISSPACE(ch))
8156 *output++ = ' ';
8157 else {
8158 decimal = Py_UNICODE_TODECIMAL(ch);
8159 if (decimal >= 0)
8160 *output++ = '0' + decimal;
8161 else if (0 < ch && ch < 256)
8162 *output++ = (char)ch;
8163 else {
8164 Py_DECREF(repunicode);
8165 raise_encode_exception(&exc, encoding,
8166 s, length, collstart-s, collend-s, reason);
8167 goto onError;
8168 }
8169 }
8170 }
8171 p = s + newpos;
8172 Py_DECREF(repunicode);
8173 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00008174 }
8175 /* 0-terminate the output string */
8176 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008177 Py_XDECREF(exc);
8178 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008179 return 0;
8180
Benjamin Peterson29060642009-01-31 22:14:21 +00008181 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008182 Py_XDECREF(exc);
8183 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008184 return -1;
8185}
8186
Guido van Rossumd57fd912000-03-10 22:53:23 +00008187/* --- Helpers ------------------------------------------------------------ */
8188
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008189#include "stringlib/ucs1lib.h"
8190#include "stringlib/fastsearch.h"
8191#include "stringlib/partition.h"
8192#include "stringlib/split.h"
8193#include "stringlib/count.h"
8194#include "stringlib/find.h"
8195#include "stringlib/localeutil.h"
8196#include "stringlib/undef.h"
8197
8198#include "stringlib/ucs2lib.h"
8199#include "stringlib/fastsearch.h"
8200#include "stringlib/partition.h"
8201#include "stringlib/split.h"
8202#include "stringlib/count.h"
8203#include "stringlib/find.h"
8204#include "stringlib/localeutil.h"
8205#include "stringlib/undef.h"
8206
8207#include "stringlib/ucs4lib.h"
8208#include "stringlib/fastsearch.h"
8209#include "stringlib/partition.h"
8210#include "stringlib/split.h"
8211#include "stringlib/count.h"
8212#include "stringlib/find.h"
8213#include "stringlib/localeutil.h"
8214#include "stringlib/undef.h"
8215
8216static Py_ssize_t
8217any_find_slice(Py_ssize_t Py_LOCAL_CALLBACK(ucs1)(const Py_UCS1*, Py_ssize_t,
8218 const Py_UCS1*, Py_ssize_t,
8219 Py_ssize_t, Py_ssize_t),
8220 Py_ssize_t Py_LOCAL_CALLBACK(ucs2)(const Py_UCS2*, Py_ssize_t,
8221 const Py_UCS2*, Py_ssize_t,
8222 Py_ssize_t, Py_ssize_t),
8223 Py_ssize_t Py_LOCAL_CALLBACK(ucs4)(const Py_UCS4*, Py_ssize_t,
8224 const Py_UCS4*, Py_ssize_t,
8225 Py_ssize_t, Py_ssize_t),
8226 PyObject* s1, PyObject* s2,
8227 Py_ssize_t start,
8228 Py_ssize_t end)
8229{
8230 int kind1, kind2, kind;
8231 void *buf1, *buf2;
8232 Py_ssize_t len1, len2, result;
8233
8234 kind1 = PyUnicode_KIND(s1);
8235 kind2 = PyUnicode_KIND(s2);
8236 kind = kind1 > kind2 ? kind1 : kind2;
8237 buf1 = PyUnicode_DATA(s1);
8238 buf2 = PyUnicode_DATA(s2);
8239 if (kind1 != kind)
8240 buf1 = _PyUnicode_AsKind(s1, kind);
8241 if (!buf1)
8242 return -2;
8243 if (kind2 != kind)
8244 buf2 = _PyUnicode_AsKind(s2, kind);
8245 if (!buf2) {
8246 if (kind1 != kind) PyMem_Free(buf1);
8247 return -2;
8248 }
8249 len1 = PyUnicode_GET_LENGTH(s1);
8250 len2 = PyUnicode_GET_LENGTH(s2);
8251
8252 switch(kind) {
8253 case PyUnicode_1BYTE_KIND:
8254 result = ucs1(buf1, len1, buf2, len2, start, end);
8255 break;
8256 case PyUnicode_2BYTE_KIND:
8257 result = ucs2(buf1, len1, buf2, len2, start, end);
8258 break;
8259 case PyUnicode_4BYTE_KIND:
8260 result = ucs4(buf1, len1, buf2, len2, start, end);
8261 break;
8262 default:
8263 assert(0); result = -2;
8264 }
8265
8266 if (kind1 != kind)
8267 PyMem_Free(buf1);
8268 if (kind2 != kind)
8269 PyMem_Free(buf2);
8270
8271 return result;
8272}
8273
8274Py_ssize_t
8275_PyUnicode_InsertThousandsGrouping(int kind, void *data,
8276 Py_ssize_t n_buffer,
8277 void *digits, Py_ssize_t n_digits,
8278 Py_ssize_t min_width,
8279 const char *grouping,
8280 const char *thousands_sep)
8281{
8282 switch(kind) {
8283 case PyUnicode_1BYTE_KIND:
8284 return _PyUnicode_ucs1_InsertThousandsGrouping(
8285 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8286 min_width, grouping, thousands_sep);
8287 case PyUnicode_2BYTE_KIND:
8288 return _PyUnicode_ucs2_InsertThousandsGrouping(
8289 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
8290 min_width, grouping, thousands_sep);
8291 case PyUnicode_4BYTE_KIND:
8292 return _PyUnicode_ucs4_InsertThousandsGrouping(
8293 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
8294 min_width, grouping, thousands_sep);
8295 }
8296 assert(0);
8297 return -1;
8298}
8299
8300
Eric Smith8c663262007-08-25 02:26:07 +00008301#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00008302#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008303
Thomas Wouters477c8d52006-05-27 19:21:47 +00008304#include "stringlib/count.h"
8305#include "stringlib/find.h"
Eric Smith5807c412008-05-11 21:00:57 +00008306
/* Helper macro to fix up start/end slice values against a length `len`:
   `end` is clipped to `len`; a negative `end` or `start` is taken relative
   to `len` and clipped to 0 if still negative.  `start` and `end` must be
   modifiable lvalues; every argument may be evaluated more than once.

   The body is wrapped in do { } while (0) so that the macro expands to a
   single statement and can safely be used as the body of an unbraced
   if/else.  Invoke it with a trailing semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00008321
Alexander Belopolsky40018472011-02-26 01:02:56 +00008322Py_ssize_t
8323PyUnicode_Count(PyObject *str,
8324 PyObject *substr,
8325 Py_ssize_t start,
8326 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008327{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008328 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008329 PyUnicodeObject* str_obj;
8330 PyUnicodeObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008331 int kind1, kind2, kind;
8332 void *buf1 = NULL, *buf2 = NULL;
8333 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00008334
Thomas Wouters477c8d52006-05-27 19:21:47 +00008335 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008336 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008337 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008338 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
Victor Stinnere9a29352011-10-01 02:14:59 +02008339 if (!sub_obj || PyUnicode_READY(sub_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008340 Py_DECREF(str_obj);
8341 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008342 }
Tim Petersced69f82003-09-16 20:30:58 +00008343
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008344 kind1 = PyUnicode_KIND(str_obj);
8345 kind2 = PyUnicode_KIND(sub_obj);
8346 kind = kind1 > kind2 ? kind1 : kind2;
8347 buf1 = PyUnicode_DATA(str_obj);
8348 if (kind1 != kind)
8349 buf1 = _PyUnicode_AsKind((PyObject*)str_obj, kind);
8350 if (!buf1)
8351 goto onError;
8352 buf2 = PyUnicode_DATA(sub_obj);
8353 if (kind2 != kind)
8354 buf2 = _PyUnicode_AsKind((PyObject*)sub_obj, kind);
8355 if (!buf2)
8356 goto onError;
8357 len1 = PyUnicode_GET_LENGTH(str_obj);
8358 len2 = PyUnicode_GET_LENGTH(sub_obj);
8359
8360 ADJUST_INDICES(start, end, len1);
8361 switch(kind) {
8362 case PyUnicode_1BYTE_KIND:
8363 result = ucs1lib_count(
8364 ((Py_UCS1*)buf1) + start, end - start,
8365 buf2, len2, PY_SSIZE_T_MAX
8366 );
8367 break;
8368 case PyUnicode_2BYTE_KIND:
8369 result = ucs2lib_count(
8370 ((Py_UCS2*)buf1) + start, end - start,
8371 buf2, len2, PY_SSIZE_T_MAX
8372 );
8373 break;
8374 case PyUnicode_4BYTE_KIND:
8375 result = ucs4lib_count(
8376 ((Py_UCS4*)buf1) + start, end - start,
8377 buf2, len2, PY_SSIZE_T_MAX
8378 );
8379 break;
8380 default:
8381 assert(0); result = 0;
8382 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00008383
8384 Py_DECREF(sub_obj);
8385 Py_DECREF(str_obj);
8386
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008387 if (kind1 != kind)
8388 PyMem_Free(buf1);
8389 if (kind2 != kind)
8390 PyMem_Free(buf2);
8391
Guido van Rossumd57fd912000-03-10 22:53:23 +00008392 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008393 onError:
8394 Py_DECREF(sub_obj);
8395 Py_DECREF(str_obj);
8396 if (kind1 != kind && buf1)
8397 PyMem_Free(buf1);
8398 if (kind2 != kind && buf2)
8399 PyMem_Free(buf2);
8400 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008401}
8402
Alexander Belopolsky40018472011-02-26 01:02:56 +00008403Py_ssize_t
8404PyUnicode_Find(PyObject *str,
8405 PyObject *sub,
8406 Py_ssize_t start,
8407 Py_ssize_t end,
8408 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008409{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008410 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008411
Guido van Rossumd57fd912000-03-10 22:53:23 +00008412 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008413 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008414 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008415 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008416 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008417 Py_DECREF(str);
8418 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008419 }
Tim Petersced69f82003-09-16 20:30:58 +00008420
Thomas Wouters477c8d52006-05-27 19:21:47 +00008421 if (direction > 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008422 result = any_find_slice(
8423 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
8424 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008425 );
8426 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008427 result = any_find_slice(
8428 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
8429 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008430 );
8431
Guido van Rossumd57fd912000-03-10 22:53:23 +00008432 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008433 Py_DECREF(sub);
8434
Guido van Rossumd57fd912000-03-10 22:53:23 +00008435 return result;
8436}
8437
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008438Py_ssize_t
8439PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
8440 Py_ssize_t start, Py_ssize_t end,
8441 int direction)
8442{
8443 char *result;
8444 int kind;
8445 if (PyUnicode_READY(str) == -1)
8446 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02008447 if (start < 0 || end < 0) {
8448 PyErr_SetString(PyExc_IndexError, "string index out of range");
8449 return -2;
8450 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008451 if (end > PyUnicode_GET_LENGTH(str))
8452 end = PyUnicode_GET_LENGTH(str);
8453 kind = PyUnicode_KIND(str);
8454 result = findchar(PyUnicode_1BYTE_DATA(str)
8455 + PyUnicode_KIND_SIZE(kind, start),
8456 kind,
8457 end-start, ch, direction);
8458 if (!result)
8459 return -1;
8460 return (result-(char*)PyUnicode_DATA(str)) >> (kind-1);
8461}
8462
Alexander Belopolsky40018472011-02-26 01:02:56 +00008463static int
8464tailmatch(PyUnicodeObject *self,
8465 PyUnicodeObject *substring,
8466 Py_ssize_t start,
8467 Py_ssize_t end,
8468 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008469{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008470 int kind_self;
8471 int kind_sub;
8472 void *data_self;
8473 void *data_sub;
8474 Py_ssize_t offset;
8475 Py_ssize_t i;
8476 Py_ssize_t end_sub;
8477
8478 if (PyUnicode_READY(self) == -1 ||
8479 PyUnicode_READY(substring) == -1)
8480 return 0;
8481
8482 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008483 return 1;
8484
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008485 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
8486 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008487 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00008488 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008489
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008490 kind_self = PyUnicode_KIND(self);
8491 data_self = PyUnicode_DATA(self);
8492 kind_sub = PyUnicode_KIND(substring);
8493 data_sub = PyUnicode_DATA(substring);
8494 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
8495
8496 if (direction > 0)
8497 offset = end;
8498 else
8499 offset = start;
8500
8501 if (PyUnicode_READ(kind_self, data_self, offset) ==
8502 PyUnicode_READ(kind_sub, data_sub, 0) &&
8503 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
8504 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
8505 /* If both are of the same kind, memcmp is sufficient */
8506 if (kind_self == kind_sub) {
8507 return ! memcmp((char *)data_self +
8508 (offset * PyUnicode_CHARACTER_SIZE(substring)),
8509 data_sub,
8510 PyUnicode_GET_LENGTH(substring) *
8511 PyUnicode_CHARACTER_SIZE(substring));
8512 }
8513 /* otherwise we have to compare each character by first accesing it */
8514 else {
8515 /* We do not need to compare 0 and len(substring)-1 because
8516 the if statement above ensured already that they are equal
8517 when we end up here. */
8518 // TODO: honor direction and do a forward or backwards search
8519 for (i = 1; i < end_sub; ++i) {
8520 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
8521 PyUnicode_READ(kind_sub, data_sub, i))
8522 return 0;
8523 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008524 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008525 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008526 }
8527
8528 return 0;
8529}
8530
Alexander Belopolsky40018472011-02-26 01:02:56 +00008531Py_ssize_t
8532PyUnicode_Tailmatch(PyObject *str,
8533 PyObject *substr,
8534 Py_ssize_t start,
8535 Py_ssize_t end,
8536 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008537{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008538 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008539
Guido van Rossumd57fd912000-03-10 22:53:23 +00008540 str = PyUnicode_FromObject(str);
8541 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008542 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008543 substr = PyUnicode_FromObject(substr);
8544 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008545 Py_DECREF(str);
8546 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008547 }
Tim Petersced69f82003-09-16 20:30:58 +00008548
Guido van Rossumd57fd912000-03-10 22:53:23 +00008549 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00008550 (PyUnicodeObject *)substr,
8551 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008552 Py_DECREF(str);
8553 Py_DECREF(substr);
8554 return result;
8555}
8556
Guido van Rossumd57fd912000-03-10 22:53:23 +00008557/* Apply fixfct filter to the Unicode object self and return a
8558 reference to the modified object */
8559
Alexander Belopolsky40018472011-02-26 01:02:56 +00008560static PyObject *
8561fixup(PyUnicodeObject *self,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008562 Py_UCS4 (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008563{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008564 PyObject *u;
8565 Py_UCS4 maxchar_old, maxchar_new = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008566
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008567 if (PyUnicode_READY(self) == -1)
8568 return NULL;
8569 maxchar_old = PyUnicode_MAX_CHAR_VALUE(self);
8570 u = PyUnicode_New(PyUnicode_GET_LENGTH(self),
8571 maxchar_old);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008572 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008573 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008574
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008575 Py_MEMCPY(PyUnicode_1BYTE_DATA(u), PyUnicode_1BYTE_DATA(self),
8576 PyUnicode_GET_LENGTH(u) * PyUnicode_CHARACTER_SIZE(u));
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008577
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008578 /* fix functions return the new maximum character in a string,
8579 if the kind of the resulting unicode object does not change,
8580 everything is fine. Otherwise we need to change the string kind
8581 and re-run the fix function. */
8582 maxchar_new = fixfct((PyUnicodeObject*)u);
8583 if (maxchar_new == 0)
8584 /* do nothing, keep maxchar_new at 0 which means no changes. */;
8585 else if (maxchar_new <= 127)
8586 maxchar_new = 127;
8587 else if (maxchar_new <= 255)
8588 maxchar_new = 255;
8589 else if (maxchar_new <= 65535)
8590 maxchar_new = 65535;
8591 else
8592 maxchar_new = 1114111; /* 0x10ffff */
8593
8594 if (!maxchar_new && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008595 /* fixfct should return TRUE if it modified the buffer. If
8596 FALSE, return a reference to the original buffer instead
8597 (to save space, not time) */
8598 Py_INCREF(self);
8599 Py_DECREF(u);
8600 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008601 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008602 else if (maxchar_new == maxchar_old) {
8603 return u;
8604 }
8605 else {
8606 /* In case the maximum character changed, we need to
8607 convert the string to the new category. */
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008608 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008609 if (v == NULL) {
8610 Py_DECREF(u);
8611 return NULL;
8612 }
8613 if (maxchar_new > maxchar_old) {
8614 /* If the maxchar increased so that the kind changed, not all
8615 characters are representable anymore and we need to fix the
8616 string again. This only happens in very few cases. */
Victor Stinner157f83f2011-09-28 21:41:31 +02008617 if (PyUnicode_CopyCharacters(v, 0,
8618 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008619 PyUnicode_GET_LENGTH(self)) < 0)
8620 {
8621 Py_DECREF(u);
8622 return NULL;
8623 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008624 maxchar_old = fixfct((PyUnicodeObject*)v);
8625 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
8626 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008627 else {
Victor Stinner157f83f2011-09-28 21:41:31 +02008628 if (PyUnicode_CopyCharacters(v, 0,
8629 u, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008630 PyUnicode_GET_LENGTH(self)) < 0)
8631 {
8632 Py_DECREF(u);
8633 return NULL;
8634 }
8635 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008636
8637 Py_DECREF(u);
8638 return v;
8639 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008640}
8641
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008642static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008643fixupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008644{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008645 /* No need to call PyUnicode_READY(self) because this function is only
8646 called as a callback from fixup() which does it already. */
8647 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8648 const int kind = PyUnicode_KIND(self);
8649 void *data = PyUnicode_DATA(self);
8650 int touched = 0;
8651 Py_UCS4 maxchar = 0;
8652 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008653
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008654 for (i = 0; i < len; ++i) {
8655 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8656 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
8657 if (up != ch) {
8658 if (up > maxchar)
8659 maxchar = up;
8660 PyUnicode_WRITE(kind, data, i, up);
8661 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008662 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008663 else if (ch > maxchar)
8664 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008665 }
8666
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008667 if (touched)
8668 return maxchar;
8669 else
8670 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008671}
8672
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008673static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008674fixlower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008675{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008676 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8677 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8678 const int kind = PyUnicode_KIND(self);
8679 void *data = PyUnicode_DATA(self);
8680 int touched = 0;
8681 Py_UCS4 maxchar = 0;
8682 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008683
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008684 for(i = 0; i < len; ++i) {
8685 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8686 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8687 if (lo != ch) {
8688 if (lo > maxchar)
8689 maxchar = lo;
8690 PyUnicode_WRITE(kind, data, i, lo);
8691 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008692 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008693 else if (ch > maxchar)
8694 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008695 }
8696
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008697 if (touched)
8698 return maxchar;
8699 else
8700 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008701}
8702
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008703static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008704fixswapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008705{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008706 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8707 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8708 const int kind = PyUnicode_KIND(self);
8709 void *data = PyUnicode_DATA(self);
8710 int touched = 0;
8711 Py_UCS4 maxchar = 0;
8712 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008713
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008714 for(i = 0; i < len; ++i) {
8715 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8716 Py_UCS4 nu = 0;
8717
8718 if (Py_UNICODE_ISUPPER(ch))
8719 nu = Py_UNICODE_TOLOWER(ch);
8720 else if (Py_UNICODE_ISLOWER(ch))
8721 nu = Py_UNICODE_TOUPPER(ch);
8722
8723 if (nu != 0) {
8724 if (nu > maxchar)
8725 maxchar = nu;
8726 PyUnicode_WRITE(kind, data, i, nu);
8727 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008728 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008729 else if (ch > maxchar)
8730 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008731 }
8732
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008733 if (touched)
8734 return maxchar;
8735 else
8736 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008737}
8738
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008739static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008740fixcapitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008741{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008742 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8743 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8744 const int kind = PyUnicode_KIND(self);
8745 void *data = PyUnicode_DATA(self);
8746 int touched = 0;
8747 Py_UCS4 maxchar = 0;
8748 Py_ssize_t i = 0;
8749 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00008750
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008751 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008752 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008753
8754 ch = PyUnicode_READ(kind, data, i);
8755 if (!Py_UNICODE_ISUPPER(ch)) {
8756 maxchar = Py_UNICODE_TOUPPER(ch);
8757 PyUnicode_WRITE(kind, data, i, maxchar);
8758 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008759 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008760 ++i;
8761 for(; i < len; ++i) {
8762 ch = PyUnicode_READ(kind, data, i);
8763 if (!Py_UNICODE_ISLOWER(ch)) {
8764 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8765 if (lo > maxchar)
8766 maxchar = lo;
8767 PyUnicode_WRITE(kind, data, i, lo);
8768 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008769 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008770 else if (ch > maxchar)
8771 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008772 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008773
8774 if (touched)
8775 return maxchar;
8776 else
8777 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008778}
8779
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008780static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008781fixtitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008782{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008783 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8784 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8785 const int kind = PyUnicode_KIND(self);
8786 void *data = PyUnicode_DATA(self);
8787 Py_UCS4 maxchar = 0;
8788 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008789 int previous_is_cased;
8790
8791 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008792 if (len == 1) {
8793 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8794 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
8795 if (ti != ch) {
8796 PyUnicode_WRITE(kind, data, i, ti);
8797 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00008798 }
8799 else
8800 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008801 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008802 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008803 for(; i < len; ++i) {
8804 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8805 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00008806
Benjamin Peterson29060642009-01-31 22:14:21 +00008807 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008808 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00008809 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008810 nu = Py_UNICODE_TOTITLE(ch);
8811
8812 if (nu > maxchar)
8813 maxchar = nu;
8814 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00008815
Benjamin Peterson29060642009-01-31 22:14:21 +00008816 if (Py_UNICODE_ISLOWER(ch) ||
8817 Py_UNICODE_ISUPPER(ch) ||
8818 Py_UNICODE_ISTITLE(ch))
8819 previous_is_cased = 1;
8820 else
8821 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008822 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008823 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008824}
8825
Tim Peters8ce9f162004-08-27 01:49:32 +00008826PyObject *
8827PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008828{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008829 PyObject *sep = NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008830 Py_ssize_t seplen = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008831 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00008832 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008833 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
8834 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00008835 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008836 Py_ssize_t sz, i, res_offset;
8837 Py_UCS4 maxchar = 0;
8838 Py_UCS4 item_maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008839
Tim Peters05eba1f2004-08-27 21:32:02 +00008840 fseq = PySequence_Fast(seq, "");
8841 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008842 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00008843 }
8844
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008845 /* NOTE: the following code can't call back into Python code,
8846 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00008847 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008848
Tim Peters05eba1f2004-08-27 21:32:02 +00008849 seqlen = PySequence_Fast_GET_SIZE(fseq);
8850 /* If empty sequence, return u"". */
8851 if (seqlen == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008852 res = PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008853 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00008854 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008855 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00008856 /* If singleton sequence with an exact Unicode, return that. */
8857 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008858 item = items[0];
8859 if (PyUnicode_CheckExact(item)) {
8860 Py_INCREF(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008861 res = item;
Benjamin Peterson29060642009-01-31 22:14:21 +00008862 goto Done;
8863 }
Tim Peters8ce9f162004-08-27 01:49:32 +00008864 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008865 else {
8866 /* Set up sep and seplen */
8867 if (separator == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008868 /* fall back to a blank space separator */
8869 sep = PyUnicode_FromOrdinal(' ');
Victor Stinnere9a29352011-10-01 02:14:59 +02008870 if (!sep)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008871 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00008872 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008873 else {
8874 if (!PyUnicode_Check(separator)) {
8875 PyErr_Format(PyExc_TypeError,
8876 "separator: expected str instance,"
8877 " %.80s found",
8878 Py_TYPE(separator)->tp_name);
8879 goto onError;
8880 }
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02008881 if (PyUnicode_READY(separator))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008882 goto onError;
8883 sep = separator;
8884 seplen = PyUnicode_GET_LENGTH(separator);
8885 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
8886 /* inc refcount to keep this code path symetric with the
8887 above case of a blank separator */
8888 Py_INCREF(sep);
Tim Peters05eba1f2004-08-27 21:32:02 +00008889 }
8890 }
8891
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008892 /* There are at least two things to join, or else we have a subclass
8893 * of str in the sequence.
8894 * Do a pre-pass to figure out the total amount of space we'll
8895 * need (sz), and see whether all argument are strings.
8896 */
8897 sz = 0;
8898 for (i = 0; i < seqlen; i++) {
8899 const Py_ssize_t old_sz = sz;
8900 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00008901 if (!PyUnicode_Check(item)) {
8902 PyErr_Format(PyExc_TypeError,
8903 "sequence item %zd: expected str instance,"
8904 " %.80s found",
8905 i, Py_TYPE(item)->tp_name);
8906 goto onError;
8907 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008908 if (PyUnicode_READY(item) == -1)
8909 goto onError;
8910 sz += PyUnicode_GET_LENGTH(item);
8911 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
8912 if (item_maxchar > maxchar)
8913 maxchar = item_maxchar;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008914 if (i != 0)
8915 sz += seplen;
8916 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
8917 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008918 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008919 goto onError;
8920 }
8921 }
Tim Petersced69f82003-09-16 20:30:58 +00008922
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008923 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008924 if (res == NULL)
8925 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00008926
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008927 /* Catenate everything. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008928 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinner9ce5a832011-10-03 23:36:02 +02008929 Py_ssize_t itemlen, copied;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008930 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00008931 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02008932 if (i && seplen != 0) {
8933 copied = PyUnicode_CopyCharacters(res, res_offset,
8934 sep, 0, seplen);
8935 if (copied < 0)
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008936 goto onError;
Victor Stinner9ce5a832011-10-03 23:36:02 +02008937#ifdef Py_DEBUG
8938 res_offset += copied;
8939#else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008940 res_offset += seplen;
Victor Stinner9ce5a832011-10-03 23:36:02 +02008941#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00008942 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02008943 itemlen = PyUnicode_GET_LENGTH(item);
8944 if (itemlen != 0) {
8945 copied = PyUnicode_CopyCharacters(res, res_offset,
8946 item, 0, itemlen);
8947 if (copied < 0)
8948 goto onError;
8949#ifdef Py_DEBUG
8950 res_offset += copied;
8951#else
8952 res_offset += itemlen;
8953#endif
8954 }
Tim Peters05eba1f2004-08-27 21:32:02 +00008955 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008956 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00008957
Benjamin Peterson29060642009-01-31 22:14:21 +00008958 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00008959 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008960 Py_XDECREF(sep);
8961 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008962
Benjamin Peterson29060642009-01-31 22:14:21 +00008963 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00008964 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008965 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00008966 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008967 return NULL;
8968}
8969
/* Store `length` copies of the character `value` into the character
   buffer `data`, starting at character index `start`.  `kind` selects the
   element width of `data` (1, 2 or 4 byte units) and must not be
   PyUnicode_WCHAR_KIND.

   Every argument is parenthesized in the expansion so that arbitrary
   expressions can be passed.  `kind`, `value` and `length` may still be
   evaluated more than once, so they must be free of side effects. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        } \
    } while (0)
8992
Alexander Belopolsky40018472011-02-26 01:02:56 +00008993static PyUnicodeObject *
8994pad(PyUnicodeObject *self,
8995 Py_ssize_t left,
8996 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008997 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008998{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008999 PyObject *u;
9000 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009001 int kind;
9002 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009003
9004 if (left < 0)
9005 left = 0;
9006 if (right < 0)
9007 right = 0;
9008
Tim Peters7a29bd52001-09-12 03:03:31 +00009009 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009010 Py_INCREF(self);
9011 return self;
9012 }
9013
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009014 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9015 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009016 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9017 return NULL;
9018 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009019 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9020 if (fill > maxchar)
9021 maxchar = fill;
9022 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009023 if (!u)
9024 return NULL;
9025
9026 kind = PyUnicode_KIND(u);
9027 data = PyUnicode_DATA(u);
9028 if (left)
9029 FILL(kind, data, fill, 0, left);
9030 if (right)
9031 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinner157f83f2011-09-28 21:41:31 +02009032 if (PyUnicode_CopyCharacters(u, left,
9033 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009034 _PyUnicode_LENGTH(self)) < 0)
9035 {
9036 Py_DECREF(u);
9037 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009038 }
9039
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009040 return (PyUnicodeObject*)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009041}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009042#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00009043
Alexander Belopolsky40018472011-02-26 01:02:56 +00009044PyObject *
9045PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009046{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009047 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009048
9049 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009050 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009051 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009052
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009053 switch(PyUnicode_KIND(string)) {
9054 case PyUnicode_1BYTE_KIND:
9055 list = ucs1lib_splitlines(
9056 (PyObject*) string, PyUnicode_1BYTE_DATA(string),
9057 PyUnicode_GET_LENGTH(string), keepends);
9058 break;
9059 case PyUnicode_2BYTE_KIND:
9060 list = ucs2lib_splitlines(
9061 (PyObject*) string, PyUnicode_2BYTE_DATA(string),
9062 PyUnicode_GET_LENGTH(string), keepends);
9063 break;
9064 case PyUnicode_4BYTE_KIND:
9065 list = ucs4lib_splitlines(
9066 (PyObject*) string, PyUnicode_4BYTE_DATA(string),
9067 PyUnicode_GET_LENGTH(string), keepends);
9068 break;
9069 default:
9070 assert(0);
9071 list = 0;
9072 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009073 Py_DECREF(string);
9074 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009075}
9076
Alexander Belopolsky40018472011-02-26 01:02:56 +00009077static PyObject *
9078split(PyUnicodeObject *self,
9079 PyUnicodeObject *substring,
9080 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009081{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009082 int kind1, kind2, kind;
9083 void *buf1, *buf2;
9084 Py_ssize_t len1, len2;
9085 PyObject* out;
9086
Guido van Rossumd57fd912000-03-10 22:53:23 +00009087 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009088 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009089
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009090 if (PyUnicode_READY(self) == -1)
9091 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009092
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009093 if (substring == NULL)
9094 switch(PyUnicode_KIND(self)) {
9095 case PyUnicode_1BYTE_KIND:
9096 return ucs1lib_split_whitespace(
9097 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9098 PyUnicode_GET_LENGTH(self), maxcount
9099 );
9100 case PyUnicode_2BYTE_KIND:
9101 return ucs2lib_split_whitespace(
9102 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
9103 PyUnicode_GET_LENGTH(self), maxcount
9104 );
9105 case PyUnicode_4BYTE_KIND:
9106 return ucs4lib_split_whitespace(
9107 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
9108 PyUnicode_GET_LENGTH(self), maxcount
9109 );
9110 default:
9111 assert(0);
9112 return NULL;
9113 }
9114
9115 if (PyUnicode_READY(substring) == -1)
9116 return NULL;
9117
9118 kind1 = PyUnicode_KIND(self);
9119 kind2 = PyUnicode_KIND(substring);
9120 kind = kind1 > kind2 ? kind1 : kind2;
9121 buf1 = PyUnicode_DATA(self);
9122 buf2 = PyUnicode_DATA(substring);
9123 if (kind1 != kind)
9124 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9125 if (!buf1)
9126 return NULL;
9127 if (kind2 != kind)
9128 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9129 if (!buf2) {
9130 if (kind1 != kind) PyMem_Free(buf1);
9131 return NULL;
9132 }
9133 len1 = PyUnicode_GET_LENGTH(self);
9134 len2 = PyUnicode_GET_LENGTH(substring);
9135
9136 switch(kind) {
9137 case PyUnicode_1BYTE_KIND:
9138 out = ucs1lib_split(
9139 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9140 break;
9141 case PyUnicode_2BYTE_KIND:
9142 out = ucs2lib_split(
9143 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9144 break;
9145 case PyUnicode_4BYTE_KIND:
9146 out = ucs4lib_split(
9147 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9148 break;
9149 default:
9150 out = NULL;
9151 }
9152 if (kind1 != kind)
9153 PyMem_Free(buf1);
9154 if (kind2 != kind)
9155 PyMem_Free(buf2);
9156 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009157}
9158
Alexander Belopolsky40018472011-02-26 01:02:56 +00009159static PyObject *
9160rsplit(PyUnicodeObject *self,
9161 PyUnicodeObject *substring,
9162 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009163{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009164 int kind1, kind2, kind;
9165 void *buf1, *buf2;
9166 Py_ssize_t len1, len2;
9167 PyObject* out;
9168
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009169 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009170 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009171
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009172 if (PyUnicode_READY(self) == -1)
9173 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009174
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009175 if (substring == NULL)
9176 switch(PyUnicode_KIND(self)) {
9177 case PyUnicode_1BYTE_KIND:
9178 return ucs1lib_rsplit_whitespace(
9179 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9180 PyUnicode_GET_LENGTH(self), maxcount
9181 );
9182 case PyUnicode_2BYTE_KIND:
9183 return ucs2lib_rsplit_whitespace(
9184 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
9185 PyUnicode_GET_LENGTH(self), maxcount
9186 );
9187 case PyUnicode_4BYTE_KIND:
9188 return ucs4lib_rsplit_whitespace(
9189 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
9190 PyUnicode_GET_LENGTH(self), maxcount
9191 );
9192 default:
9193 assert(0);
9194 return NULL;
9195 }
9196
9197 if (PyUnicode_READY(substring) == -1)
9198 return NULL;
9199
9200 kind1 = PyUnicode_KIND(self);
9201 kind2 = PyUnicode_KIND(substring);
9202 kind = kind1 > kind2 ? kind1 : kind2;
9203 buf1 = PyUnicode_DATA(self);
9204 buf2 = PyUnicode_DATA(substring);
9205 if (kind1 != kind)
9206 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9207 if (!buf1)
9208 return NULL;
9209 if (kind2 != kind)
9210 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9211 if (!buf2) {
9212 if (kind1 != kind) PyMem_Free(buf1);
9213 return NULL;
9214 }
9215 len1 = PyUnicode_GET_LENGTH(self);
9216 len2 = PyUnicode_GET_LENGTH(substring);
9217
9218 switch(kind) {
9219 case PyUnicode_1BYTE_KIND:
9220 out = ucs1lib_rsplit(
9221 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9222 break;
9223 case PyUnicode_2BYTE_KIND:
9224 out = ucs2lib_rsplit(
9225 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9226 break;
9227 case PyUnicode_4BYTE_KIND:
9228 out = ucs4lib_rsplit(
9229 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9230 break;
9231 default:
9232 out = NULL;
9233 }
9234 if (kind1 != kind)
9235 PyMem_Free(buf1);
9236 if (kind2 != kind)
9237 PyMem_Free(buf2);
9238 return out;
9239}
9240
9241static Py_ssize_t
9242anylib_find(int kind, void *buf1, Py_ssize_t len1,
9243 void *buf2, Py_ssize_t len2, Py_ssize_t offset)
9244{
9245 switch(kind) {
9246 case PyUnicode_1BYTE_KIND:
9247 return ucs1lib_find(buf1, len1, buf2, len2, offset);
9248 case PyUnicode_2BYTE_KIND:
9249 return ucs2lib_find(buf1, len1, buf2, len2, offset);
9250 case PyUnicode_4BYTE_KIND:
9251 return ucs4lib_find(buf1, len1, buf2, len2, offset);
9252 }
9253 assert(0);
9254 return -1;
9255}
9256
9257static Py_ssize_t
9258anylib_count(int kind, void* sbuf, Py_ssize_t slen,
9259 void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
9260{
9261 switch(kind) {
9262 case PyUnicode_1BYTE_KIND:
9263 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
9264 case PyUnicode_2BYTE_KIND:
9265 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
9266 case PyUnicode_4BYTE_KIND:
9267 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
9268 }
9269 assert(0);
9270 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009271}
9272
Alexander Belopolsky40018472011-02-26 01:02:56 +00009273static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009274replace(PyObject *self, PyObject *str1,
9275 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009276{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009277 PyObject *u;
9278 char *sbuf = PyUnicode_DATA(self);
9279 char *buf1 = PyUnicode_DATA(str1);
9280 char *buf2 = PyUnicode_DATA(str2);
9281 int srelease = 0, release1 = 0, release2 = 0;
9282 int skind = PyUnicode_KIND(self);
9283 int kind1 = PyUnicode_KIND(str1);
9284 int kind2 = PyUnicode_KIND(str2);
9285 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
9286 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
9287 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009288
9289 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009290 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009291 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009292 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009293
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009294 if (skind < kind1)
9295 /* substring too wide to be present */
9296 goto nothing;
9297
9298 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00009299 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009300 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009301 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009302 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009303 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009304 /* replace characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009305 Py_UCS4 u1, u2, maxchar;
9306 int mayshrink, rkind;
9307 u1 = PyUnicode_READ_CHAR(str1, 0);
9308 if (!findchar(sbuf, PyUnicode_KIND(self),
9309 slen, u1, 1))
Thomas Wouters477c8d52006-05-27 19:21:47 +00009310 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009311 u2 = PyUnicode_READ_CHAR(str2, 0);
9312 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9313 /* Replacing u1 with u2 may cause a maxchar reduction in the
9314 result string. */
9315 mayshrink = maxchar > 127;
9316 if (u2 > maxchar) {
9317 maxchar = u2;
9318 mayshrink = 0;
9319 }
9320 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009321 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009322 goto error;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009323 if (PyUnicode_CopyCharacters(u, 0,
9324 (PyObject*)self, 0, slen) < 0)
9325 {
9326 Py_DECREF(u);
9327 return NULL;
9328 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009329 rkind = PyUnicode_KIND(u);
9330 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
9331 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009332 if (--maxcount < 0)
9333 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009334 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009335 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009336 if (mayshrink) {
9337 PyObject *tmp = u;
9338 u = PyUnicode_FromKindAndData(rkind, PyUnicode_DATA(tmp),
9339 PyUnicode_GET_LENGTH(tmp));
9340 Py_DECREF(tmp);
9341 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009342 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009343 int rkind = skind;
9344 char *res;
9345 if (kind1 < rkind) {
9346 /* widen substring */
9347 buf1 = _PyUnicode_AsKind(str1, rkind);
9348 if (!buf1) goto error;
9349 release1 = 1;
9350 }
9351 i = anylib_find(rkind, sbuf, slen, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009352 if (i < 0)
9353 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009354 if (rkind > kind2) {
9355 /* widen replacement */
9356 buf2 = _PyUnicode_AsKind(str2, rkind);
9357 if (!buf2) goto error;
9358 release2 = 1;
9359 }
9360 else if (rkind < kind2) {
9361 /* widen self and buf1 */
9362 rkind = kind2;
9363 if (release1) PyMem_Free(buf1);
9364 sbuf = _PyUnicode_AsKind(self, rkind);
9365 if (!sbuf) goto error;
9366 srelease = 1;
9367 buf1 = _PyUnicode_AsKind(str1, rkind);
9368 if (!buf1) goto error;
9369 release1 = 1;
9370 }
9371 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, slen));
9372 if (!res) {
9373 PyErr_NoMemory();
9374 goto error;
9375 }
9376 memcpy(res, sbuf, PyUnicode_KIND_SIZE(rkind, slen));
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009377 /* change everything in-place, starting with this one */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009378 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
9379 buf2,
9380 PyUnicode_KIND_SIZE(rkind, len2));
9381 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009382
9383 while ( --maxcount > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009384 i = anylib_find(rkind, sbuf+PyUnicode_KIND_SIZE(rkind, i),
9385 slen-i,
9386 buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009387 if (i == -1)
9388 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009389 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
9390 buf2,
9391 PyUnicode_KIND_SIZE(rkind, len2));
9392 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009393 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009394
9395 u = PyUnicode_FromKindAndData(rkind, res, slen);
9396 PyMem_Free(res);
9397 if (!u) goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009398 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009399 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009400
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009401 Py_ssize_t n, i, j, ires;
9402 Py_ssize_t product, new_size;
9403 int rkind = skind;
9404 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009405
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009406 if (kind1 < rkind) {
9407 buf1 = _PyUnicode_AsKind(str1, rkind);
9408 if (!buf1) goto error;
9409 release1 = 1;
9410 }
9411 n = anylib_count(rkind, sbuf, slen, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009412 if (n == 0)
9413 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009414 if (kind2 < rkind) {
9415 buf2 = _PyUnicode_AsKind(str2, rkind);
9416 if (!buf2) goto error;
9417 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009418 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009419 else if (kind2 > rkind) {
9420 rkind = kind2;
9421 sbuf = _PyUnicode_AsKind(self, rkind);
9422 if (!sbuf) goto error;
9423 srelease = 1;
9424 if (release1) PyMem_Free(buf1);
9425 buf1 = _PyUnicode_AsKind(str1, rkind);
9426 if (!buf1) goto error;
9427 release1 = 1;
9428 }
9429 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
9430 PyUnicode_GET_LENGTH(str1))); */
9431 product = n * (len2-len1);
9432 if ((product / (len2-len1)) != n) {
9433 PyErr_SetString(PyExc_OverflowError,
9434 "replace string is too long");
9435 goto error;
9436 }
9437 new_size = slen + product;
9438 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
9439 PyErr_SetString(PyExc_OverflowError,
9440 "replace string is too long");
9441 goto error;
9442 }
9443 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, new_size));
9444 if (!res)
9445 goto error;
9446 ires = i = 0;
9447 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009448 while (n-- > 0) {
9449 /* look for next match */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009450 j = anylib_find(rkind,
9451 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9452 slen-i, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009453 if (j == -1)
9454 break;
9455 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009456 /* copy unchanged part [i:j] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009457 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9458 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9459 PyUnicode_KIND_SIZE(rkind, j-i));
9460 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009461 }
9462 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009463 if (len2 > 0) {
9464 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9465 buf2,
9466 PyUnicode_KIND_SIZE(rkind, len2));
9467 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009468 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009469 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009470 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009471 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009472 /* copy tail [i:] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009473 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9474 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9475 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009476 } else {
9477 /* interleave */
9478 while (n > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009479 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9480 buf2,
9481 PyUnicode_KIND_SIZE(rkind, len2));
9482 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009483 if (--n <= 0)
9484 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009485 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9486 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9487 PyUnicode_KIND_SIZE(rkind, 1));
9488 ires++;
9489 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009490 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009491 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9492 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9493 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009494 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009495 u = PyUnicode_FromKindAndData(rkind, res, new_size);
Martin v. Löwis0b1d3482011-10-01 16:35:40 +02009496 PyMem_Free(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009497 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009498 if (srelease)
9499 PyMem_FREE(sbuf);
9500 if (release1)
9501 PyMem_FREE(buf1);
9502 if (release2)
9503 PyMem_FREE(buf2);
9504 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009505
Benjamin Peterson29060642009-01-31 22:14:21 +00009506 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00009507 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009508 if (srelease)
9509 PyMem_FREE(sbuf);
9510 if (release1)
9511 PyMem_FREE(buf1);
9512 if (release2)
9513 PyMem_FREE(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009514 if (PyUnicode_CheckExact(self)) {
9515 Py_INCREF(self);
9516 return (PyObject *) self;
9517 }
Victor Stinner034f6cf2011-09-30 02:26:44 +02009518 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009519 error:
9520 if (srelease && sbuf)
9521 PyMem_FREE(sbuf);
9522 if (release1 && buf1)
9523 PyMem_FREE(buf1);
9524 if (release2 && buf2)
9525 PyMem_FREE(buf2);
9526 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009527}
9528
9529/* --- Unicode Object Methods --------------------------------------------- */
9530
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009531PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009532 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009533\n\
9534Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009535characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009536
9537static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009538unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009539{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009540 return fixup(self, fixtitle);
9541}
9542
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009543PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009544 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009545\n\
9546Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00009547have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009548
9549static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009550unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009551{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009552 return fixup(self, fixcapitalize);
9553}
9554
#if 0
/* Disabled: this whole region is excluded from compilation by "#if 0". */
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Split `self` on whitespace, capitalize every word in place in the
   resulting list, and join the words again with the default separator.
   Returns NULL if any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    Py_ssize_t pos = 0;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL) {
        return NULL;
    }

    /* Capitalize each word */
    while (pos < PyList_GET_SIZE(words)) {
        PyObject *capitalized = fixup(
            (PyUnicodeObject *)PyList_GET_ITEM(words, pos), fixcapitalize);
        if (capitalized == NULL) {
            goto onError;
        }
        /* Swap the new word in for the old one. */
        Py_DECREF(PyList_GET_ITEM(words, pos));
        PyList_SET_ITEM(words, pos, capitalized);
        pos++;
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

  onError:
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
9592
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009593/* Argument converter. Coerces to a single unicode character */
9594
9595static int
9596convert_uc(PyObject *obj, void *addr)
9597{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009598 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009599 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009600
Benjamin Peterson14339b62009-01-31 16:36:08 +00009601 uniobj = PyUnicode_FromObject(obj);
9602 if (uniobj == NULL) {
9603 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009604 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009605 return 0;
9606 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009607 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009608 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009609 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009610 Py_DECREF(uniobj);
9611 return 0;
9612 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009613 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009614 Py_DECREF(uniobj);
9615 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009616}
9617
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009618PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009619 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009620\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009621Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009622done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009623
9624static PyObject *
9625unicode_center(PyUnicodeObject *self, PyObject *args)
9626{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009627 Py_ssize_t marg, left;
9628 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009629 Py_UCS4 fillchar = ' ';
9630
Victor Stinnere9a29352011-10-01 02:14:59 +02009631 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009632 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009633
Victor Stinnere9a29352011-10-01 02:14:59 +02009634 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009635 return NULL;
9636
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009637 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009638 Py_INCREF(self);
9639 return (PyObject*) self;
9640 }
9641
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009642 marg = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009643 left = marg / 2 + (marg & width & 1);
9644
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009645 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009646}
9647
Marc-André Lemburge5034372000-08-08 08:04:29 +00009648#if 0
9649
9650/* This code should go into some future Unicode collation support
9651 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00009652 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00009653
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009654/* speedy UTF-16 code point order comparison */
9655/* gleaned from: */
9656/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
9657
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009658static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009659{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009660 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00009661 0, 0, 0, 0, 0, 0, 0, 0,
9662 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009663 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009664};
9665
Guido van Rossumd57fd912000-03-10 22:53:23 +00009666static int
9667unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9668{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009669 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009670
Guido van Rossumd57fd912000-03-10 22:53:23 +00009671 Py_UNICODE *s1 = str1->str;
9672 Py_UNICODE *s2 = str2->str;
9673
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009674 len1 = str1->_base._base.length;
9675 len2 = str2->_base._base.length;
Tim Petersced69f82003-09-16 20:30:58 +00009676
Guido van Rossumd57fd912000-03-10 22:53:23 +00009677 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00009678 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009679
9680 c1 = *s1++;
9681 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00009682
Benjamin Peterson29060642009-01-31 22:14:21 +00009683 if (c1 > (1<<11) * 26)
9684 c1 += utf16Fixup[c1>>11];
9685 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009686 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009687 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00009688
9689 if (c1 != c2)
9690 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00009691
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009692 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009693 }
9694
9695 return (len1 < len2) ? -1 : (len1 != len2);
9696}
9697
Marc-André Lemburge5034372000-08-08 08:04:29 +00009698#else
9699
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009700/* This function assumes that str1 and str2 are readied by the caller. */
9701
Marc-André Lemburge5034372000-08-08 08:04:29 +00009702static int
9703unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9704{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009705 int kind1, kind2;
9706 void *data1, *data2;
9707 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009708
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009709 kind1 = PyUnicode_KIND(str1);
9710 kind2 = PyUnicode_KIND(str2);
9711 data1 = PyUnicode_DATA(str1);
9712 data2 = PyUnicode_DATA(str2);
9713 len1 = PyUnicode_GET_LENGTH(str1);
9714 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +00009715
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009716 for (i = 0; i < len1 && i < len2; ++i) {
9717 Py_UCS4 c1, c2;
9718 c1 = PyUnicode_READ(kind1, data1, i);
9719 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +00009720
9721 if (c1 != c2)
9722 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009723 }
9724
9725 return (len1 < len2) ? -1 : (len1 != len2);
9726}
9727
9728#endif
9729
Alexander Belopolsky40018472011-02-26 01:02:56 +00009730int
9731PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009732{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009733 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9734 if (PyUnicode_READY(left) == -1 ||
9735 PyUnicode_READY(right) == -1)
9736 return -1;
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009737 return unicode_compare((PyUnicodeObject *)left,
9738 (PyUnicodeObject *)right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009739 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009740 PyErr_Format(PyExc_TypeError,
9741 "Can't compare %.100s and %.100s",
9742 left->ob_type->tp_name,
9743 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009744 return -1;
9745}
9746
Martin v. Löwis5b222132007-06-10 09:51:05 +00009747int
9748PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
9749{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009750 Py_ssize_t i;
9751 int kind;
9752 void *data;
9753 Py_UCS4 chr;
9754
Victor Stinner910337b2011-10-03 03:20:16 +02009755 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009756 if (PyUnicode_READY(uni) == -1)
9757 return -1;
9758 kind = PyUnicode_KIND(uni);
9759 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +00009760 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009761 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
9762 if (chr != str[i])
9763 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00009764 /* This check keeps Python strings that end in '\0' from comparing equal
9765 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009766 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +00009767 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009768 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00009769 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009770 return 0;
9771}
9772
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009773
Benjamin Peterson29060642009-01-31 22:14:21 +00009774#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00009775 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009776
Alexander Belopolsky40018472011-02-26 01:02:56 +00009777PyObject *
9778PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009779{
9780 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009781
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009782 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9783 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009784 if (PyUnicode_READY(left) == -1 ||
9785 PyUnicode_READY(right) == -1)
9786 return NULL;
9787 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
9788 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009789 if (op == Py_EQ) {
9790 Py_INCREF(Py_False);
9791 return Py_False;
9792 }
9793 if (op == Py_NE) {
9794 Py_INCREF(Py_True);
9795 return Py_True;
9796 }
9797 }
9798 if (left == right)
9799 result = 0;
9800 else
9801 result = unicode_compare((PyUnicodeObject *)left,
9802 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009803
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009804 /* Convert the return value to a Boolean */
9805 switch (op) {
9806 case Py_EQ:
9807 v = TEST_COND(result == 0);
9808 break;
9809 case Py_NE:
9810 v = TEST_COND(result != 0);
9811 break;
9812 case Py_LE:
9813 v = TEST_COND(result <= 0);
9814 break;
9815 case Py_GE:
9816 v = TEST_COND(result >= 0);
9817 break;
9818 case Py_LT:
9819 v = TEST_COND(result == -1);
9820 break;
9821 case Py_GT:
9822 v = TEST_COND(result == 1);
9823 break;
9824 default:
9825 PyErr_BadArgument();
9826 return NULL;
9827 }
9828 Py_INCREF(v);
9829 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009830 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00009831
Brian Curtindfc80e32011-08-10 20:28:54 -05009832 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009833}
9834
Alexander Belopolsky40018472011-02-26 01:02:56 +00009835int
9836PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00009837{
Thomas Wouters477c8d52006-05-27 19:21:47 +00009838 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009839 int kind1, kind2, kind;
9840 void *buf1, *buf2;
9841 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009842 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009843
9844 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00009845 sub = PyUnicode_FromObject(element);
9846 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009847 PyErr_Format(PyExc_TypeError,
9848 "'in <string>' requires string as left operand, not %s",
9849 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009850 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009851 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009852 if (PyUnicode_READY(sub) == -1)
9853 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009854
Thomas Wouters477c8d52006-05-27 19:21:47 +00009855 str = PyUnicode_FromObject(container);
Victor Stinnere9a29352011-10-01 02:14:59 +02009856 if (!str || PyUnicode_READY(str) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009857 Py_DECREF(sub);
9858 return -1;
9859 }
9860
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009861 kind1 = PyUnicode_KIND(str);
9862 kind2 = PyUnicode_KIND(sub);
9863 kind = kind1 > kind2 ? kind1 : kind2;
9864 buf1 = PyUnicode_DATA(str);
9865 buf2 = PyUnicode_DATA(sub);
9866 if (kind1 != kind)
9867 buf1 = _PyUnicode_AsKind((PyObject*)str, kind);
9868 if (!buf1) {
9869 Py_DECREF(sub);
9870 return -1;
9871 }
9872 if (kind2 != kind)
9873 buf2 = _PyUnicode_AsKind((PyObject*)sub, kind);
9874 if (!buf2) {
9875 Py_DECREF(sub);
9876 if (kind1 != kind) PyMem_Free(buf1);
9877 return -1;
9878 }
9879 len1 = PyUnicode_GET_LENGTH(str);
9880 len2 = PyUnicode_GET_LENGTH(sub);
9881
9882 switch(kind) {
9883 case PyUnicode_1BYTE_KIND:
9884 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
9885 break;
9886 case PyUnicode_2BYTE_KIND:
9887 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
9888 break;
9889 case PyUnicode_4BYTE_KIND:
9890 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
9891 break;
9892 default:
9893 result = -1;
9894 assert(0);
9895 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009896
9897 Py_DECREF(str);
9898 Py_DECREF(sub);
9899
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009900 if (kind1 != kind)
9901 PyMem_Free(buf1);
9902 if (kind2 != kind)
9903 PyMem_Free(buf2);
9904
Guido van Rossum403d68b2000-03-13 15:55:09 +00009905 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009906}
9907
Guido van Rossumd57fd912000-03-10 22:53:23 +00009908/* Concat to string or Unicode object giving a new Unicode object. */
9909
Alexander Belopolsky40018472011-02-26 01:02:56 +00009910PyObject *
9911PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009912{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009913 PyObject *u = NULL, *v = NULL, *w;
9914 Py_UCS4 maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009915
9916 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009917 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009918 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009919 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009920 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009921 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009922 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009923
9924 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +02009925 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009926 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009927 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009928 }
Victor Stinnera464fc12011-10-02 20:39:30 +02009929 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009930 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009931 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009932 }
9933
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009934 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinnerff9e50f2011-09-28 22:17:19 +02009935 maxchar = Py_MAX(maxchar, PyUnicode_MAX_CHAR_VALUE(v));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009936
Guido van Rossumd57fd912000-03-10 22:53:23 +00009937 /* Concat the two Unicode strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009938 w = PyUnicode_New(
9939 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
9940 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009941 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009942 goto onError;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009943 if (PyUnicode_CopyCharacters(w, 0, u, 0, PyUnicode_GET_LENGTH(u)) < 0)
9944 goto onError;
Victor Stinner157f83f2011-09-28 21:41:31 +02009945 if (PyUnicode_CopyCharacters(w, PyUnicode_GET_LENGTH(u),
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009946 v, 0,
9947 PyUnicode_GET_LENGTH(v)) < 0)
9948 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009949 Py_DECREF(u);
9950 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009951 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009952
Benjamin Peterson29060642009-01-31 22:14:21 +00009953 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009954 Py_XDECREF(u);
9955 Py_XDECREF(v);
9956 return NULL;
9957}
9958
Walter Dörwald1ab83302007-05-18 17:15:44 +00009959void
Victor Stinner23e56682011-10-03 03:54:37 +02009960PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +00009961{
Victor Stinner23e56682011-10-03 03:54:37 +02009962 PyObject *left, *res;
9963
9964 if (p_left == NULL) {
9965 if (!PyErr_Occurred())
9966 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +00009967 return;
9968 }
Victor Stinner23e56682011-10-03 03:54:37 +02009969 left = *p_left;
9970 if (right == NULL || !PyUnicode_Check(left)) {
9971 if (!PyErr_Occurred())
9972 PyErr_BadInternalCall();
9973 goto error;
9974 }
9975
9976 if (PyUnicode_CheckExact(left) && left != unicode_empty
9977 && PyUnicode_CheckExact(right) && right != unicode_empty
9978 && unicode_resizable(left)
9979 && (_PyUnicode_KIND(right) <= _PyUnicode_KIND(left)
9980 || _PyUnicode_WSTR(left) != NULL))
9981 {
Victor Stinnerb8038952011-10-03 23:27:56 +02009982 Py_ssize_t left_len, right_len, new_len;
9983#ifdef Py_DEBUG
9984 Py_ssize_t copied;
9985#endif
Victor Stinner23e56682011-10-03 03:54:37 +02009986
Victor Stinner23e56682011-10-03 03:54:37 +02009987 if (PyUnicode_READY(left))
9988 goto error;
9989 if (PyUnicode_READY(right))
9990 goto error;
9991
9992 /* FIXME: support ascii+latin1, PyASCIIObject => PyCompactUnicodeObject */
9993 if (PyUnicode_MAX_CHAR_VALUE(right) <= PyUnicode_MAX_CHAR_VALUE(left))
9994 {
Victor Stinnerb8038952011-10-03 23:27:56 +02009995 left_len = PyUnicode_GET_LENGTH(left);
9996 right_len = PyUnicode_GET_LENGTH(right);
9997 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner23e56682011-10-03 03:54:37 +02009998 PyErr_SetString(PyExc_OverflowError,
9999 "strings are too large to concat");
10000 goto error;
10001 }
Victor Stinnerb8038952011-10-03 23:27:56 +020010002 new_len = left_len + right_len;
Victor Stinner23e56682011-10-03 03:54:37 +020010003
10004 /* Now we own the last reference to 'left', so we can resize it
10005 * in-place.
10006 */
10007 if (unicode_resize(&left, new_len) != 0) {
10008 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10009 * deallocated so it cannot be put back into
10010 * 'variable'. The MemoryError is raised when there
10011 * is no value in 'variable', which might (very
10012 * remotely) be a cause of incompatibilities.
10013 */
10014 goto error;
10015 }
10016 /* copy 'right' into the newly allocated area of 'left' */
Victor Stinnerb8038952011-10-03 23:27:56 +020010017#ifdef Py_DEBUG
10018 copied = PyUnicode_CopyCharacters(left, left_len,
Victor Stinner23e56682011-10-03 03:54:37 +020010019 right, 0,
Victor Stinnerb8038952011-10-03 23:27:56 +020010020 right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020010021 assert(0 <= copied);
Victor Stinnerb8038952011-10-03 23:27:56 +020010022#else
10023 PyUnicode_CopyCharacters(left, left_len, right, 0, right_len);
10024#endif
Victor Stinner23e56682011-10-03 03:54:37 +020010025 *p_left = left;
10026 return;
10027 }
10028 }
10029
10030 res = PyUnicode_Concat(left, right);
10031 if (res == NULL)
10032 goto error;
10033 Py_DECREF(left);
10034 *p_left = res;
10035 return;
10036
10037error:
10038 Py_DECREF(*p_left);
10039 *p_left = NULL;
Walter Dörwald1ab83302007-05-18 17:15:44 +000010040}
10041
10042void
10043PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10044{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010045 PyUnicode_Append(pleft, right);
10046 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010047}
10048
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010049PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010050 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010051\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010052Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010053string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010054interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010055
10056static PyObject *
10057unicode_count(PyUnicodeObject *self, PyObject *args)
10058{
10059 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010060 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010061 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010062 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010063 int kind1, kind2, kind;
10064 void *buf1, *buf2;
10065 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010066
Jesus Ceaac451502011-04-20 17:09:23 +020010067 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10068 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010069 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010070
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010071 kind1 = PyUnicode_KIND(self);
10072 kind2 = PyUnicode_KIND(substring);
10073 kind = kind1 > kind2 ? kind1 : kind2;
10074 buf1 = PyUnicode_DATA(self);
10075 buf2 = PyUnicode_DATA(substring);
10076 if (kind1 != kind)
10077 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
10078 if (!buf1) {
10079 Py_DECREF(substring);
10080 return NULL;
10081 }
10082 if (kind2 != kind)
10083 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
10084 if (!buf2) {
10085 Py_DECREF(substring);
10086 if (kind1 != kind) PyMem_Free(buf1);
10087 return NULL;
10088 }
10089 len1 = PyUnicode_GET_LENGTH(self);
10090 len2 = PyUnicode_GET_LENGTH(substring);
10091
10092 ADJUST_INDICES(start, end, len1);
10093 switch(kind) {
10094 case PyUnicode_1BYTE_KIND:
10095 iresult = ucs1lib_count(
10096 ((Py_UCS1*)buf1) + start, end - start,
10097 buf2, len2, PY_SSIZE_T_MAX
10098 );
10099 break;
10100 case PyUnicode_2BYTE_KIND:
10101 iresult = ucs2lib_count(
10102 ((Py_UCS2*)buf1) + start, end - start,
10103 buf2, len2, PY_SSIZE_T_MAX
10104 );
10105 break;
10106 case PyUnicode_4BYTE_KIND:
10107 iresult = ucs4lib_count(
10108 ((Py_UCS4*)buf1) + start, end - start,
10109 buf2, len2, PY_SSIZE_T_MAX
10110 );
10111 break;
10112 default:
10113 assert(0); iresult = 0;
10114 }
10115
10116 result = PyLong_FromSsize_t(iresult);
10117
10118 if (kind1 != kind)
10119 PyMem_Free(buf1);
10120 if (kind2 != kind)
10121 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010122
10123 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010124
Guido van Rossumd57fd912000-03-10 22:53:23 +000010125 return result;
10126}
10127
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010128PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010129 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010130\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010131Encode S using the codec registered for encoding. Default encoding\n\
10132is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010133handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010134a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10135'xmlcharrefreplace' as well as any other name registered with\n\
10136codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010137
10138static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +000010139unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010140{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010141 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010142 char *encoding = NULL;
10143 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010144
Benjamin Peterson308d6372009-09-18 21:42:35 +000010145 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10146 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010147 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +000010148 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010149}
10150
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010151PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010152 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010153\n\
10154Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010155If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010156
10157static PyObject*
10158unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
10159{
10160 Py_UNICODE *e;
10161 Py_UNICODE *p;
10162 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010163 Py_UNICODE *qe;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010164 Py_ssize_t i, j, incr, wstr_length;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010165 PyUnicodeObject *u;
10166 int tabsize = 8;
10167
10168 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010169 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010170
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010171 if (PyUnicode_AsUnicodeAndSize((PyObject *)self, &wstr_length) == NULL)
10172 return NULL;
10173
Thomas Wouters7e474022000-07-16 12:04:32 +000010174 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010175 i = 0; /* chars up to and including most recent \n or \r */
10176 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010177 e = _PyUnicode_WSTR(self) + wstr_length; /* end of input */
10178 for (p = _PyUnicode_WSTR(self); p < e; p++)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010179 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010180 if (tabsize > 0) {
10181 incr = tabsize - (j % tabsize); /* cannot overflow */
10182 if (j > PY_SSIZE_T_MAX - incr)
10183 goto overflow1;
10184 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010185 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010186 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010187 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000010188 if (j > PY_SSIZE_T_MAX - 1)
10189 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010190 j++;
10191 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010192 if (i > PY_SSIZE_T_MAX - j)
10193 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010194 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010195 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010196 }
10197 }
10198
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010199 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +000010200 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +000010201
Guido van Rossumd57fd912000-03-10 22:53:23 +000010202 /* Second pass: create output string and fill it */
10203 u = _PyUnicode_New(i + j);
10204 if (!u)
10205 return NULL;
10206
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010207 j = 0; /* same as in first pass */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010208 q = _PyUnicode_WSTR(u); /* next output char */
10209 qe = _PyUnicode_WSTR(u) + PyUnicode_GET_SIZE(u); /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +000010210
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010211 for (p = _PyUnicode_WSTR(self); p < e; p++)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010212 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010213 if (tabsize > 0) {
10214 i = tabsize - (j % tabsize);
10215 j += i;
10216 while (i--) {
10217 if (q >= qe)
10218 goto overflow2;
10219 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010220 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010221 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010222 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010223 else {
10224 if (q >= qe)
10225 goto overflow2;
10226 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010227 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010228 if (*p == '\n' || *p == '\r')
10229 j = 0;
10230 }
10231
Victor Stinner1b4f9ce2011-10-03 13:28:14 +020010232 if (_PyUnicode_READY_REPLACE(&u)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010233 Py_DECREF(u);
10234 return NULL;
10235 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010236 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010237
10238 overflow2:
10239 Py_DECREF(u);
10240 overflow1:
10241 PyErr_SetString(PyExc_OverflowError, "new string is too long");
10242 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010243}
10244
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010245PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010246 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010247\n\
10248Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010249such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010250arguments start and end are interpreted as in slice notation.\n\
10251\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010252Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010253
10254static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010255unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010256{
Jesus Ceaac451502011-04-20 17:09:23 +020010257 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010258 Py_ssize_t start;
10259 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010260 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010261
Jesus Ceaac451502011-04-20 17:09:23 +020010262 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
10263 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010264 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010265
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010266 if (PyUnicode_READY(self) == -1)
10267 return NULL;
10268 if (PyUnicode_READY(substring) == -1)
10269 return NULL;
10270
10271 result = any_find_slice(
10272 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
10273 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010274 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010275
10276 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010277
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010278 if (result == -2)
10279 return NULL;
10280
Christian Heimes217cfd12007-12-02 14:31:20 +000010281 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010282}
10283
10284static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010285unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010286{
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010287 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
10288 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010289 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010290 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010291}
10292
Guido van Rossumc2504932007-09-18 19:42:40 +000010293/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010010294 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000010295static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +000010296unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010297{
Guido van Rossumc2504932007-09-18 19:42:40 +000010298 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010010299 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010300
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010301 if (_PyUnicode_HASH(self) != -1)
10302 return _PyUnicode_HASH(self);
10303 if (PyUnicode_READY(self) == -1)
10304 return -1;
10305 len = PyUnicode_GET_LENGTH(self);
10306
10307 /* The hash function as a macro, gets expanded three times below. */
10308#define HASH(P) \
10309 x = (Py_uhash_t)*P << 7; \
10310 while (--len >= 0) \
10311 x = (1000003*x) ^ (Py_uhash_t)*P++;
10312
10313 switch (PyUnicode_KIND(self)) {
10314 case PyUnicode_1BYTE_KIND: {
10315 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
10316 HASH(c);
10317 break;
10318 }
10319 case PyUnicode_2BYTE_KIND: {
10320 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
10321 HASH(s);
10322 break;
10323 }
10324 default: {
10325 Py_UCS4 *l;
10326 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
10327 "Impossible switch case in unicode_hash");
10328 l = PyUnicode_4BYTE_DATA(self);
10329 HASH(l);
10330 break;
10331 }
10332 }
10333 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
10334
Guido van Rossumc2504932007-09-18 19:42:40 +000010335 if (x == -1)
10336 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010337 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010338 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010339}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010340#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000010341
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010342PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010343 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010344\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010345Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010346
10347static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010348unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010349{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010350 Py_ssize_t result;
Jesus Ceaac451502011-04-20 17:09:23 +020010351 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010352 Py_ssize_t start;
10353 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010354
Jesus Ceaac451502011-04-20 17:09:23 +020010355 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
10356 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010357 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010358
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010359 if (PyUnicode_READY(self) == -1)
10360 return NULL;
10361 if (PyUnicode_READY(substring) == -1)
10362 return NULL;
10363
10364 result = any_find_slice(
10365 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
10366 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010367 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010368
10369 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010370
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010371 if (result == -2)
10372 return NULL;
10373
Guido van Rossumd57fd912000-03-10 22:53:23 +000010374 if (result < 0) {
10375 PyErr_SetString(PyExc_ValueError, "substring not found");
10376 return NULL;
10377 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010378
Christian Heimes217cfd12007-12-02 14:31:20 +000010379 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010380}
10381
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010382PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010383 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010384\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010385Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010386at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010387
10388static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010389unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010390{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010391 Py_ssize_t i, length;
10392 int kind;
10393 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010394 int cased;
10395
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010396 if (PyUnicode_READY(self) == -1)
10397 return NULL;
10398 length = PyUnicode_GET_LENGTH(self);
10399 kind = PyUnicode_KIND(self);
10400 data = PyUnicode_DATA(self);
10401
Guido van Rossumd57fd912000-03-10 22:53:23 +000010402 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010403 if (length == 1)
10404 return PyBool_FromLong(
10405 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010406
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010407 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010408 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010409 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010410
Guido van Rossumd57fd912000-03-10 22:53:23 +000010411 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010412 for (i = 0; i < length; i++) {
10413 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010414
Benjamin Peterson29060642009-01-31 22:14:21 +000010415 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
10416 return PyBool_FromLong(0);
10417 else if (!cased && Py_UNICODE_ISLOWER(ch))
10418 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010419 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010420 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010421}
10422
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010423PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010424 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010425\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010426Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010427at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010428
10429static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010430unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010431{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010432 Py_ssize_t i, length;
10433 int kind;
10434 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010435 int cased;
10436
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010437 if (PyUnicode_READY(self) == -1)
10438 return NULL;
10439 length = PyUnicode_GET_LENGTH(self);
10440 kind = PyUnicode_KIND(self);
10441 data = PyUnicode_DATA(self);
10442
Guido van Rossumd57fd912000-03-10 22:53:23 +000010443 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010444 if (length == 1)
10445 return PyBool_FromLong(
10446 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010447
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010448 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010449 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010450 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010451
Guido van Rossumd57fd912000-03-10 22:53:23 +000010452 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010453 for (i = 0; i < length; i++) {
10454 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010455
Benjamin Peterson29060642009-01-31 22:14:21 +000010456 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
10457 return PyBool_FromLong(0);
10458 else if (!cased && Py_UNICODE_ISUPPER(ch))
10459 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010460 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010461 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010462}
10463
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010464PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010465 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010466\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010467Return True if S is a titlecased string and there is at least one\n\
10468character in S, i.e. upper- and titlecase characters may only\n\
10469follow uncased characters and lowercase characters only cased ones.\n\
10470Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010471
10472static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010473unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010474{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010475 Py_ssize_t i, length;
10476 int kind;
10477 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010478 int cased, previous_is_cased;
10479
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010480 if (PyUnicode_READY(self) == -1)
10481 return NULL;
10482 length = PyUnicode_GET_LENGTH(self);
10483 kind = PyUnicode_KIND(self);
10484 data = PyUnicode_DATA(self);
10485
Guido van Rossumd57fd912000-03-10 22:53:23 +000010486 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010487 if (length == 1) {
10488 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10489 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
10490 (Py_UNICODE_ISUPPER(ch) != 0));
10491 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010492
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010493 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010494 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010495 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010496
Guido van Rossumd57fd912000-03-10 22:53:23 +000010497 cased = 0;
10498 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010499 for (i = 0; i < length; i++) {
10500 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010501
Benjamin Peterson29060642009-01-31 22:14:21 +000010502 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
10503 if (previous_is_cased)
10504 return PyBool_FromLong(0);
10505 previous_is_cased = 1;
10506 cased = 1;
10507 }
10508 else if (Py_UNICODE_ISLOWER(ch)) {
10509 if (!previous_is_cased)
10510 return PyBool_FromLong(0);
10511 previous_is_cased = 1;
10512 cased = 1;
10513 }
10514 else
10515 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010516 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010517 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010518}
10519
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010520PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010521 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010522\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010523Return True if all characters in S are whitespace\n\
10524and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010525
10526static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010527unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010528{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010529 Py_ssize_t i, length;
10530 int kind;
10531 void *data;
10532
10533 if (PyUnicode_READY(self) == -1)
10534 return NULL;
10535 length = PyUnicode_GET_LENGTH(self);
10536 kind = PyUnicode_KIND(self);
10537 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010538
Guido van Rossumd57fd912000-03-10 22:53:23 +000010539 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010540 if (length == 1)
10541 return PyBool_FromLong(
10542 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010543
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010544 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010545 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010546 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010547
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010548 for (i = 0; i < length; i++) {
10549 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010550 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010551 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010552 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010553 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010554}
10555
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010556PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010557 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010558\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010559Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010560and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010561
10562static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010563unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010564{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010565 Py_ssize_t i, length;
10566 int kind;
10567 void *data;
10568
10569 if (PyUnicode_READY(self) == -1)
10570 return NULL;
10571 length = PyUnicode_GET_LENGTH(self);
10572 kind = PyUnicode_KIND(self);
10573 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010574
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010575 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010576 if (length == 1)
10577 return PyBool_FromLong(
10578 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010579
10580 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010581 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010582 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010583
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010584 for (i = 0; i < length; i++) {
10585 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010586 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010587 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010588 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010589}
10590
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010591PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010592 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010593\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010594Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010595and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010596
10597static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010598unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010599{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010600 int kind;
10601 void *data;
10602 Py_ssize_t len, i;
10603
10604 if (PyUnicode_READY(self) == -1)
10605 return NULL;
10606
10607 kind = PyUnicode_KIND(self);
10608 data = PyUnicode_DATA(self);
10609 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010610
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010611 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010612 if (len == 1) {
10613 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10614 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
10615 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010616
10617 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010618 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010619 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010620
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010621 for (i = 0; i < len; i++) {
10622 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010623 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010624 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010625 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010626 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010627}
10628
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010629PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010630 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010631\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010632Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010633False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010634
10635static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010636unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010637{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010638 Py_ssize_t i, length;
10639 int kind;
10640 void *data;
10641
10642 if (PyUnicode_READY(self) == -1)
10643 return NULL;
10644 length = PyUnicode_GET_LENGTH(self);
10645 kind = PyUnicode_KIND(self);
10646 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010647
Guido van Rossumd57fd912000-03-10 22:53:23 +000010648 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010649 if (length == 1)
10650 return PyBool_FromLong(
10651 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010652
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010653 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010654 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010655 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010656
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010657 for (i = 0; i < length; i++) {
10658 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010659 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010660 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010661 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010662}
10663
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010664PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010665 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010666\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010667Return True if all characters in S are digits\n\
10668and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010669
10670static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010671unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010672{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010673 Py_ssize_t i, length;
10674 int kind;
10675 void *data;
10676
10677 if (PyUnicode_READY(self) == -1)
10678 return NULL;
10679 length = PyUnicode_GET_LENGTH(self);
10680 kind = PyUnicode_KIND(self);
10681 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010682
Guido van Rossumd57fd912000-03-10 22:53:23 +000010683 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010684 if (length == 1) {
10685 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10686 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
10687 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010688
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010689 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010690 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010691 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010692
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010693 for (i = 0; i < length; i++) {
10694 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010695 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010696 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010697 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010698}
10699
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010700PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010701 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010702\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010703Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010704False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010705
10706static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010707unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010708{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010709 Py_ssize_t i, length;
10710 int kind;
10711 void *data;
10712
10713 if (PyUnicode_READY(self) == -1)
10714 return NULL;
10715 length = PyUnicode_GET_LENGTH(self);
10716 kind = PyUnicode_KIND(self);
10717 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010718
Guido van Rossumd57fd912000-03-10 22:53:23 +000010719 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010720 if (length == 1)
10721 return PyBool_FromLong(
10722 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010723
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010724 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010725 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010726 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010727
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010728 for (i = 0; i < length; i++) {
10729 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010730 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010731 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010732 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010733}
10734
Martin v. Löwis47383402007-08-15 07:32:56 +000010735int
10736PyUnicode_IsIdentifier(PyObject *self)
10737{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010738 int kind;
10739 void *data;
10740 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010741 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000010742
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010743 if (PyUnicode_READY(self) == -1) {
10744 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000010745 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010746 }
10747
10748 /* Special case for empty strings */
10749 if (PyUnicode_GET_LENGTH(self) == 0)
10750 return 0;
10751 kind = PyUnicode_KIND(self);
10752 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000010753
10754 /* PEP 3131 says that the first character must be in
10755 XID_Start and subsequent characters in XID_Continue,
10756 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000010757 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000010758 letters, digits, underscore). However, given the current
10759 definition of XID_Start and XID_Continue, it is sufficient
10760 to check just for these, except that _ must be allowed
10761 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010762 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050010763 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000010764 return 0;
10765
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040010766 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010767 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010768 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000010769 return 1;
10770}
10771
10772PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010773 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000010774\n\
10775Return True if S is a valid identifier according\n\
10776to the language definition.");
10777
10778static PyObject*
10779unicode_isidentifier(PyObject *self)
10780{
10781 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
10782}
10783
Georg Brandl559e5d72008-06-11 18:37:52 +000010784PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010785 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000010786\n\
10787Return True if all characters in S are considered\n\
10788printable in repr() or S is empty, False otherwise.");
10789
10790static PyObject*
10791unicode_isprintable(PyObject *self)
10792{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010793 Py_ssize_t i, length;
10794 int kind;
10795 void *data;
10796
10797 if (PyUnicode_READY(self) == -1)
10798 return NULL;
10799 length = PyUnicode_GET_LENGTH(self);
10800 kind = PyUnicode_KIND(self);
10801 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000010802
10803 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010804 if (length == 1)
10805 return PyBool_FromLong(
10806 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000010807
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010808 for (i = 0; i < length; i++) {
10809 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000010810 Py_RETURN_FALSE;
10811 }
10812 }
10813 Py_RETURN_TRUE;
10814}
10815
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010816PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000010817 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010818\n\
10819Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000010820iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010821
10822static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010823unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010824{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010825 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010826}
10827
Martin v. Löwis18e16552006-02-15 17:27:45 +000010828static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +000010829unicode_length(PyUnicodeObject *self)
10830{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010831 if (PyUnicode_READY(self) == -1)
10832 return -1;
10833 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010834}
10835
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010836PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010837 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010838\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000010839Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010840done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010841
10842static PyObject *
10843unicode_ljust(PyUnicodeObject *self, PyObject *args)
10844{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010845 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010846 Py_UCS4 fillchar = ' ';
10847
10848 if (PyUnicode_READY(self) == -1)
10849 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010850
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010851 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010852 return NULL;
10853
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010854 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010855 Py_INCREF(self);
10856 return (PyObject*) self;
10857 }
10858
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010859 return (PyObject*) pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010860}
10861
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010862PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010863 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010864\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010865Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010866
10867static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010868unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010869{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010870 return fixup(self, fixlower);
10871}
10872
/* Selectors for which end(s) of a string a strip operation trims. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* One entry per selector above, in the same order.  The entries look
   like argument-parsing format strings whose part after the colon is the
   method name -- confirm against the callers. */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Bare method name for selector i: the entry minus its three-character
   "|O:" prefix. */
#define STRIPNAME(i) (stripformat[i]+3)
10881
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010882/* externally visible for str.strip(unicode) */
10883PyObject *
10884_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
10885{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010886 void *data;
10887 int kind;
10888 Py_ssize_t i, j, len;
10889 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010890
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010891 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
10892 return NULL;
10893
10894 kind = PyUnicode_KIND(self);
10895 data = PyUnicode_DATA(self);
10896 len = PyUnicode_GET_LENGTH(self);
10897 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
10898 PyUnicode_DATA(sepobj),
10899 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010900
Benjamin Peterson14339b62009-01-31 16:36:08 +000010901 i = 0;
10902 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010903 while (i < len &&
10904 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010905 i++;
10906 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010907 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010908
Benjamin Peterson14339b62009-01-31 16:36:08 +000010909 j = len;
10910 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010911 do {
10912 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010913 } while (j >= i &&
10914 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000010915 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010916 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010917
Victor Stinner12bab6d2011-10-01 01:53:49 +020010918 return PyUnicode_Substring((PyObject*)self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010919}
10920
10921PyObject*
10922PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
10923{
10924 unsigned char *data;
10925 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020010926 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010927
Victor Stinnerde636f32011-10-01 03:55:54 +020010928 if (PyUnicode_READY(self) == -1)
10929 return NULL;
10930
10931 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
10932
Victor Stinner12bab6d2011-10-01 01:53:49 +020010933 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010934 {
Victor Stinner12bab6d2011-10-01 01:53:49 +020010935 if (PyUnicode_CheckExact(self)) {
10936 Py_INCREF(self);
10937 return self;
10938 }
10939 else
10940 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010941 }
10942
Victor Stinner12bab6d2011-10-01 01:53:49 +020010943 length = end - start;
10944 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010945 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010946
Victor Stinnerde636f32011-10-01 03:55:54 +020010947 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020010948 PyErr_SetString(PyExc_IndexError, "string index out of range");
10949 return NULL;
10950 }
10951
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010952 kind = PyUnicode_KIND(self);
10953 data = PyUnicode_1BYTE_DATA(self);
Victor Stinner034f6cf2011-09-30 02:26:44 +020010954 return PyUnicode_FromKindAndData(kind,
10955 data + PyUnicode_KIND_SIZE(kind, start),
Victor Stinner12bab6d2011-10-01 01:53:49 +020010956 length);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010957}
Guido van Rossumd57fd912000-03-10 22:53:23 +000010958
10959static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010960do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010961{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010962 int kind;
10963 void *data;
10964 Py_ssize_t len, i, j;
10965
10966 if (PyUnicode_READY(self) == -1)
10967 return NULL;
10968
10969 kind = PyUnicode_KIND(self);
10970 data = PyUnicode_DATA(self);
10971 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010972
Benjamin Peterson14339b62009-01-31 16:36:08 +000010973 i = 0;
10974 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010975 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010976 i++;
10977 }
10978 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010979
Benjamin Peterson14339b62009-01-31 16:36:08 +000010980 j = len;
10981 if (striptype != LEFTSTRIP) {
10982 do {
10983 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010984 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000010985 j++;
10986 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010987
Victor Stinner12bab6d2011-10-01 01:53:49 +020010988 return PyUnicode_Substring((PyObject*)self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010989}
10990
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010991
10992static PyObject *
10993do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
10994{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010995 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010996
Benjamin Peterson14339b62009-01-31 16:36:08 +000010997 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
10998 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010999
Benjamin Peterson14339b62009-01-31 16:36:08 +000011000 if (sep != NULL && sep != Py_None) {
11001 if (PyUnicode_Check(sep))
11002 return _PyUnicode_XStrip(self, striptype, sep);
11003 else {
11004 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011005 "%s arg must be None or str",
11006 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011007 return NULL;
11008 }
11009 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011010
Benjamin Peterson14339b62009-01-31 16:36:08 +000011011 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011012}
11013
11014
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011015PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011016 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011017\n\
11018Return a copy of the string S with leading and trailing\n\
11019whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011020If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011021
11022static PyObject *
11023unicode_strip(PyUnicodeObject *self, PyObject *args)
11024{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011025 if (PyTuple_GET_SIZE(args) == 0)
11026 return do_strip(self, BOTHSTRIP); /* Common case */
11027 else
11028 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011029}
11030
11031
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011032PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011033 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011034\n\
11035Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011036If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011037
11038static PyObject *
11039unicode_lstrip(PyUnicodeObject *self, PyObject *args)
11040{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011041 if (PyTuple_GET_SIZE(args) == 0)
11042 return do_strip(self, LEFTSTRIP); /* Common case */
11043 else
11044 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011045}
11046
11047
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011048PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011049 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011050\n\
11051Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011052If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011053
11054static PyObject *
11055unicode_rstrip(PyUnicodeObject *self, PyObject *args)
11056{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011057 if (PyTuple_GET_SIZE(args) == 0)
11058 return do_strip(self, RIGHTSTRIP); /* Common case */
11059 else
11060 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011061}
11062
11063
Guido van Rossumd57fd912000-03-10 22:53:23 +000011064static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +000011065unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011066{
11067 PyUnicodeObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011068 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011069
Georg Brandl222de0f2009-04-12 12:01:50 +000011070 if (len < 1) {
11071 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020011072 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000011073 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011074
Tim Peters7a29bd52001-09-12 03:03:31 +000011075 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011076 /* no repeat, return original string */
11077 Py_INCREF(str);
11078 return (PyObject*) str;
11079 }
Tim Peters8f422462000-09-09 06:13:41 +000011080
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011081 if (PyUnicode_READY(str) == -1)
11082 return NULL;
11083
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011084 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011085 PyErr_SetString(PyExc_OverflowError,
11086 "repeated string is too long");
11087 return NULL;
11088 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011089 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011090
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011091 u = (PyUnicodeObject *)PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011092 if (!u)
11093 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011094 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011095
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011096 if (PyUnicode_GET_LENGTH(str) == 1) {
11097 const int kind = PyUnicode_KIND(str);
11098 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
11099 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011100 if (kind == PyUnicode_1BYTE_KIND)
11101 memset(to, (unsigned char)fill_char, len);
11102 else {
11103 for (n = 0; n < len; ++n)
11104 PyUnicode_WRITE(kind, to, n, fill_char);
11105 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011106 }
11107 else {
11108 /* number of characters copied this far */
11109 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
11110 const Py_ssize_t char_size = PyUnicode_CHARACTER_SIZE(str);
11111 char *to = (char *) PyUnicode_DATA(u);
11112 Py_MEMCPY(to, PyUnicode_DATA(str),
11113 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011114 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011115 n = (done <= nchars-done) ? done : nchars-done;
11116 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011117 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011118 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011119 }
11120
11121 return (PyObject*) u;
11122}
11123
Alexander Belopolsky40018472011-02-26 01:02:56 +000011124PyObject *
11125PyUnicode_Replace(PyObject *obj,
11126 PyObject *subobj,
11127 PyObject *replobj,
11128 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011129{
11130 PyObject *self;
11131 PyObject *str1;
11132 PyObject *str2;
11133 PyObject *result;
11134
11135 self = PyUnicode_FromObject(obj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011136 if (self == NULL || PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011137 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011138 str1 = PyUnicode_FromObject(subobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011139 if (str1 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011140 Py_DECREF(self);
11141 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011142 }
11143 str2 = PyUnicode_FromObject(replobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011144 if (str2 == NULL || PyUnicode_READY(str2)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011145 Py_DECREF(self);
11146 Py_DECREF(str1);
11147 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011148 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011149 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011150 Py_DECREF(self);
11151 Py_DECREF(str1);
11152 Py_DECREF(str2);
11153 return result;
11154}
11155
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011156PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011157 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011158\n\
11159Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011160old replaced by new. If the optional argument count is\n\
11161given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011162
11163static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011164unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011165{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011166 PyObject *str1;
11167 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011168 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011169 PyObject *result;
11170
Martin v. Löwis18e16552006-02-15 17:27:45 +000011171 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011172 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011173 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011174 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011175 str1 = PyUnicode_FromObject(str1);
11176 if (str1 == NULL || PyUnicode_READY(str1) == -1)
11177 return NULL;
11178 str2 = PyUnicode_FromObject(str2);
Victor Stinnere9a29352011-10-01 02:14:59 +020011179 if (str2 == NULL || PyUnicode_READY(str2) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011180 Py_DECREF(str1);
11181 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011182 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011183
11184 result = replace(self, str1, str2, maxcount);
11185
11186 Py_DECREF(str1);
11187 Py_DECREF(str2);
11188 return result;
11189}
11190
Alexander Belopolsky40018472011-02-26 01:02:56 +000011191static PyObject *
11192unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011193{
Walter Dörwald79e913e2007-05-12 11:08:06 +000011194 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011195 Py_ssize_t isize;
11196 Py_ssize_t osize, squote, dquote, i, o;
11197 Py_UCS4 max, quote;
11198 int ikind, okind;
11199 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000011200
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011201 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000011202 return NULL;
11203
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011204 isize = PyUnicode_GET_LENGTH(unicode);
11205 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011206
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011207 /* Compute length of output, quote characters, and
11208 maximum character */
11209 osize = 2; /* quotes */
11210 max = 127;
11211 squote = dquote = 0;
11212 ikind = PyUnicode_KIND(unicode);
11213 for (i = 0; i < isize; i++) {
11214 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11215 switch (ch) {
11216 case '\'': squote++; osize++; break;
11217 case '"': dquote++; osize++; break;
11218 case '\\': case '\t': case '\r': case '\n':
11219 osize += 2; break;
11220 default:
11221 /* Fast-path ASCII */
11222 if (ch < ' ' || ch == 0x7f)
11223 osize += 4; /* \xHH */
11224 else if (ch < 0x7f)
11225 osize++;
11226 else if (Py_UNICODE_ISPRINTABLE(ch)) {
11227 osize++;
11228 max = ch > max ? ch : max;
11229 }
11230 else if (ch < 0x100)
11231 osize += 4; /* \xHH */
11232 else if (ch < 0x10000)
11233 osize += 6; /* \uHHHH */
11234 else
11235 osize += 10; /* \uHHHHHHHH */
11236 }
11237 }
11238
11239 quote = '\'';
11240 if (squote) {
11241 if (dquote)
11242 /* Both squote and dquote present. Use squote,
11243 and escape them */
11244 osize += squote;
11245 else
11246 quote = '"';
11247 }
11248
11249 repr = PyUnicode_New(osize, max);
11250 if (repr == NULL)
11251 return NULL;
11252 okind = PyUnicode_KIND(repr);
11253 odata = PyUnicode_DATA(repr);
11254
11255 PyUnicode_WRITE(okind, odata, 0, quote);
11256 PyUnicode_WRITE(okind, odata, osize-1, quote);
11257
11258 for (i = 0, o = 1; i < isize; i++) {
11259 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011260
11261 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011262 if ((ch == quote) || (ch == '\\')) {
11263 PyUnicode_WRITE(okind, odata, o++, '\\');
11264 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011265 continue;
11266 }
11267
Benjamin Peterson29060642009-01-31 22:14:21 +000011268 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011269 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011270 PyUnicode_WRITE(okind, odata, o++, '\\');
11271 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011272 }
11273 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011274 PyUnicode_WRITE(okind, odata, o++, '\\');
11275 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011276 }
11277 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011278 PyUnicode_WRITE(okind, odata, o++, '\\');
11279 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011280 }
11281
11282 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011283 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011284 PyUnicode_WRITE(okind, odata, o++, '\\');
11285 PyUnicode_WRITE(okind, odata, o++, 'x');
11286 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
11287 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011288 }
11289
Georg Brandl559e5d72008-06-11 18:37:52 +000011290 /* Copy ASCII characters as-is */
11291 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011292 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011293 }
11294
Benjamin Peterson29060642009-01-31 22:14:21 +000011295 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000011296 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011297 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000011298 (categories Z* and C* except ASCII space)
11299 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011300 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011301 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011302 if (ch <= 0xff) {
11303 PyUnicode_WRITE(okind, odata, o++, '\\');
11304 PyUnicode_WRITE(okind, odata, o++, 'x');
11305 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
11306 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011307 }
11308 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011309 else if (ch >= 0x10000) {
11310 PyUnicode_WRITE(okind, odata, o++, '\\');
11311 PyUnicode_WRITE(okind, odata, o++, 'U');
11312 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 28) & 0xF]);
11313 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 24) & 0xF]);
11314 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 20) & 0xF]);
11315 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 16) & 0xF]);
11316 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
11317 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
11318 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
11319 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011320 }
11321 /* Map 16-bit characters to '\uxxxx' */
11322 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011323 PyUnicode_WRITE(okind, odata, o++, '\\');
11324 PyUnicode_WRITE(okind, odata, o++, 'u');
11325 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
11326 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
11327 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
11328 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011329 }
11330 }
11331 /* Copy characters as-is */
11332 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011333 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011334 }
11335 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000011336 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011337 /* Closing quote already added at the beginning */
Walter Dörwald79e913e2007-05-12 11:08:06 +000011338 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011339}
11340
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011341PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011342 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011343\n\
11344Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011345such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011346arguments start and end are interpreted as in slice notation.\n\
11347\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011348Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011349
11350static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011351unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011352{
Jesus Ceaac451502011-04-20 17:09:23 +020011353 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011354 Py_ssize_t start;
11355 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011356 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011357
Jesus Ceaac451502011-04-20 17:09:23 +020011358 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
11359 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011360 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011361
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011362 if (PyUnicode_READY(self) == -1)
11363 return NULL;
11364 if (PyUnicode_READY(substring) == -1)
11365 return NULL;
11366
11367 result = any_find_slice(
11368 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
11369 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011370 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011371
11372 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011373
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011374 if (result == -2)
11375 return NULL;
11376
Christian Heimes217cfd12007-12-02 14:31:20 +000011377 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011378}
11379
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011380PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011381 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011382\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011383Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011384
11385static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011386unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011387{
Jesus Ceaac451502011-04-20 17:09:23 +020011388 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011389 Py_ssize_t start;
11390 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011391 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011392
Jesus Ceaac451502011-04-20 17:09:23 +020011393 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
11394 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011395 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011396
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011397 if (PyUnicode_READY(self) == -1)
11398 return NULL;
11399 if (PyUnicode_READY(substring) == -1)
11400 return NULL;
11401
11402 result = any_find_slice(
11403 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
11404 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011405 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011406
11407 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011408
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011409 if (result == -2)
11410 return NULL;
11411
Guido van Rossumd57fd912000-03-10 22:53:23 +000011412 if (result < 0) {
11413 PyErr_SetString(PyExc_ValueError, "substring not found");
11414 return NULL;
11415 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011416
Christian Heimes217cfd12007-12-02 14:31:20 +000011417 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011418}
11419
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011420PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011421 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011422\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011423Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011424done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011425
11426static PyObject *
11427unicode_rjust(PyUnicodeObject *self, PyObject *args)
11428{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011429 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011430 Py_UCS4 fillchar = ' ';
11431
Victor Stinnere9a29352011-10-01 02:14:59 +020011432 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011433 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011434
Victor Stinnere9a29352011-10-01 02:14:59 +020011435 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011436 return NULL;
11437
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011438 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011439 Py_INCREF(self);
11440 return (PyObject*) self;
11441 }
11442
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011443 return (PyObject*) pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011444}
11445
Alexander Belopolsky40018472011-02-26 01:02:56 +000011446PyObject *
11447PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011448{
11449 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000011450
Guido van Rossumd57fd912000-03-10 22:53:23 +000011451 s = PyUnicode_FromObject(s);
11452 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011453 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011454 if (sep != NULL) {
11455 sep = PyUnicode_FromObject(sep);
11456 if (sep == NULL) {
11457 Py_DECREF(s);
11458 return NULL;
11459 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011460 }
11461
11462 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
11463
11464 Py_DECREF(s);
11465 Py_XDECREF(sep);
11466 return result;
11467}
11468
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011469PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011470 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011471\n\
11472Return a list of the words in S, using sep as the\n\
11473delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000011474splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000011475whitespace string is a separator and empty strings are\n\
11476removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011477
11478static PyObject*
11479unicode_split(PyUnicodeObject *self, PyObject *args)
11480{
11481 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011482 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011483
Martin v. Löwis18e16552006-02-15 17:27:45 +000011484 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011485 return NULL;
11486
11487 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011488 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011489 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000011490 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011491 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011492 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011493}
11494
Thomas Wouters477c8d52006-05-27 19:21:47 +000011495PyObject *
11496PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
11497{
11498 PyObject* str_obj;
11499 PyObject* sep_obj;
11500 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011501 int kind1, kind2, kind;
11502 void *buf1 = NULL, *buf2 = NULL;
11503 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011504
11505 str_obj = PyUnicode_FromObject(str_in);
Victor Stinnere9a29352011-10-01 02:14:59 +020011506 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011507 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011508 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011509 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000011510 Py_DECREF(str_obj);
11511 return NULL;
11512 }
11513
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011514 kind1 = PyUnicode_KIND(str_in);
11515 kind2 = PyUnicode_KIND(sep_obj);
11516 kind = kind1 > kind2 ? kind1 : kind2;
11517 buf1 = PyUnicode_DATA(str_in);
11518 if (kind1 != kind)
11519 buf1 = _PyUnicode_AsKind(str_in, kind);
11520 if (!buf1)
11521 goto onError;
11522 buf2 = PyUnicode_DATA(sep_obj);
11523 if (kind2 != kind)
11524 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11525 if (!buf2)
11526 goto onError;
11527 len1 = PyUnicode_GET_LENGTH(str_obj);
11528 len2 = PyUnicode_GET_LENGTH(sep_obj);
11529
11530 switch(PyUnicode_KIND(str_in)) {
11531 case PyUnicode_1BYTE_KIND:
11532 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11533 break;
11534 case PyUnicode_2BYTE_KIND:
11535 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11536 break;
11537 case PyUnicode_4BYTE_KIND:
11538 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11539 break;
11540 default:
11541 assert(0);
11542 out = 0;
11543 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011544
11545 Py_DECREF(sep_obj);
11546 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011547 if (kind1 != kind)
11548 PyMem_Free(buf1);
11549 if (kind2 != kind)
11550 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011551
11552 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011553 onError:
11554 Py_DECREF(sep_obj);
11555 Py_DECREF(str_obj);
11556 if (kind1 != kind && buf1)
11557 PyMem_Free(buf1);
11558 if (kind2 != kind && buf2)
11559 PyMem_Free(buf2);
11560 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011561}
11562
11563
11564PyObject *
11565PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
11566{
11567 PyObject* str_obj;
11568 PyObject* sep_obj;
11569 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011570 int kind1, kind2, kind;
11571 void *buf1 = NULL, *buf2 = NULL;
11572 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011573
11574 str_obj = PyUnicode_FromObject(str_in);
11575 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000011576 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011577 sep_obj = PyUnicode_FromObject(sep_in);
11578 if (!sep_obj) {
11579 Py_DECREF(str_obj);
11580 return NULL;
11581 }
11582
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011583 kind1 = PyUnicode_KIND(str_in);
11584 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020011585 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011586 buf1 = PyUnicode_DATA(str_in);
11587 if (kind1 != kind)
11588 buf1 = _PyUnicode_AsKind(str_in, kind);
11589 if (!buf1)
11590 goto onError;
11591 buf2 = PyUnicode_DATA(sep_obj);
11592 if (kind2 != kind)
11593 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11594 if (!buf2)
11595 goto onError;
11596 len1 = PyUnicode_GET_LENGTH(str_obj);
11597 len2 = PyUnicode_GET_LENGTH(sep_obj);
11598
11599 switch(PyUnicode_KIND(str_in)) {
11600 case PyUnicode_1BYTE_KIND:
11601 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11602 break;
11603 case PyUnicode_2BYTE_KIND:
11604 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11605 break;
11606 case PyUnicode_4BYTE_KIND:
11607 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11608 break;
11609 default:
11610 assert(0);
11611 out = 0;
11612 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011613
11614 Py_DECREF(sep_obj);
11615 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011616 if (kind1 != kind)
11617 PyMem_Free(buf1);
11618 if (kind2 != kind)
11619 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011620
11621 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011622 onError:
11623 Py_DECREF(sep_obj);
11624 Py_DECREF(str_obj);
11625 if (kind1 != kind && buf1)
11626 PyMem_Free(buf1);
11627 if (kind2 != kind && buf2)
11628 PyMem_Free(buf2);
11629 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011630}
11631
11632PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011633 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011634\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011635Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011636the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011637found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011638
11639static PyObject*
11640unicode_partition(PyUnicodeObject *self, PyObject *separator)
11641{
11642 return PyUnicode_Partition((PyObject *)self, separator);
11643}
11644
11645PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000011646 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011647\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011648Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011649the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011650separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011651
11652static PyObject*
11653unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
11654{
11655 return PyUnicode_RPartition((PyObject *)self, separator);
11656}
11657
Alexander Belopolsky40018472011-02-26 01:02:56 +000011658PyObject *
11659PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011660{
11661 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011662
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011663 s = PyUnicode_FromObject(s);
11664 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011665 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011666 if (sep != NULL) {
11667 sep = PyUnicode_FromObject(sep);
11668 if (sep == NULL) {
11669 Py_DECREF(s);
11670 return NULL;
11671 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011672 }
11673
11674 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
11675
11676 Py_DECREF(s);
11677 Py_XDECREF(sep);
11678 return result;
11679}
11680
11681PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011682 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011683\n\
11684Return a list of the words in S, using sep as the\n\
11685delimiter string, starting at the end of the string and\n\
11686working to the front. If maxsplit is given, at most maxsplit\n\
11687splits are done. If sep is not specified, any whitespace string\n\
11688is a separator.");
11689
11690static PyObject*
11691unicode_rsplit(PyUnicodeObject *self, PyObject *args)
11692{
11693 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011694 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011695
Martin v. Löwis18e16552006-02-15 17:27:45 +000011696 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011697 return NULL;
11698
11699 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011700 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011701 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000011702 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011703 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011704 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011705}
11706
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011707PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011708 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011709\n\
11710Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000011711Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011712is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011713
11714static PyObject*
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011715unicode_splitlines(PyUnicodeObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011716{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011717 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000011718 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011719
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011720 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
11721 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011722 return NULL;
11723
Guido van Rossum86662912000-04-11 15:38:46 +000011724 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011725}
11726
11727static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000011728PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011729{
Walter Dörwald346737f2007-05-31 10:44:43 +000011730 if (PyUnicode_CheckExact(self)) {
11731 Py_INCREF(self);
11732 return self;
11733 } else
11734 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinner034f6cf2011-09-30 02:26:44 +020011735 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011736}
11737
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011738PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011739 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011740\n\
11741Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011742and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011743
11744static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011745unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011746{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011747 return fixup(self, fixswapcase);
11748}
11749
Georg Brandlceee0772007-11-27 23:48:05 +000011750PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011751 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011752\n\
11753Return a translation table usable for str.translate().\n\
11754If there is only one argument, it must be a dictionary mapping Unicode\n\
11755ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011756Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011757If there are two arguments, they must be strings of equal length, and\n\
11758in the resulting dictionary, each character in x will be mapped to the\n\
11759character at the same position in y. If there is a third argument, it\n\
11760must be a string, whose characters will be mapped to None in the result.");
11761
11762static PyObject*
11763unicode_maketrans(PyUnicodeObject *null, PyObject *args)
11764{
11765 PyObject *x, *y = NULL, *z = NULL;
11766 PyObject *new = NULL, *key, *value;
11767 Py_ssize_t i = 0;
11768 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011769
Georg Brandlceee0772007-11-27 23:48:05 +000011770 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
11771 return NULL;
11772 new = PyDict_New();
11773 if (!new)
11774 return NULL;
11775 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011776 int x_kind, y_kind, z_kind;
11777 void *x_data, *y_data, *z_data;
11778
Georg Brandlceee0772007-11-27 23:48:05 +000011779 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000011780 if (!PyUnicode_Check(x)) {
11781 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
11782 "be a string if there is a second argument");
11783 goto err;
11784 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011785 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011786 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
11787 "arguments must have equal length");
11788 goto err;
11789 }
11790 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011791 x_kind = PyUnicode_KIND(x);
11792 y_kind = PyUnicode_KIND(y);
11793 x_data = PyUnicode_DATA(x);
11794 y_data = PyUnicode_DATA(y);
11795 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
11796 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
11797 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011798 if (!key || !value)
11799 goto err;
11800 res = PyDict_SetItem(new, key, value);
11801 Py_DECREF(key);
11802 Py_DECREF(value);
11803 if (res < 0)
11804 goto err;
11805 }
11806 /* create entries for deleting chars in z */
11807 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011808 z_kind = PyUnicode_KIND(z);
11809 z_data = PyUnicode_DATA(z);
Georg Brandlceee0772007-11-27 23:48:05 +000011810 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011811 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011812 if (!key)
11813 goto err;
11814 res = PyDict_SetItem(new, key, Py_None);
11815 Py_DECREF(key);
11816 if (res < 0)
11817 goto err;
11818 }
11819 }
11820 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011821 int kind;
11822 void *data;
11823
Georg Brandlceee0772007-11-27 23:48:05 +000011824 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000011825 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011826 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
11827 "to maketrans it must be a dict");
11828 goto err;
11829 }
11830 /* copy entries into the new dict, converting string keys to int keys */
11831 while (PyDict_Next(x, &i, &key, &value)) {
11832 if (PyUnicode_Check(key)) {
11833 /* convert string keys to integer keys */
11834 PyObject *newkey;
11835 if (PyUnicode_GET_SIZE(key) != 1) {
11836 PyErr_SetString(PyExc_ValueError, "string keys in translate "
11837 "table must be of length 1");
11838 goto err;
11839 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011840 kind = PyUnicode_KIND(key);
11841 data = PyUnicode_DATA(key);
11842 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000011843 if (!newkey)
11844 goto err;
11845 res = PyDict_SetItem(new, newkey, value);
11846 Py_DECREF(newkey);
11847 if (res < 0)
11848 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000011849 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011850 /* just keep integer keys */
11851 if (PyDict_SetItem(new, key, value) < 0)
11852 goto err;
11853 } else {
11854 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
11855 "be strings or integers");
11856 goto err;
11857 }
11858 }
11859 }
11860 return new;
11861 err:
11862 Py_DECREF(new);
11863 return NULL;
11864}
11865
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011866PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011867 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011868\n\
11869Return a copy of the string S, where all characters have been mapped\n\
11870through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011871Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000011872Unmapped characters are left untouched. Characters mapped to None\n\
11873are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011874
11875static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011876unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011877{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011878 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011879}
11880
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011881PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011882 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011883\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011884Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011885
11886static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011887unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011888{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011889 return fixup(self, fixupper);
11890}
11891
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011892PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011893 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011894\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000011895Pad a numeric string S with zeros on the left, to fill a field\n\
11896of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011897
11898static PyObject *
11899unicode_zfill(PyUnicodeObject *self, PyObject *args)
11900{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011901 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011902 PyUnicodeObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011903 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011904 int kind;
11905 void *data;
11906 Py_UCS4 chr;
11907
11908 if (PyUnicode_READY(self) == -1)
11909 return NULL;
11910
Martin v. Löwis18e16552006-02-15 17:27:45 +000011911 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011912 return NULL;
11913
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011914 if (PyUnicode_GET_LENGTH(self) >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +000011915 if (PyUnicode_CheckExact(self)) {
11916 Py_INCREF(self);
11917 return (PyObject*) self;
11918 }
11919 else
Victor Stinner2219e0a2011-10-01 01:16:59 +020011920 return PyUnicode_Copy((PyObject*)self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011921 }
11922
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011923 fill = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011924
11925 u = pad(self, fill, 0, '0');
11926
Walter Dörwald068325e2002-04-15 13:36:47 +000011927 if (u == NULL)
11928 return NULL;
11929
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011930 kind = PyUnicode_KIND(u);
11931 data = PyUnicode_DATA(u);
11932 chr = PyUnicode_READ(kind, data, fill);
11933
11934 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011935 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011936 PyUnicode_WRITE(kind, data, 0, chr);
11937 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000011938 }
11939
11940 return (PyObject*) u;
11941}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011942
#if 0
/* Compiled out: debugging helper that forwards to
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *ascii = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return ascii;
}
#endif
11950
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011951PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011952 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011953\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000011954Return True if S starts with the specified prefix, False otherwise.\n\
11955With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011956With optional end, stop comparing S at that position.\n\
11957prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011958
11959static PyObject *
11960unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000011961 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011962{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011963 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011964 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011965 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011966 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011967 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011968
Jesus Ceaac451502011-04-20 17:09:23 +020011969 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011970 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011971 if (PyTuple_Check(subobj)) {
11972 Py_ssize_t i;
11973 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
11974 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000011975 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011976 if (substring == NULL)
11977 return NULL;
11978 result = tailmatch(self, substring, start, end, -1);
11979 Py_DECREF(substring);
11980 if (result) {
11981 Py_RETURN_TRUE;
11982 }
11983 }
11984 /* nothing matched */
11985 Py_RETURN_FALSE;
11986 }
11987 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030011988 if (substring == NULL) {
11989 if (PyErr_ExceptionMatches(PyExc_TypeError))
11990 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
11991 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000011992 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030011993 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011994 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011995 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011996 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011997}
11998
11999
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012000PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012001 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012002\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012003Return True if S ends with the specified suffix, False otherwise.\n\
12004With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012005With optional end, stop comparing S at that position.\n\
12006suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012007
12008static PyObject *
12009unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012010 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012011{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012012 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012013 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012014 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012015 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012016 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012017
Jesus Ceaac451502011-04-20 17:09:23 +020012018 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012019 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012020 if (PyTuple_Check(subobj)) {
12021 Py_ssize_t i;
12022 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12023 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012024 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012025 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012026 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012027 result = tailmatch(self, substring, start, end, +1);
12028 Py_DECREF(substring);
12029 if (result) {
12030 Py_RETURN_TRUE;
12031 }
12032 }
12033 Py_RETURN_FALSE;
12034 }
12035 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012036 if (substring == NULL) {
12037 if (PyErr_ExceptionMatches(PyExc_TypeError))
12038 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12039 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012040 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012041 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012042 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012043 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012044 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012045}
12046
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012047#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000012048
12049PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012050 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012051\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012052Return a formatted version of S, using substitutions from args and kwargs.\n\
12053The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000012054
Eric Smith27bbca62010-11-04 17:06:58 +000012055PyDoc_STRVAR(format_map__doc__,
12056 "S.format_map(mapping) -> str\n\
12057\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012058Return a formatted version of S, using substitutions from mapping.\n\
12059The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000012060
Eric Smith4a7d76d2008-05-30 18:10:19 +000012061static PyObject *
12062unicode__format__(PyObject* self, PyObject* args)
12063{
12064 PyObject *format_spec;
12065
12066 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12067 return NULL;
12068
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012069 return _PyUnicode_FormatAdvanced(self, format_spec, 0,
12070 PyUnicode_GET_LENGTH(format_spec));
Eric Smith4a7d76d2008-05-30 18:10:19 +000012071}
12072
Eric Smith8c663262007-08-25 02:26:07 +000012073PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012074 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012075\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012076Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000012077
12078static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012079unicode__sizeof__(PyUnicodeObject *v)
12080{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012081 Py_ssize_t size;
12082
12083 /* If it's a compact object, account for base structure +
12084 character data. */
12085 if (PyUnicode_IS_COMPACT_ASCII(v))
12086 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12087 else if (PyUnicode_IS_COMPACT(v))
12088 size = sizeof(PyCompactUnicodeObject) +
12089 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_CHARACTER_SIZE(v);
12090 else {
12091 /* If it is a two-block object, account for base object, and
12092 for character block if present. */
12093 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020012094 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012095 size += (PyUnicode_GET_LENGTH(v) + 1) *
12096 PyUnicode_CHARACTER_SIZE(v);
12097 }
12098 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020012099 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020012100 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012101 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020012102 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020012103 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012104
12105 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012106}
12107
12108PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012109 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012110
12111static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020012112unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012113{
Victor Stinner034f6cf2011-09-30 02:26:44 +020012114 PyObject *copy = PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012115 if (!copy)
12116 return NULL;
12117 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012118}
12119
/* Method table for the unicode type.  Each entry gives the Python-visible
   method name, the C function implementing it (cast to PyCFunction), the
   calling-convention flags, and, for most entries, the docstring variable
   declared with PyDoc_STRVAR. */
static PyMethodDef unicode_methods[] = {

    /* Order is according to common usage: often used methods should
       appear first, since lookup is done sequentially. */

    {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
    {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
    {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
    {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
    {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
    /* format and format_map use do_string_format*, which are not defined
       in the lines around this table. */
    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
    {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
    {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
    /* maketrans is the only entry flagged METH_STATIC. */
    {"maketrans", (PyCFunction) unicode_maketrans,
     METH_VARARGS | METH_STATIC, maketrans__doc__},
    {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
    /* capwords is compiled out. */
#if 0
    {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
#endif

#if 0
    /* These methods are just used for debugging the implementation. */
    {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
#endif

    /* No docstring is supplied for __getnewargs__. */
    {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
    /* Sentinel entry marking the end of the table. */
    {NULL, NULL}
};
12183
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012184static PyObject *
12185unicode_mod(PyObject *v, PyObject *w)
12186{
Brian Curtindfc80e32011-08-10 20:28:54 -050012187 if (!PyUnicode_Check(v))
12188 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000012189 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012190}
12191
12192static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012193 0, /*nb_add*/
12194 0, /*nb_subtract*/
12195 0, /*nb_multiply*/
12196 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012197};
12198
Guido van Rossumd57fd912000-03-10 22:53:23 +000012199static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012200 (lenfunc) unicode_length, /* sq_length */
12201 PyUnicode_Concat, /* sq_concat */
12202 (ssizeargfunc) unicode_repeat, /* sq_repeat */
12203 (ssizeargfunc) unicode_getitem, /* sq_item */
12204 0, /* sq_slice */
12205 0, /* sq_ass_item */
12206 0, /* sq_ass_slice */
12207 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012208};
12209
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012210static PyObject*
12211unicode_subscript(PyUnicodeObject* self, PyObject* item)
12212{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012213 if (PyUnicode_READY(self) == -1)
12214 return NULL;
12215
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000012216 if (PyIndex_Check(item)) {
12217 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012218 if (i == -1 && PyErr_Occurred())
12219 return NULL;
12220 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012221 i += PyUnicode_GET_LENGTH(self);
Victor Stinner2fe5ced2011-10-02 00:25:40 +020012222 return unicode_getitem((PyObject*)self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012223 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000012224 Py_ssize_t start, stop, step, slicelength, cur, i;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012225 const Py_UNICODE* source_buf;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012226 Py_UNICODE* result_buf;
12227 PyObject* result;
12228
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012229 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000012230 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012231 return NULL;
12232 }
12233
12234 if (slicelength <= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012235 return PyUnicode_New(0, 0);
12236 } else if (start == 0 && step == 1 &&
12237 slicelength == PyUnicode_GET_LENGTH(self) &&
Thomas Woutersed03b412007-08-28 21:37:11 +000012238 PyUnicode_CheckExact(self)) {
12239 Py_INCREF(self);
12240 return (PyObject *)self;
12241 } else if (step == 1) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012242 return PyUnicode_Substring((PyObject*)self,
12243 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012244 } else {
12245 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +000012246 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
12247 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012248
Benjamin Peterson29060642009-01-31 22:14:21 +000012249 if (result_buf == NULL)
12250 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012251
12252 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
12253 result_buf[i] = source_buf[cur];
12254 }
Tim Petersced69f82003-09-16 20:30:58 +000012255
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012256 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +000012257 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012258 return result;
12259 }
12260 } else {
12261 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
12262 return NULL;
12263 }
12264}
12265
12266static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012267 (lenfunc)unicode_length, /* mp_length */
12268 (binaryfunc)unicode_subscript, /* mp_subscript */
12269 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012270};
12271
Guido van Rossumd57fd912000-03-10 22:53:23 +000012272
Guido van Rossumd57fd912000-03-10 22:53:23 +000012273/* Helpers for PyUnicode_Format() */
12274
12275static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000012276getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012277{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012278 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012279 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012280 (*p_argidx)++;
12281 if (arglen < 0)
12282 return args;
12283 else
12284 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012285 }
12286 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012287 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012288 return NULL;
12289}
12290
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012291/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012292
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012293static PyObject *
12294formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012295{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012296 char *p;
12297 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012298 double x;
Tim Petersced69f82003-09-16 20:30:58 +000012299
Guido van Rossumd57fd912000-03-10 22:53:23 +000012300 x = PyFloat_AsDouble(v);
12301 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012302 return NULL;
12303
Guido van Rossumd57fd912000-03-10 22:53:23 +000012304 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012305 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000012306
Eric Smith0923d1d2009-04-16 20:16:10 +000012307 p = PyOS_double_to_string(x, type, prec,
12308 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012309 if (p == NULL)
12310 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012311 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000012312 PyMem_Free(p);
12313 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012314}
12315
Tim Peters38fd5b62000-09-21 05:43:11 +000012316static PyObject*
12317formatlong(PyObject *val, int flags, int prec, int type)
12318{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012319 char *buf;
12320 int len;
12321 PyObject *str; /* temporary string object. */
12322 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000012323
Benjamin Peterson14339b62009-01-31 16:36:08 +000012324 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
12325 if (!str)
12326 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012327 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012328 Py_DECREF(str);
12329 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000012330}
12331
Guido van Rossumd57fd912000-03-10 22:53:23 +000012332static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012333formatchar(Py_UCS4 *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012334 size_t buflen,
12335 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012336{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000012337 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012338 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012339 if (PyUnicode_GET_LENGTH(v) == 1) {
12340 buf[0] = PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000012341 buf[1] = '\0';
12342 return 1;
12343 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012344 goto onError;
12345 }
12346 else {
12347 /* Integer input truncated to a character */
12348 long x;
12349 x = PyLong_AsLong(v);
12350 if (x == -1 && PyErr_Occurred())
12351 goto onError;
12352
12353 if (x < 0 || x > 0x10ffff) {
12354 PyErr_SetString(PyExc_OverflowError,
12355 "%c arg not in range(0x110000)");
12356 return -1;
12357 }
12358
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012359 buf[0] = (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012360 buf[1] = '\0';
12361 return 1;
12362 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000012363
Benjamin Peterson29060642009-01-31 22:14:21 +000012364 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012365 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012366 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012367 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012368}
12369
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012370/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012371 FORMATBUFLEN is the length of the buffer in which chars are formatted.
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012372*/
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012373#define FORMATBUFLEN (size_t)10
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012374
Alexander Belopolsky40018472011-02-26 01:02:56 +000012375PyObject *
12376PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012377{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012378 void *fmt;
12379 int fmtkind;
12380 PyObject *result;
12381 Py_UCS4 *res, *res0;
12382 Py_UCS4 max;
12383 int kind;
12384 Py_ssize_t fmtcnt, fmtpos, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012385 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012386 PyObject *dict = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012387 PyUnicodeObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +000012388
Guido van Rossumd57fd912000-03-10 22:53:23 +000012389 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012390 PyErr_BadInternalCall();
12391 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012392 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012393 uformat = (PyUnicodeObject*)PyUnicode_FromObject(format);
12394 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012395 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012396 fmt = PyUnicode_DATA(uformat);
12397 fmtkind = PyUnicode_KIND(uformat);
12398 fmtcnt = PyUnicode_GET_LENGTH(uformat);
12399 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012400
12401 reslen = rescnt = fmtcnt + 100;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012402 res = res0 = PyMem_Malloc(reslen * sizeof(Py_UCS4));
12403 if (res0 == NULL) {
12404 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012405 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012406 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012407
12408 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012409 arglen = PyTuple_Size(args);
12410 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012411 }
12412 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012413 arglen = -1;
12414 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012415 }
Christian Heimes90aa7642007-12-19 02:45:37 +000012416 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000012417 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000012418 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012419
12420 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012421 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012422 if (--rescnt < 0) {
12423 rescnt = fmtcnt + 100;
12424 reslen += rescnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012425 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12426 if (res0 == NULL){
12427 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012428 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012429 }
12430 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000012431 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012432 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012433 *res++ = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012434 }
12435 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012436 /* Got a format specifier */
12437 int flags = 0;
12438 Py_ssize_t width = -1;
12439 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012440 Py_UCS4 c = '\0';
12441 Py_UCS4 fill;
Benjamin Peterson29060642009-01-31 22:14:21 +000012442 int isnumok;
12443 PyObject *v = NULL;
12444 PyObject *temp = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012445 void *pbuf;
12446 Py_ssize_t pindex;
Benjamin Peterson29060642009-01-31 22:14:21 +000012447 Py_UNICODE sign;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012448 Py_ssize_t len, len1;
12449 Py_UCS4 formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012450
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012451 fmtpos++;
12452 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
12453 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000012454 Py_ssize_t keylen;
12455 PyObject *key;
12456 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000012457
Benjamin Peterson29060642009-01-31 22:14:21 +000012458 if (dict == NULL) {
12459 PyErr_SetString(PyExc_TypeError,
12460 "format requires a mapping");
12461 goto onError;
12462 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012463 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012464 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012465 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012466 /* Skip over balanced parentheses */
12467 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012468 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000012469 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012470 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000012471 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012472 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000012473 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012474 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000012475 if (fmtcnt < 0 || pcount > 0) {
12476 PyErr_SetString(PyExc_ValueError,
12477 "incomplete format key");
12478 goto onError;
12479 }
Victor Stinner12bab6d2011-10-01 01:53:49 +020012480 key = PyUnicode_Substring((PyObject*)uformat,
12481 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000012482 if (key == NULL)
12483 goto onError;
12484 if (args_owned) {
12485 Py_DECREF(args);
12486 args_owned = 0;
12487 }
12488 args = PyObject_GetItem(dict, key);
12489 Py_DECREF(key);
12490 if (args == NULL) {
12491 goto onError;
12492 }
12493 args_owned = 1;
12494 arglen = -1;
12495 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012496 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012497 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012498 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012499 case '-': flags |= F_LJUST; continue;
12500 case '+': flags |= F_SIGN; continue;
12501 case ' ': flags |= F_BLANK; continue;
12502 case '#': flags |= F_ALT; continue;
12503 case '0': flags |= F_ZERO; continue;
12504 }
12505 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012506 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012507 if (c == '*') {
12508 v = getnextarg(args, arglen, &argidx);
12509 if (v == NULL)
12510 goto onError;
12511 if (!PyLong_Check(v)) {
12512 PyErr_SetString(PyExc_TypeError,
12513 "* wants int");
12514 goto onError;
12515 }
12516 width = PyLong_AsLong(v);
12517 if (width == -1 && PyErr_Occurred())
12518 goto onError;
12519 if (width < 0) {
12520 flags |= F_LJUST;
12521 width = -width;
12522 }
12523 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012524 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012525 }
12526 else if (c >= '0' && c <= '9') {
12527 width = c - '0';
12528 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012529 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012530 if (c < '0' || c > '9')
12531 break;
12532 if ((width*10) / 10 != width) {
12533 PyErr_SetString(PyExc_ValueError,
12534 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000012535 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000012536 }
12537 width = width*10 + (c - '0');
12538 }
12539 }
12540 if (c == '.') {
12541 prec = 0;
12542 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012543 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012544 if (c == '*') {
12545 v = getnextarg(args, arglen, &argidx);
12546 if (v == NULL)
12547 goto onError;
12548 if (!PyLong_Check(v)) {
12549 PyErr_SetString(PyExc_TypeError,
12550 "* wants int");
12551 goto onError;
12552 }
12553 prec = PyLong_AsLong(v);
12554 if (prec == -1 && PyErr_Occurred())
12555 goto onError;
12556 if (prec < 0)
12557 prec = 0;
12558 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012559 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012560 }
12561 else if (c >= '0' && c <= '9') {
12562 prec = c - '0';
12563 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012564 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012565 if (c < '0' || c > '9')
12566 break;
12567 if ((prec*10) / 10 != prec) {
12568 PyErr_SetString(PyExc_ValueError,
12569 "prec too big");
12570 goto onError;
12571 }
12572 prec = prec*10 + (c - '0');
12573 }
12574 }
12575 } /* prec */
12576 if (fmtcnt >= 0) {
12577 if (c == 'h' || c == 'l' || c == 'L') {
12578 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012579 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012580 }
12581 }
12582 if (fmtcnt < 0) {
12583 PyErr_SetString(PyExc_ValueError,
12584 "incomplete format");
12585 goto onError;
12586 }
12587 if (c != '%') {
12588 v = getnextarg(args, arglen, &argidx);
12589 if (v == NULL)
12590 goto onError;
12591 }
12592 sign = 0;
12593 fill = ' ';
12594 switch (c) {
12595
12596 case '%':
12597 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012598 kind = PyUnicode_4BYTE_KIND;
Benjamin Peterson29060642009-01-31 22:14:21 +000012599 /* presume that buffer length is at least 1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012600 PyUnicode_WRITE(kind, pbuf, 0, '%');
Benjamin Peterson29060642009-01-31 22:14:21 +000012601 len = 1;
12602 break;
12603
12604 case 's':
12605 case 'r':
12606 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000012607 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012608 temp = v;
12609 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012610 }
12611 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012612 if (c == 's')
12613 temp = PyObject_Str(v);
12614 else if (c == 'r')
12615 temp = PyObject_Repr(v);
12616 else
12617 temp = PyObject_ASCII(v);
12618 if (temp == NULL)
12619 goto onError;
12620 if (PyUnicode_Check(temp))
12621 /* nothing to do */;
12622 else {
12623 Py_DECREF(temp);
12624 PyErr_SetString(PyExc_TypeError,
12625 "%s argument has non-string str()");
12626 goto onError;
12627 }
12628 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012629 if (PyUnicode_READY(temp) == -1) {
12630 Py_CLEAR(temp);
12631 goto onError;
12632 }
12633 pbuf = PyUnicode_DATA(temp);
12634 kind = PyUnicode_KIND(temp);
12635 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012636 if (prec >= 0 && len > prec)
12637 len = prec;
12638 break;
12639
12640 case 'i':
12641 case 'd':
12642 case 'u':
12643 case 'o':
12644 case 'x':
12645 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000012646 isnumok = 0;
12647 if (PyNumber_Check(v)) {
12648 PyObject *iobj=NULL;
12649
12650 if (PyLong_Check(v)) {
12651 iobj = v;
12652 Py_INCREF(iobj);
12653 }
12654 else {
12655 iobj = PyNumber_Long(v);
12656 }
12657 if (iobj!=NULL) {
12658 if (PyLong_Check(iobj)) {
12659 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070012660 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000012661 Py_DECREF(iobj);
12662 if (!temp)
12663 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012664 if (PyUnicode_READY(temp) == -1) {
12665 Py_CLEAR(temp);
12666 goto onError;
12667 }
12668 pbuf = PyUnicode_DATA(temp);
12669 kind = PyUnicode_KIND(temp);
12670 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012671 sign = 1;
12672 }
12673 else {
12674 Py_DECREF(iobj);
12675 }
12676 }
12677 }
12678 if (!isnumok) {
12679 PyErr_Format(PyExc_TypeError,
12680 "%%%c format: a number is required, "
12681 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
12682 goto onError;
12683 }
12684 if (flags & F_ZERO)
12685 fill = '0';
12686 break;
12687
12688 case 'e':
12689 case 'E':
12690 case 'f':
12691 case 'F':
12692 case 'g':
12693 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012694 temp = formatfloat(v, flags, prec, c);
12695 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000012696 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012697 if (PyUnicode_READY(temp) == -1) {
12698 Py_CLEAR(temp);
12699 goto onError;
12700 }
12701 pbuf = PyUnicode_DATA(temp);
12702 kind = PyUnicode_KIND(temp);
12703 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012704 sign = 1;
12705 if (flags & F_ZERO)
12706 fill = '0';
12707 break;
12708
12709 case 'c':
12710 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012711 kind = PyUnicode_4BYTE_KIND;
Victor Stinnerb9dcffb2011-09-29 00:39:24 +020012712 len = formatchar(pbuf, Py_ARRAY_LENGTH(formatbuf), v);
Benjamin Peterson29060642009-01-31 22:14:21 +000012713 if (len < 0)
12714 goto onError;
12715 break;
12716
12717 default:
12718 PyErr_Format(PyExc_ValueError,
12719 "unsupported format character '%c' (0x%x) "
12720 "at index %zd",
12721 (31<=c && c<=126) ? (char)c : '?',
12722 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012723 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000012724 goto onError;
12725 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012726 /* pbuf is initialized here. */
12727 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000012728 if (sign) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012729 if (PyUnicode_READ(kind, pbuf, pindex) == '-' ||
12730 PyUnicode_READ(kind, pbuf, pindex) == '+') {
12731 sign = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012732 len--;
12733 }
12734 else if (flags & F_SIGN)
12735 sign = '+';
12736 else if (flags & F_BLANK)
12737 sign = ' ';
12738 else
12739 sign = 0;
12740 }
12741 if (width < len)
12742 width = len;
12743 if (rescnt - (sign != 0) < width) {
12744 reslen -= rescnt;
12745 rescnt = width + fmtcnt + 100;
12746 reslen += rescnt;
12747 if (reslen < 0) {
12748 Py_XDECREF(temp);
12749 PyErr_NoMemory();
12750 goto onError;
12751 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012752 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12753 if (res0 == 0) {
12754 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012755 Py_XDECREF(temp);
12756 goto onError;
12757 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012758 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000012759 }
12760 if (sign) {
12761 if (fill != ' ')
12762 *res++ = sign;
12763 rescnt--;
12764 if (width > len)
12765 width--;
12766 }
12767 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012768 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12769 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000012770 if (fill != ' ') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012771 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12772 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012773 }
12774 rescnt -= 2;
12775 width -= 2;
12776 if (width < 0)
12777 width = 0;
12778 len -= 2;
12779 }
12780 if (width > len && !(flags & F_LJUST)) {
12781 do {
12782 --rescnt;
12783 *res++ = fill;
12784 } while (--width > len);
12785 }
12786 if (fill == ' ') {
12787 if (sign)
12788 *res++ = sign;
12789 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012790 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12791 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
12792 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12793 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012794 }
12795 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012796 /* Copy all characters, preserving len */
12797 len1 = len;
12798 while (len1--) {
12799 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12800 rescnt--;
12801 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012802 while (--width >= len) {
12803 --rescnt;
12804 *res++ = ' ';
12805 }
12806 if (dict && (argidx < arglen) && c != '%') {
12807 PyErr_SetString(PyExc_TypeError,
12808 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +000012809 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012810 goto onError;
12811 }
12812 Py_XDECREF(temp);
12813 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012814 } /* until end */
12815 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012816 PyErr_SetString(PyExc_TypeError,
12817 "not all arguments converted during string formatting");
12818 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012819 }
12820
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012821
12822 for (max=0, res = res0; res < res0+reslen-rescnt; res++)
12823 if (*res > max)
12824 max = *res;
12825 result = PyUnicode_New(reslen - rescnt, max);
12826 if (!result)
Benjamin Peterson29060642009-01-31 22:14:21 +000012827 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012828 kind = PyUnicode_KIND(result);
12829 for (res = res0; res < res0+reslen-rescnt; res++)
12830 PyUnicode_WRITE(kind, PyUnicode_DATA(result), res-res0, *res);
12831 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012832 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012833 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012834 }
12835 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012836 return (PyObject *)result;
12837
Benjamin Peterson29060642009-01-31 22:14:21 +000012838 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012839 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012840 Py_DECREF(uformat);
12841 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012842 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012843 }
12844 return NULL;
12845}
12846
Jeremy Hylton938ace62002-07-17 16:30:39 +000012847static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000012848unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
12849
Tim Peters6d6c1a32001-08-02 04:15:00 +000012850static PyObject *
12851unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12852{
Benjamin Peterson29060642009-01-31 22:14:21 +000012853 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012854 static char *kwlist[] = {"object", "encoding", "errors", 0};
12855 char *encoding = NULL;
12856 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000012857
Benjamin Peterson14339b62009-01-31 16:36:08 +000012858 if (type != &PyUnicode_Type)
12859 return unicode_subtype_new(type, args, kwds);
12860 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000012861 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012862 return NULL;
12863 if (x == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012864 return (PyObject *)PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012865 if (encoding == NULL && errors == NULL)
12866 return PyObject_Str(x);
12867 else
Benjamin Peterson29060642009-01-31 22:14:21 +000012868 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000012869}
12870
Guido van Rossume023fe02001-08-30 03:12:59 +000012871static PyObject *
12872unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12873{
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012874 PyUnicodeObject *unicode, *self;
12875 Py_ssize_t length, char_size;
12876 int share_wstr, share_utf8;
12877 unsigned int kind;
12878 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000012879
Benjamin Peterson14339b62009-01-31 16:36:08 +000012880 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012881
12882 unicode = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
12883 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012884 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020012885 assert(_PyUnicode_CHECK(unicode));
Victor Stinner1b4f9ce2011-10-03 13:28:14 +020012886 if (_PyUnicode_READY_REPLACE(&unicode))
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012887 return NULL;
12888
12889 self = (PyUnicodeObject *) type->tp_alloc(type, 0);
12890 if (self == NULL) {
12891 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012892 return NULL;
12893 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012894 kind = PyUnicode_KIND(unicode);
12895 length = PyUnicode_GET_LENGTH(unicode);
12896
12897 _PyUnicode_LENGTH(self) = length;
12898 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
12899 _PyUnicode_STATE(self).interned = 0;
12900 _PyUnicode_STATE(self).kind = kind;
12901 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020012902 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012903 _PyUnicode_STATE(self).ready = 1;
12904 _PyUnicode_WSTR(self) = NULL;
12905 _PyUnicode_UTF8_LENGTH(self) = 0;
12906 _PyUnicode_UTF8(self) = NULL;
12907 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020012908 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012909
12910 share_utf8 = 0;
12911 share_wstr = 0;
12912 if (kind == PyUnicode_1BYTE_KIND) {
12913 char_size = 1;
12914 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
12915 share_utf8 = 1;
12916 }
12917 else if (kind == PyUnicode_2BYTE_KIND) {
12918 char_size = 2;
12919 if (sizeof(wchar_t) == 2)
12920 share_wstr = 1;
12921 }
12922 else {
12923 assert(kind == PyUnicode_4BYTE_KIND);
12924 char_size = 4;
12925 if (sizeof(wchar_t) == 4)
12926 share_wstr = 1;
12927 }
12928
12929 /* Ensure we won't overflow the length. */
12930 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
12931 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012932 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012933 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012934 data = PyObject_MALLOC((length + 1) * char_size);
12935 if (data == NULL) {
12936 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012937 goto onError;
12938 }
12939
Victor Stinnerc3c74152011-10-02 20:39:55 +020012940 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012941 if (share_utf8) {
12942 _PyUnicode_UTF8_LENGTH(self) = length;
12943 _PyUnicode_UTF8(self) = data;
12944 }
12945 if (share_wstr) {
12946 _PyUnicode_WSTR_LENGTH(self) = length;
12947 _PyUnicode_WSTR(self) = (wchar_t *)data;
12948 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012949
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012950 Py_MEMCPY(data, PyUnicode_DATA(unicode),
12951 PyUnicode_KIND_SIZE(kind, length + 1));
12952 Py_DECREF(unicode);
12953 return (PyObject *)self;
12954
12955onError:
12956 Py_DECREF(unicode);
12957 Py_DECREF(self);
12958 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000012959}
12960
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012961PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000012962 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000012963\n\
Collin Winterd474ce82007-08-07 19:42:11 +000012964Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000012965encoding defaults to the current default string encoding.\n\
12966errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000012967
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012968static PyObject *unicode_iter(PyObject *seq);
12969
/* Type object for "str".  Instances are created through unicode_new and
   iterated through unicode_iter; the type allows subclassing
   (Py_TPFLAGS_BASETYPE). */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012970PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000012971 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012972 "str", /* tp_name */
12973 sizeof(PyUnicodeObject), /* tp_basicsize */
12974 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012975 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012976 (destructor)unicode_dealloc, /* tp_dealloc */
12977 0, /* tp_print */
12978 0, /* tp_getattr */
12979 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000012980 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012981 unicode_repr, /* tp_repr */
12982 &unicode_as_number, /* tp_as_number */
12983 &unicode_as_sequence, /* tp_as_sequence */
12984 &unicode_as_mapping, /* tp_as_mapping */
12985 (hashfunc) unicode_hash, /* tp_hash*/
12986 0, /* tp_call*/
12987 (reprfunc) unicode_str, /* tp_str */
12988 PyObject_GenericGetAttr, /* tp_getattro */
12989 0, /* tp_setattro */
12990 0, /* tp_as_buffer */
12991 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000012992 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012993 unicode_doc, /* tp_doc */
12994 0, /* tp_traverse */
12995 0, /* tp_clear */
12996 PyUnicode_RichCompare, /* tp_richcompare */
12997 0, /* tp_weaklistoffset */
12998 unicode_iter, /* tp_iter */
12999 0, /* tp_iternext */
13000 unicode_methods, /* tp_methods */
13001 0, /* tp_members */
13002 0, /* tp_getset */
13003 &PyBaseObject_Type, /* tp_base */
13004 0, /* tp_dict */
13005 0, /* tp_descr_get */
13006 0, /* tp_descr_set */
13007 0, /* tp_dictoffset */
13008 0, /* tp_init */
13009 0, /* tp_alloc */
13010 unicode_new, /* tp_new */
13011 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013012};
13013
13014/* Initialize the Unicode implementation */
13015
Thomas Wouters78890102000-07-22 19:25:51 +000013016void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013017{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013018 int i;
13019
Thomas Wouters477c8d52006-05-27 19:21:47 +000013020 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013021 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000013022 0x000A, /* LINE FEED */
13023 0x000D, /* CARRIAGE RETURN */
13024 0x001C, /* FILE SEPARATOR */
13025 0x001D, /* GROUP SEPARATOR */
13026 0x001E, /* RECORD SEPARATOR */
13027 0x0085, /* NEXT LINE */
13028 0x2028, /* LINE SEPARATOR */
13029 0x2029, /* PARAGRAPH SEPARATOR */
13030 };
13031
Fred Drakee4315f52000-05-09 19:53:39 +000013032 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020013033 unicode_empty = PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013034 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013035 Py_FatalError("Can't create empty string");
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013036
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013037 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000013038 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000013039 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013040 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000013041
13042 /* initialize the linebreak bloom filter */
13043 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013044 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020013045 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013046
13047 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013048}
13049
13050/* Finalize the Unicode implementation */
13051
/* Releases nothing and always reports 0. */
int
PyUnicode_ClearFreeList(void)
{
    return 0;
}
13057
Guido van Rossumd57fd912000-03-10 22:53:23 +000013058void
Thomas Wouters78890102000-07-22 19:25:51 +000013059_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013060{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013061 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013062
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000013063 Py_XDECREF(unicode_empty);
13064 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000013065
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013066 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013067 if (unicode_latin1[i]) {
13068 Py_DECREF(unicode_latin1[i]);
13069 unicode_latin1[i] = NULL;
13070 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013071 }
Christian Heimesa156e092008-02-16 07:38:31 +000013072 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000013073}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000013074
Walter Dörwald16807132007-05-25 13:52:07 +000013075void
13076PyUnicode_InternInPlace(PyObject **p)
13077{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013078 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
13079 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020013080#ifdef Py_DEBUG
13081 assert(s != NULL);
13082 assert(_PyUnicode_CHECK(s));
13083#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000013084 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020013085 return;
13086#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000013087 /* If it's a subclass, we don't really know what putting
13088 it in the interned dict might do. */
13089 if (!PyUnicode_CheckExact(s))
13090 return;
13091 if (PyUnicode_CHECK_INTERNED(s))
13092 return;
Victor Stinner1b4f9ce2011-10-03 13:28:14 +020013093 if (_PyUnicode_READY_REPLACE(p)) {
Victor Stinner4fae54c2011-10-03 02:01:52 +020013094 assert(0 && "PyUnicode_READY fail in PyUnicode_InternInPlace");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013095 return;
13096 }
Victor Stinner1b4f9ce2011-10-03 13:28:14 +020013097 s = (PyUnicodeObject *)(*p);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013098 if (interned == NULL) {
13099 interned = PyDict_New();
13100 if (interned == NULL) {
13101 PyErr_Clear(); /* Don't leave an exception */
13102 return;
13103 }
13104 }
13105 /* It might be that the GetItem call fails even
13106 though the key is present in the dictionary,
13107 namely when this happens during a stack overflow. */
13108 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +000013109 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013110 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000013111
Benjamin Peterson29060642009-01-31 22:14:21 +000013112 if (t) {
13113 Py_INCREF(t);
13114 Py_DECREF(*p);
13115 *p = t;
13116 return;
13117 }
Walter Dörwald16807132007-05-25 13:52:07 +000013118
Benjamin Peterson14339b62009-01-31 16:36:08 +000013119 PyThreadState_GET()->recursion_critical = 1;
13120 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
13121 PyErr_Clear();
13122 PyThreadState_GET()->recursion_critical = 0;
13123 return;
13124 }
13125 PyThreadState_GET()->recursion_critical = 0;
13126 /* The two references in interned are not counted by refcnt.
13127 The deallocator will take care of this */
13128 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013129 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000013130}
13131
13132void
13133PyUnicode_InternImmortal(PyObject **p)
13134{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013135 PyUnicodeObject *u = (PyUnicodeObject *)*p;
13136
Benjamin Peterson14339b62009-01-31 16:36:08 +000013137 PyUnicode_InternInPlace(p);
13138 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013139 _PyUnicode_STATE(u).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013140 Py_INCREF(*p);
13141 }
Walter Dörwald16807132007-05-25 13:52:07 +000013142}
13143
13144PyObject *
13145PyUnicode_InternFromString(const char *cp)
13146{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013147 PyObject *s = PyUnicode_FromString(cp);
13148 if (s == NULL)
13149 return NULL;
13150 PyUnicode_InternInPlace(&s);
13151 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000013152}
13153
Alexander Belopolsky40018472011-02-26 01:02:56 +000013154void
13155_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000013156{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013157 PyObject *keys;
13158 PyUnicodeObject *s;
13159 Py_ssize_t i, n;
13160 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000013161
Benjamin Peterson14339b62009-01-31 16:36:08 +000013162 if (interned == NULL || !PyDict_Check(interned))
13163 return;
13164 keys = PyDict_Keys(interned);
13165 if (keys == NULL || !PyList_Check(keys)) {
13166 PyErr_Clear();
13167 return;
13168 }
Walter Dörwald16807132007-05-25 13:52:07 +000013169
Benjamin Peterson14339b62009-01-31 16:36:08 +000013170 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
13171 detector, interned unicode strings are not forcibly deallocated;
13172 rather, we give them their stolen references back, and then clear
13173 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000013174
Benjamin Peterson14339b62009-01-31 16:36:08 +000013175 n = PyList_GET_SIZE(keys);
13176 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000013177 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013178 for (i = 0; i < n; i++) {
13179 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013180 if (PyUnicode_READY(s) == -1)
13181 fprintf(stderr, "could not ready string\n");
13182 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013183 case SSTATE_NOT_INTERNED:
13184 /* XXX Shouldn't happen */
13185 break;
13186 case SSTATE_INTERNED_IMMORTAL:
13187 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013188 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013189 break;
13190 case SSTATE_INTERNED_MORTAL:
13191 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013192 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013193 break;
13194 default:
13195 Py_FatalError("Inconsistent interned string state.");
13196 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013197 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013198 }
13199 fprintf(stderr, "total size of all interned strings: "
13200 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
13201 "mortal/immortal\n", mortal_size, immortal_size);
13202 Py_DECREF(keys);
13203 PyDict_Clear(interned);
13204 Py_DECREF(interned);
13205 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000013206}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013207
13208
13209/********************* Unicode Iterator **************************/
13210
/* State of an iterator over the characters of a str object. */
13211typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013212 PyObject_HEAD
13213 Py_ssize_t it_index; /* Index of the next character to return */
13214 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013215} unicodeiterobject;
13216
/* Destructor for str iterators: untrack the iterator from the GC, release
   the reference to the iterated string (if one is still held) and free
   the iterator itself. */
13217static void
13218unicodeiter_dealloc(unicodeiterobject *it)
13219{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013220 _PyObject_GC_UNTRACK(it);
13221 Py_XDECREF(it->it_seq);
13222 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013223}
13224
/* GC traversal for str iterators: the iterated string is the only object
   reference the iterator holds.  Py_VISIT returns early with the visitor's
   result if that result is non-zero. */
13225static int
13226unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
13227{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013228 Py_VISIT(it->it_seq);
13229 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013230}
13231
13232static PyObject *
13233unicodeiter_next(unicodeiterobject *it)
13234{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013235 PyUnicodeObject *seq;
13236 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013237
Benjamin Peterson14339b62009-01-31 16:36:08 +000013238 assert(it != NULL);
13239 seq = it->it_seq;
13240 if (seq == NULL)
13241 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013242 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013243
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013244 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
13245 int kind = PyUnicode_KIND(seq);
13246 void *data = PyUnicode_DATA(seq);
13247 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
13248 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013249 if (item != NULL)
13250 ++it->it_index;
13251 return item;
13252 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013253
Benjamin Peterson14339b62009-01-31 16:36:08 +000013254 Py_DECREF(seq);
13255 it->it_seq = NULL;
13256 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013257}
13258
13259static PyObject *
13260unicodeiter_len(unicodeiterobject *it)
13261{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013262 Py_ssize_t len = 0;
13263 if (it->it_seq)
13264 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
13265 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013266}
13267
/* Docstring and method table of the str iterator type; the only method,
   __length_hint__, takes no arguments and is served by unicodeiter_len. */
13268PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
13269
13270static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013271 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000013272 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000013273 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013274};
13275
/* Type object for "str_iterator", the GC-enabled iterator created by
   unicode_iter.  Slots after the last one listed are left to the
   compiler's zero-initialization. */
13276PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013277 PyVarObject_HEAD_INIT(&PyType_Type, 0)
13278 "str_iterator", /* tp_name */
13279 sizeof(unicodeiterobject), /* tp_basicsize */
13280 0, /* tp_itemsize */
13281 /* methods */
13282 (destructor)unicodeiter_dealloc, /* tp_dealloc */
13283 0, /* tp_print */
13284 0, /* tp_getattr */
13285 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000013286 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013287 0, /* tp_repr */
13288 0, /* tp_as_number */
13289 0, /* tp_as_sequence */
13290 0, /* tp_as_mapping */
13291 0, /* tp_hash */
13292 0, /* tp_call */
13293 0, /* tp_str */
13294 PyObject_GenericGetAttr, /* tp_getattro */
13295 0, /* tp_setattro */
13296 0, /* tp_as_buffer */
13297 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
13298 0, /* tp_doc */
13299 (traverseproc)unicodeiter_traverse, /* tp_traverse */
13300 0, /* tp_clear */
13301 0, /* tp_richcompare */
13302 0, /* tp_weaklistoffset */
13303 PyObject_SelfIter, /* tp_iter */
13304 (iternextfunc)unicodeiter_next, /* tp_iternext */
13305 unicodeiter_methods, /* tp_methods */
13306 0, /* tp_members */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013307};
13308
13309static PyObject *
13310unicode_iter(PyObject *seq)
13311{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013312 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013313
Benjamin Peterson14339b62009-01-31 16:36:08 +000013314 if (!PyUnicode_Check(seq)) {
13315 PyErr_BadInternalCall();
13316 return NULL;
13317 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013318 if (PyUnicode_READY(seq) == -1)
13319 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013320 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
13321 if (it == NULL)
13322 return NULL;
13323 it->it_index = 0;
13324 Py_INCREF(seq);
13325 it->it_seq = (PyUnicodeObject *)seq;
13326 _PyObject_GC_TRACK(it);
13327 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013328}
13329
/* Include uniops.h twice with different UNIOP/UNIOP_t settings: first with
   the Py_UNICODE_ name prefix and Py_UNICODE as the character type, then
   with the Py_UCS4_ prefix and Py_UCS4.
   NOTE(review): uniops.h is not visible from here; presumably it defines a
   family of UNIOP(...)-named helpers over UNIOP_t -- confirm. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013330#define UNIOP(x) Py_UNICODE_##x
13331#define UNIOP_t Py_UNICODE
13332#include "uniops.h"
13333#undef UNIOP
13334#undef UNIOP_t
13335#define UNIOP(x) Py_UCS4_##x
13336#define UNIOP_t Py_UCS4
13337#include "uniops.h"
13338#undef UNIOP
13339#undef UNIOP_t
Victor Stinner331ea922010-08-10 16:37:20 +000013340
Victor Stinner71133ff2010-09-01 23:43:53 +000013341Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000013342PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000013343{
13344 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
13345 Py_UNICODE *copy;
13346 Py_ssize_t size;
13347
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013348 if (!PyUnicode_Check(unicode)) {
13349 PyErr_BadArgument();
13350 return NULL;
13351 }
Victor Stinner71133ff2010-09-01 23:43:53 +000013352 /* Ensure we won't overflow the size. */
13353 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
13354 PyErr_NoMemory();
13355 return NULL;
13356 }
13357 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
13358 size *= sizeof(Py_UNICODE);
13359 copy = PyMem_Malloc(size);
13360 if (copy == NULL) {
13361 PyErr_NoMemory();
13362 return NULL;
13363 }
13364 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
13365 return copy;
13366}
Martin v. Löwis5b222132007-06-10 09:51:05 +000013367
Georg Brandl66c221e2010-10-14 07:04:07 +000013368/* A _string module, to export formatter_parser and formatter_field_name_split
13369 to the string.Formatter class implemented in Python. */
13370
/* Functions exported by the _string module; each takes exactly one
   argument (METH_O). */
13371static PyMethodDef _string_methods[] = {
13372 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
13373 METH_O, PyDoc_STR("split the argument as a field name")},
13374 {"formatter_parser", (PyCFunction) formatter_parser,
13375 METH_O, PyDoc_STR("parse the argument as a format string")},
13376 {NULL, NULL} /* sentinel */
13377};
13378
/* Module definition for _string. */
13379static struct PyModuleDef _string_module = {
13380 PyModuleDef_HEAD_INIT,
13381 "_string", /* m_name */
13382 PyDoc_STR("string helper module"), /* m_doc */
13383 0, /* m_size */
13384 _string_methods, /* m_methods */
13385 NULL, /* m_reload */
13386 NULL, /* m_traverse */
13387 NULL, /* m_clear */
13388 NULL /* m_free */
13389};
13390
/* Module initialization function for _string: creates the module object
   from _string_module and returns the result of PyModule_Create(). */
13391PyMODINIT_FUNC
13392PyInit__string(void)
13393{
13394 return PyModule_Create(&_string_module);
13395}
13396
13397
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013398#ifdef __cplusplus
13399}
13400#endif