blob: 87d661e8bece45d2a6af4b5930ed257afa603000 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000044
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000045#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000046#include <windows.h>
47#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000048
Guido van Rossumd57fd912000-03-10 22:53:23 +000049/* Limit for the Unicode object free list */
50
Christian Heimes2202f872008-02-06 14:31:34 +000051#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000052
53/* Limit for the Unicode object free list stay alive optimization.
54
55 The implementation will keep allocated Unicode memory intact for
56 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000057 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000058
Christian Heimes2202f872008-02-06 14:31:34 +000059 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000060 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000061 malloc()-overhead) bytes of unused garbage.
62
63 Setting the limit to 0 effectively turns the feature off.
64
Guido van Rossumfd4b9572000-04-10 13:51:10 +000065 Note: This is an experimental feature ! If you get core dumps when
66 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000067
68*/
69
Guido van Rossumfd4b9572000-04-10 13:51:10 +000070#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000071
72/* Endianness switches; defaults to little endian */
73
74#ifdef WORDS_BIGENDIAN
75# define BYTEORDER_IS_BIG_ENDIAN
76#else
77# define BYTEORDER_IS_LITTLE_ENDIAN
78#endif
79
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000080/* --- Globals ------------------------------------------------------------
81
82 The globals are initialized by the _PyUnicode_Init() API and should
83 not be used before calling that API.
84
85*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000086
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000087
88#ifdef __cplusplus
89extern "C" {
90#endif
91
Victor Stinner910337b2011-10-03 03:20:16 +020092#ifdef Py_DEBUG
93# define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op)
94#else
95# define _PyUnicode_CHECK(op) PyUnicode_Check(op)
96#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020097
Victor Stinnere90fe6a2011-10-01 16:48:13 +020098#define _PyUnicode_UTF8(op) \
99 (((PyCompactUnicodeObject*)(op))->utf8)
100#define PyUnicode_UTF8(op) \
Victor Stinner910337b2011-10-03 03:20:16 +0200101 (assert(_PyUnicode_CHECK(op)), \
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200102 assert(PyUnicode_IS_READY(op)), \
103 PyUnicode_IS_COMPACT_ASCII(op) ? \
104 ((char*)((PyASCIIObject*)(op) + 1)) : \
105 _PyUnicode_UTF8(op))
Victor Stinnerbc8b81b2011-09-29 19:31:34 +0200106#define _PyUnicode_UTF8_LENGTH(op) \
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200107 (((PyCompactUnicodeObject*)(op))->utf8_length)
108#define PyUnicode_UTF8_LENGTH(op) \
Victor Stinner910337b2011-10-03 03:20:16 +0200109 (assert(_PyUnicode_CHECK(op)), \
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200110 assert(PyUnicode_IS_READY(op)), \
111 PyUnicode_IS_COMPACT_ASCII(op) ? \
112 ((PyASCIIObject*)(op))->length : \
113 _PyUnicode_UTF8_LENGTH(op))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200114#define _PyUnicode_WSTR(op) (((PyASCIIObject*)(op))->wstr)
115#define _PyUnicode_WSTR_LENGTH(op) (((PyCompactUnicodeObject*)(op))->wstr_length)
116#define _PyUnicode_LENGTH(op) (((PyASCIIObject *)(op))->length)
117#define _PyUnicode_STATE(op) (((PyASCIIObject *)(op))->state)
118#define _PyUnicode_HASH(op) (((PyASCIIObject *)(op))->hash)
Victor Stinner910337b2011-10-03 03:20:16 +0200119#define _PyUnicode_KIND(op) \
120 (assert(_PyUnicode_CHECK(op)), \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200121 ((PyASCIIObject *)(op))->state.kind)
Victor Stinner910337b2011-10-03 03:20:16 +0200122#define _PyUnicode_GET_LENGTH(op) \
123 (assert(_PyUnicode_CHECK(op)), \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200124 ((PyASCIIObject *)(op))->length)
Victor Stinnerc3c74152011-10-02 20:39:55 +0200125#define _PyUnicode_DATA_ANY(op) (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200126
Victor Stinner910337b2011-10-03 03:20:16 +0200127#undef PyUnicode_READY
128#define PyUnicode_READY(op) \
129 (assert(_PyUnicode_CHECK(op)), \
130 (PyUnicode_IS_READY(op) ? \
131 0 : _PyUnicode_Ready((PyObject *)(op))))
132
/* Ready the string referenced through p_obj, possibly replacing *p_obj.
   Evaluates to 0 when *p_obj is already ready, otherwise to the result of
   _PyUnicode_ReadyReplace().  The argument is expanded several times, so it
   must not have side effects; it is parenthesized at every use so that
   expressions such as "ptr + 1" dereference the intended object. */
#define _PyUnicode_READY_REPLACE(p_obj)                 \
    (assert(_PyUnicode_CHECK(*(p_obj))),                \
     (PyUnicode_IS_READY(*(p_obj)) ?                    \
      0 : _PyUnicode_ReadyReplace((PyObject **)(p_obj))))
137
/* True if the UTF-8 pointer of the string is the very same pointer as its
   canonical data buffer.  Must not be used on compact ASCII strings. */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wstr pointer of the string is the very same pointer as its
   canonical data buffer.  Uses the macro argument throughout instead of
   relying on a caller variable that happens to be named "unicode". */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
145
Victor Stinner829c0ad2011-10-03 01:08:02 +0200146/* true if the Unicode object has an allocated UTF-8 memory block
147 (not shared with other data) */
Victor Stinner910337b2011-10-03 03:20:16 +0200148#define _PyUnicode_HAS_UTF8_MEMORY(op) \
149 (assert(_PyUnicode_CHECK(op)), \
150 (!PyUnicode_IS_COMPACT_ASCII(op) \
151 && _PyUnicode_UTF8(op) \
Victor Stinner829c0ad2011-10-03 01:08:02 +0200152 && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))
153
Victor Stinner03490912011-10-03 23:45:12 +0200154/* true if the Unicode object has an allocated wstr memory block
155 (not shared with other data) */
156#define _PyUnicode_HAS_WSTR_MEMORY(op) \
157 (assert(_PyUnicode_CHECK(op)), \
158 (_PyUnicode_WSTR(op) && \
159 (!PyUnicode_IS_READY(op) || \
160 _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
161
Victor Stinner910337b2011-10-03 03:20:16 +0200162/* Generic helper macro to convert characters of different types.
163 from_type and to_type have to be valid type names, begin and end
164 are pointers to the source characters which should be of type
165 "from_type *". to is a pointer of type "to_type *" and points to the
166 buffer where the result characters are written to. */
167#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
168 do { \
169 const from_type *iter_; to_type *to_; \
170 for (iter_ = (begin), to_ = (to_type *)(to); \
171 iter_ < (end); \
172 ++iter_, ++to_) { \
173 *to_ = (to_type)*iter_; \
174 } \
175 } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200176
Victor Stinnerb15d4d82011-09-28 23:59:20 +0200177/* The Unicode string has been modified: reset the hash */
178#define _PyUnicode_DIRTY(op) do { _PyUnicode_HASH(op) = -1; } while (0)
179
Walter Dörwald16807132007-05-25 13:52:07 +0000180/* This dictionary holds all interned unicode strings. Note that references
181 to strings in this dictionary are *not* counted in the string's ob_refcnt.
182 When the interned string reaches a refcnt of 0 the string deallocation
183 function will delete the reference from this dictionary.
184
   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000187*/
188static PyObject *interned;
189
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000190/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200191static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000192
193/* Single character Unicode strings in the Latin-1 range are being
194 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200195static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000196
Christian Heimes190d79e2008-01-30 11:58:22 +0000197/* Fast detection of the most frequent whitespace characters */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x0F: CHARACTER TABULATION (0x09), LINE FEED (0x0A),
       LINE TABULATION (0x0B), FORM FEED (0x0C), CARRIAGE RETURN (0x0D) */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F: FILE SEPARATOR (0x1C), GROUP SEPARATOR (0x1D),
       RECORD SEPARATOR (0x1E), UNIT SEPARATOR (0x1F) */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x2F: SPACE (0x20) */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F: no whitespace */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
227
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200228/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200229static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200230static PyObject* get_latin1_char(unsigned char ch);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200231
Alexander Belopolsky40018472011-02-26 01:02:56 +0000232static PyObject *
233unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000234 PyObject **errorHandler,const char *encoding, const char *reason,
235 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
236 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
237
Alexander Belopolsky40018472011-02-26 01:02:56 +0000238static void
239raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300240 const char *encoding,
241 const Py_UNICODE *unicode, Py_ssize_t size,
242 Py_ssize_t startpos, Py_ssize_t endpos,
243 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000244
Christian Heimes190d79e2008-01-30 11:58:22 +0000245/* Same for linebreaks */
/* Lookup table indexed by ASCII code: 1 for line break characters, 0
   otherwise.  The table is only ever read, so it is declared const. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
272
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300273/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
274 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000275Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000276PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000277{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000278#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000279 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000280#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000281 /* This is actually an illegal character, so it should
282 not be passed to unichr. */
283 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000284#endif
285}
286
Victor Stinner910337b2011-10-03 03:20:16 +0200287#ifdef Py_DEBUG
288static int
289_PyUnicode_CheckConsistency(void *op)
290{
291 PyASCIIObject *ascii;
292 unsigned int kind;
293
294 assert(PyUnicode_Check(op));
295
296 ascii = (PyASCIIObject *)op;
297 kind = ascii->state.kind;
298
Victor Stinnera3b334d2011-10-03 13:53:37 +0200299 if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
Victor Stinner910337b2011-10-03 03:20:16 +0200300 assert(kind == PyUnicode_1BYTE_KIND);
Victor Stinner910337b2011-10-03 03:20:16 +0200301 assert(ascii->state.ready == 1);
302 }
303 else if (ascii->state.compact == 1) {
Victor Stinner85041a52011-10-03 14:42:39 +0200304 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
Victor Stinner7f11ad42011-10-04 00:00:20 +0200305 void *data;
Victor Stinner910337b2011-10-03 03:20:16 +0200306 assert(kind == PyUnicode_1BYTE_KIND
307 || kind == PyUnicode_2BYTE_KIND
308 || kind == PyUnicode_4BYTE_KIND);
Victor Stinner910337b2011-10-03 03:20:16 +0200309 assert(ascii->state.ascii == 0);
310 assert(ascii->state.ready == 1);
Victor Stinner7f11ad42011-10-04 00:00:20 +0200311 data = compact + 1;
312 assert (compact->utf8 != data);
313 if (
314#if SIZEOF_WCHAR_T == 2
315 kind == PyUnicode_2BYTE_KIND
316#else
317 kind == PyUnicode_4BYTE_KIND
318#endif
319 )
320 assert(ascii->wstr == data);
321 else
322 assert(ascii->wstr != data);
Victor Stinner910337b2011-10-03 03:20:16 +0200323 } else {
324 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
325 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
326
327 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinnera3b334d2011-10-03 13:53:37 +0200328 assert(ascii->state.compact == 0);
Victor Stinner910337b2011-10-03 03:20:16 +0200329 assert(ascii->state.ascii == 0);
Victor Stinnera3b334d2011-10-03 13:53:37 +0200330 assert(ascii->state.ready == 0);
Victor Stinner910337b2011-10-03 03:20:16 +0200331 assert(ascii->wstr != NULL);
332 assert(unicode->data.any == NULL);
333 assert(compact->utf8 == NULL);
334 assert(ascii->state.interned == SSTATE_NOT_INTERNED);
335 }
336 else {
337 assert(kind == PyUnicode_1BYTE_KIND
338 || kind == PyUnicode_2BYTE_KIND
339 || kind == PyUnicode_4BYTE_KIND);
Victor Stinnera3b334d2011-10-03 13:53:37 +0200340 assert(ascii->state.compact == 0);
Victor Stinner910337b2011-10-03 03:20:16 +0200341 assert(ascii->state.ready == 1);
342 assert(unicode->data.any != NULL);
Victor Stinner85041a52011-10-03 14:42:39 +0200343 if (ascii->state.ascii)
344 assert (compact->utf8 == unicode->data.any);
345 else
346 assert (compact->utf8 != unicode->data.any);
Victor Stinner7f11ad42011-10-04 00:00:20 +0200347 if (
348#if SIZEOF_WCHAR_T == 2
349 kind == PyUnicode_2BYTE_KIND
350#else
351 kind == PyUnicode_4BYTE_KIND
352#endif
353 )
354 assert(ascii->wstr == unicode->data.any);
355 else
356 assert(ascii->wstr != unicode->data.any);
Victor Stinner910337b2011-10-03 03:20:16 +0200357 }
358 }
359 return 1;
360}
361#endif
362
Thomas Wouters477c8d52006-05-27 19:21:47 +0000363/* --- Bloom Filters ----------------------------------------------------- */
364
/* Support for simple "bloom filters" over Unicode characters.
   To keep things simple, a single bitmask is used; the low bits of each
   Unicode character (ch & (BLOOM_WIDTH - 1)) select the bit index. */
368
369/* the linebreak mask is set up by Unicode_Init below */
370
Antoine Pitrouf068f942010-01-13 14:19:12 +0000371#if LONG_BIT >= 128
372#define BLOOM_WIDTH 128
373#elif LONG_BIT >= 64
374#define BLOOM_WIDTH 64
375#elif LONG_BIT >= 32
376#define BLOOM_WIDTH 32
377#else
378#error "LONG_BIT is smaller than 32"
379#endif
380
Thomas Wouters477c8d52006-05-27 19:21:47 +0000381#define BLOOM_MASK unsigned long
382
383static BLOOM_MASK bloom_linebreak;
384
/* BLOOM_ADD sets the bit selected by ch in mask (mask must be an lvalue);
   BLOOM tests that bit.  Both arguments are parenthesized so that compound
   expressions can be passed safely. */
#define BLOOM_ADD(mask, ch) ((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))
#define BLOOM(mask, ch)     ((mask) &  (1UL << ((ch) & (BLOOM_WIDTH - 1))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000387
Benjamin Peterson29060642009-01-31 22:14:21 +0000388#define BLOOM_LINEBREAK(ch) \
389 ((ch) < 128U ? ascii_linebreak[(ch)] : \
390 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000391
Alexander Belopolsky40018472011-02-26 01:02:56 +0000392Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200393make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000394{
395 /* calculate simple bloom-style bitmask for a given unicode string */
396
Antoine Pitrouf068f942010-01-13 14:19:12 +0000397 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000398 Py_ssize_t i;
399
400 mask = 0;
401 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200402 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000403
404 return mask;
405}
406
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200407#define BLOOM_MEMBER(mask, chr, str) \
408 (BLOOM(mask, chr) \
409 && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000410
Guido van Rossumd57fd912000-03-10 22:53:23 +0000411/* --- Unicode Object ----------------------------------------------------- */
412
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200413static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200414fixup(PyUnicodeObject *self, Py_UCS4 (*fixfct)(PyUnicodeObject *s));
415
416Py_LOCAL_INLINE(char *) findchar(void *s, int kind,
417 Py_ssize_t size, Py_UCS4 ch,
418 int direction)
419{
420 /* like wcschr, but doesn't stop at NULL characters */
421 Py_ssize_t i;
422 if (direction == 1) {
423 for(i = 0; i < size; i++)
424 if (PyUnicode_READ(kind, s, i) == ch)
425 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
426 }
427 else {
428 for(i = size-1; i >= 0; i--)
429 if (PyUnicode_READ(kind, s, i) == ch)
430 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
431 }
432 return NULL;
433}
434
Victor Stinnerfe226c02011-10-03 03:52:20 +0200435static PyObject*
436resize_compact(PyObject *unicode, Py_ssize_t length)
437{
438 Py_ssize_t char_size;
439 Py_ssize_t struct_size;
440 Py_ssize_t new_size;
441 int share_wstr;
442
443 assert(PyUnicode_IS_READY(unicode));
444 char_size = PyUnicode_CHARACTER_SIZE(unicode);
445 if (PyUnicode_IS_COMPACT_ASCII(unicode))
446 struct_size = sizeof(PyASCIIObject);
447 else
448 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200449 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200450
451 _Py_DEC_REFTOTAL;
452 _Py_ForgetReference(unicode);
453
454 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
455 PyErr_NoMemory();
456 return NULL;
457 }
458 new_size = (struct_size + (length + 1) * char_size);
459
460 unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
461 if (unicode == NULL) {
462 PyObject_Del(unicode);
463 PyErr_NoMemory();
464 return NULL;
465 }
466 _Py_NewReference(unicode);
467 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200468 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200469 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200470 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
471 _PyUnicode_WSTR_LENGTH(unicode) = length;
472 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200473 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
474 length, 0);
475 return unicode;
476}
477
Alexander Belopolsky40018472011-02-26 01:02:56 +0000478static int
Victor Stinnerfe226c02011-10-03 03:52:20 +0200479resize_inplace(register PyUnicodeObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000480{
481 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000482
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200483 assert(!PyUnicode_IS_COMPACT(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200484
Victor Stinnerfe226c02011-10-03 03:52:20 +0200485 assert(Py_REFCNT(unicode) == 1);
Victor Stinnerb15d4d82011-09-28 23:59:20 +0200486 _PyUnicode_DIRTY(unicode);
Tim Petersced69f82003-09-16 20:30:58 +0000487
Victor Stinnerfe226c02011-10-03 03:52:20 +0200488 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
489 {
490 PyObject_DEL(_PyUnicode_UTF8(unicode));
491 _PyUnicode_UTF8(unicode) = NULL;
492 }
493
494 if (PyUnicode_IS_READY(unicode)) {
495 Py_ssize_t char_size;
496 Py_ssize_t new_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200497 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200498 void *data;
499
500 data = _PyUnicode_DATA_ANY(unicode);
501 assert(data != NULL);
502 char_size = PyUnicode_CHARACTER_SIZE(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200503 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
504 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200505
506 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
507 PyErr_NoMemory();
508 return -1;
509 }
510 new_size = (length + 1) * char_size;
511
512 data = (PyObject *)PyObject_REALLOC(data, new_size);
513 if (data == NULL) {
514 PyErr_NoMemory();
515 return -1;
516 }
517 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200518 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200519 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200520 _PyUnicode_WSTR_LENGTH(unicode) = length;
521 }
522 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200523 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200524 _PyUnicode_UTF8_LENGTH(unicode) = length;
525 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200526 _PyUnicode_LENGTH(unicode) = length;
527 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
528 if (share_wstr)
529 return 0;
530 }
531 if (_PyUnicode_WSTR(unicode) != NULL) {
532 assert(_PyUnicode_WSTR(unicode) != NULL);
533
534 oldstr = _PyUnicode_WSTR(unicode);
535 _PyUnicode_WSTR(unicode) = PyObject_REALLOC(_PyUnicode_WSTR(unicode),
536 sizeof(Py_UNICODE) * (length + 1));
537 if (!_PyUnicode_WSTR(unicode)) {
538 _PyUnicode_WSTR(unicode) = (Py_UNICODE *)oldstr;
539 PyErr_NoMemory();
540 return -1;
541 }
542 _PyUnicode_WSTR(unicode)[length] = 0;
543 _PyUnicode_WSTR_LENGTH(unicode) = length;
544 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000545 return 0;
546}
547
Victor Stinnerfe226c02011-10-03 03:52:20 +0200548static PyObject*
549resize_copy(PyObject *unicode, Py_ssize_t length)
550{
551 Py_ssize_t copy_length;
552 if (PyUnicode_IS_COMPACT(unicode)) {
553 PyObject *copy;
554 assert(PyUnicode_IS_READY(unicode));
555
556 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
557 if (copy == NULL)
558 return NULL;
559
560 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
561 if (PyUnicode_CopyCharacters(copy, 0,
562 unicode, 0,
563 copy_length) < 0)
564 {
565 Py_DECREF(copy);
566 return NULL;
567 }
568 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200569 }
570 else {
Victor Stinner2fd82272011-10-03 04:06:05 +0200571 PyUnicodeObject *w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200572 assert(_PyUnicode_WSTR(unicode) != NULL);
573 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinner2fd82272011-10-03 04:06:05 +0200574 w = _PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200575 if (w == NULL)
576 return NULL;
577 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
578 copy_length = Py_MIN(copy_length, length);
579 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
580 copy_length);
581 return (PyObject*)w;
582 }
583}
584
/* We allocate one extra character (not just one byte) to make sure the
   string is always U+0000 terminated; some code (e.g. new_identifier)
   relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000588
589 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000590 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000591
592*/
593
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200594#ifdef Py_DEBUG
595int unicode_old_new_calls = 0;
596#endif
597
Alexander Belopolsky40018472011-02-26 01:02:56 +0000598static PyUnicodeObject *
599_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000600{
601 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200602 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000603
Thomas Wouters477c8d52006-05-27 19:21:47 +0000604 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000605 if (length == 0 && unicode_empty != NULL) {
606 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200607 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000608 }
609
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000610 /* Ensure we won't overflow the size. */
611 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
612 return (PyUnicodeObject *)PyErr_NoMemory();
613 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200614 if (length < 0) {
615 PyErr_SetString(PyExc_SystemError,
616 "Negative size passed to _PyUnicode_New");
617 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000618 }
619
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200620#ifdef Py_DEBUG
621 ++unicode_old_new_calls;
622#endif
623
624 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
625 if (unicode == NULL)
626 return NULL;
627 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
628 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
629 if (!_PyUnicode_WSTR(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000630 PyErr_NoMemory();
631 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000632 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200633
Jeremy Hyltond8082792003-09-16 19:41:39 +0000634 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000635 * the caller fails before initializing str -- unicode_resize()
636 * reads str[0], and the Keep-Alive optimization can keep memory
637 * allocated for str alive across a call to unicode_dealloc(unicode).
638 * We don't want unicode_resize to read uninitialized memory in
639 * that case.
640 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200641 _PyUnicode_WSTR(unicode)[0] = 0;
642 _PyUnicode_WSTR(unicode)[length] = 0;
643 _PyUnicode_WSTR_LENGTH(unicode) = length;
644 _PyUnicode_HASH(unicode) = -1;
645 _PyUnicode_STATE(unicode).interned = 0;
646 _PyUnicode_STATE(unicode).kind = 0;
647 _PyUnicode_STATE(unicode).compact = 0;
648 _PyUnicode_STATE(unicode).ready = 0;
649 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200650 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200651 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200652 _PyUnicode_UTF8(unicode) = NULL;
653 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000654 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000655
Benjamin Peterson29060642009-01-31 22:14:21 +0000656 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000657 /* XXX UNREF/NEWREF interface should be more symmetrical */
658 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000659 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000660 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000661 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000662}
663
Victor Stinnerf42dc442011-10-02 23:33:16 +0200664static const char*
665unicode_kind_name(PyObject *unicode)
666{
Victor Stinner42dfd712011-10-03 14:41:45 +0200667 /* don't check consistency: unicode_kind_name() is called from
668 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200669 if (!PyUnicode_IS_COMPACT(unicode))
670 {
671 if (!PyUnicode_IS_READY(unicode))
672 return "wstr";
673 switch(PyUnicode_KIND(unicode))
674 {
675 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200676 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200677 return "legacy ascii";
678 else
679 return "legacy latin1";
680 case PyUnicode_2BYTE_KIND:
681 return "legacy UCS2";
682 case PyUnicode_4BYTE_KIND:
683 return "legacy UCS4";
684 default:
685 return "<legacy invalid kind>";
686 }
687 }
688 assert(PyUnicode_IS_READY(unicode));
689 switch(PyUnicode_KIND(unicode))
690 {
691 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200692 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200693 return "ascii";
694 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200695 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200696 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200697 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200698 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200699 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200700 default:
701 return "<invalid compact kind>";
702 }
703}
704
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200705#ifdef Py_DEBUG
706int unicode_new_new_calls = 0;
707
708/* Functions wrapping macros for use in debugger */
/* Function form of the PyUnicode_UTF8() macro, callable from a debugger. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
712
/* Function form of the _PyUnicode_COMPACT_DATA() macro, callable from a
   debugger. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
716void *_PyUnicode_data(void *unicode){
717 printf("obj %p\n", unicode);
718 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
719 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
720 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
721 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
722 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
723 return PyUnicode_DATA(unicode);
724}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200725
726void
727_PyUnicode_Dump(PyObject *op)
728{
729 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200730 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
731 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
732 void *data;
733 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
734 if (ascii->state.compact)
735 data = (compact + 1);
736 else
737 data = unicode->data.any;
738 if (ascii->wstr == data)
739 printf("shared ");
740 printf("wstr=%p", ascii->wstr);
Victor Stinnera3b334d2011-10-03 13:53:37 +0200741 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +0200742 printf(" (%zu), ", compact->wstr_length);
743 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
744 printf("shared ");
745 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200746 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200747 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200748}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200749#endif
750
751PyObject *
752PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
753{
754 PyObject *obj;
755 PyCompactUnicodeObject *unicode;
756 void *data;
757 int kind_state;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200758 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200759 Py_ssize_t char_size;
760 Py_ssize_t struct_size;
761
762 /* Optimization for empty strings */
763 if (size == 0 && unicode_empty != NULL) {
764 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200765 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200766 }
767
768#ifdef Py_DEBUG
769 ++unicode_new_new_calls;
770#endif
771
Victor Stinner9e9d6892011-10-04 01:02:02 +0200772 is_ascii = 0;
773 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200774 struct_size = sizeof(PyCompactUnicodeObject);
775 if (maxchar < 128) {
776 kind_state = PyUnicode_1BYTE_KIND;
777 char_size = 1;
778 is_ascii = 1;
779 struct_size = sizeof(PyASCIIObject);
780 }
781 else if (maxchar < 256) {
782 kind_state = PyUnicode_1BYTE_KIND;
783 char_size = 1;
784 }
785 else if (maxchar < 65536) {
786 kind_state = PyUnicode_2BYTE_KIND;
787 char_size = 2;
788 if (sizeof(wchar_t) == 2)
789 is_sharing = 1;
790 }
791 else {
792 kind_state = PyUnicode_4BYTE_KIND;
793 char_size = 4;
794 if (sizeof(wchar_t) == 4)
795 is_sharing = 1;
796 }
797
798 /* Ensure we won't overflow the size. */
799 if (size < 0) {
800 PyErr_SetString(PyExc_SystemError,
801 "Negative size passed to PyUnicode_New");
802 return NULL;
803 }
804 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
805 return PyErr_NoMemory();
806
807 /* Duplicated allocation code from _PyObject_New() instead of a call to
808 * PyObject_New() so we are able to allocate space for the object and
809 * it's data buffer.
810 */
811 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
812 if (obj == NULL)
813 return PyErr_NoMemory();
814 obj = PyObject_INIT(obj, &PyUnicode_Type);
815 if (obj == NULL)
816 return NULL;
817
818 unicode = (PyCompactUnicodeObject *)obj;
819 if (is_ascii)
820 data = ((PyASCIIObject*)obj) + 1;
821 else
822 data = unicode + 1;
823 _PyUnicode_LENGTH(unicode) = size;
824 _PyUnicode_HASH(unicode) = -1;
825 _PyUnicode_STATE(unicode).interned = 0;
826 _PyUnicode_STATE(unicode).kind = kind_state;
827 _PyUnicode_STATE(unicode).compact = 1;
828 _PyUnicode_STATE(unicode).ready = 1;
829 _PyUnicode_STATE(unicode).ascii = is_ascii;
830 if (is_ascii) {
831 ((char*)data)[size] = 0;
832 _PyUnicode_WSTR(unicode) = NULL;
833 }
834 else if (kind_state == PyUnicode_1BYTE_KIND) {
835 ((char*)data)[size] = 0;
836 _PyUnicode_WSTR(unicode) = NULL;
837 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200838 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200839 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200840 }
841 else {
842 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200843 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200844 if (kind_state == PyUnicode_2BYTE_KIND)
845 ((Py_UCS2*)data)[size] = 0;
846 else /* kind_state == PyUnicode_4BYTE_KIND */
847 ((Py_UCS4*)data)[size] = 0;
848 if (is_sharing) {
849 _PyUnicode_WSTR_LENGTH(unicode) = size;
850 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
851 }
852 else {
853 _PyUnicode_WSTR_LENGTH(unicode) = 0;
854 _PyUnicode_WSTR(unicode) = NULL;
855 }
856 }
857 return obj;
858}
859
#if SIZEOF_WCHAR_T == 2
/* Convert the 16-bit wchar_t range [begin, end) to UCS4 and store the
   result in the 4-byte data buffer of `unicode`.  A high surrogate that is
   immediately followed by a low surrogate is decoded into a single code
   point; every other unit is copied unchanged.  The other conversions are
   implemented as macros for efficiency.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyUnicodeObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        Py_UCS4 ch;

        /* never write beyond the declared length of the target */
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (src[0] >= 0xD800 && src[0] <= 0xDBFF
            && (src + 1) < end
            && src[1] >= 0xDC00 && src[1] <= 0xDFFF)
        {
            /* surrogate pair: two input units yield one code point */
            ch = (((src[0] & 0x3FF) << 10) | (src[1] & 0x3FF)) + 0x10000;
            src += 2;
        }
        else {
            ch = *src;
            src++;
        }
        *dst++ = ch;
    }
    /* the target must have been filled exactly */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
898
Victor Stinnercd9950f2011-10-02 00:34:53 +0200899static int
900_PyUnicode_Dirty(PyObject *unicode)
901{
Victor Stinner910337b2011-10-03 03:20:16 +0200902 assert(_PyUnicode_CHECK(unicode));
Victor Stinnercd9950f2011-10-02 00:34:53 +0200903 if (Py_REFCNT(unicode) != 1) {
Victor Stinner01698042011-10-04 00:04:26 +0200904 PyErr_SetString(PyExc_SystemError,
Victor Stinnercd9950f2011-10-02 00:34:53 +0200905 "Cannot modify a string having more than 1 reference");
906 return -1;
907 }
908 _PyUnicode_DIRTY(unicode);
909 return 0;
910}
911
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200912Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200913PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
914 PyObject *from, Py_ssize_t from_start,
915 Py_ssize_t how_many)
916{
Victor Stinnera0702ab2011-09-29 14:14:38 +0200917 unsigned int from_kind, to_kind;
918 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200919
Victor Stinnerb1536152011-09-30 02:26:10 +0200920 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
921 PyErr_BadInternalCall();
922 return -1;
923 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200924
925 if (PyUnicode_READY(from))
926 return -1;
927 if (PyUnicode_READY(to))
928 return -1;
929
Victor Stinnerff9e50f2011-09-28 22:17:19 +0200930 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200931 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
Victor Stinner01698042011-10-04 00:04:26 +0200932 PyErr_Format(PyExc_SystemError,
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200933 "Cannot write %zi characters at %zi "
934 "in a string of %zi characters",
935 how_many, to_start, PyUnicode_GET_LENGTH(to));
936 return -1;
937 }
Victor Stinnerf5ca1a22011-09-28 23:54:59 +0200938 if (how_many == 0)
939 return 0;
940
Victor Stinnercd9950f2011-10-02 00:34:53 +0200941 if (_PyUnicode_Dirty(to))
Victor Stinnerf5ca1a22011-09-28 23:54:59 +0200942 return -1;
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200943
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200944 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200945 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200946 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200947 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200948
Victor Stinnerf42dc442011-10-02 23:33:16 +0200949 if (from_kind == to_kind
950 /* deny latin1 => ascii */
951 && PyUnicode_MAX_CHAR_VALUE(to) >= PyUnicode_MAX_CHAR_VALUE(from))
952 {
Victor Stinnera0702ab2011-09-29 14:14:38 +0200953 Py_MEMCPY((char*)to_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200954 + PyUnicode_KIND_SIZE(to_kind, to_start),
Victor Stinnera0702ab2011-09-29 14:14:38 +0200955 (char*)from_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200956 + PyUnicode_KIND_SIZE(from_kind, from_start),
957 PyUnicode_KIND_SIZE(to_kind, how_many));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200958 }
Victor Stinnera0702ab2011-09-29 14:14:38 +0200959 else if (from_kind == PyUnicode_1BYTE_KIND
960 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200961 {
962 _PyUnicode_CONVERT_BYTES(
963 Py_UCS1, Py_UCS2,
964 PyUnicode_1BYTE_DATA(from) + from_start,
965 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
966 PyUnicode_2BYTE_DATA(to) + to_start
967 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200968 }
Victor Stinner157f83f2011-09-28 21:41:31 +0200969 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200970 && to_kind == PyUnicode_4BYTE_KIND)
971 {
972 _PyUnicode_CONVERT_BYTES(
973 Py_UCS1, Py_UCS4,
974 PyUnicode_1BYTE_DATA(from) + from_start,
975 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
976 PyUnicode_4BYTE_DATA(to) + to_start
977 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200978 }
979 else if (from_kind == PyUnicode_2BYTE_KIND
980 && to_kind == PyUnicode_4BYTE_KIND)
981 {
982 _PyUnicode_CONVERT_BYTES(
983 Py_UCS2, Py_UCS4,
984 PyUnicode_2BYTE_DATA(from) + from_start,
985 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
986 PyUnicode_4BYTE_DATA(to) + to_start
987 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200988 }
Victor Stinnera0702ab2011-09-29 14:14:38 +0200989 else {
990 int invalid_kinds;
Victor Stinnerf42dc442011-10-02 23:33:16 +0200991
992 /* check if max_char(from substring) <= max_char(to) */
993 if (from_kind > to_kind
994 /* latin1 => ascii */
Victor Stinnera3b334d2011-10-03 13:53:37 +0200995 || (PyUnicode_IS_ASCII(to)
Victor Stinnerf42dc442011-10-02 23:33:16 +0200996 && to_kind == PyUnicode_1BYTE_KIND
Victor Stinnera3b334d2011-10-03 13:53:37 +0200997 && !PyUnicode_IS_ASCII(from)))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200998 {
Victor Stinnera0702ab2011-09-29 14:14:38 +0200999 /* slow path to check for character overflow */
1000 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1001 Py_UCS4 ch, maxchar;
1002 Py_ssize_t i;
1003
1004 maxchar = 0;
1005 invalid_kinds = 0;
1006 for (i=0; i < how_many; i++) {
1007 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1008 if (ch > maxchar) {
1009 maxchar = ch;
1010 if (maxchar > to_maxchar) {
1011 invalid_kinds = 1;
1012 break;
1013 }
1014 }
1015 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1016 }
1017 }
1018 else
1019 invalid_kinds = 1;
1020 if (invalid_kinds) {
Victor Stinner01698042011-10-04 00:04:26 +02001021 PyErr_Format(PyExc_SystemError,
Victor Stinnerf42dc442011-10-02 23:33:16 +02001022 "Cannot copy %s characters "
1023 "into a string of %s characters",
1024 unicode_kind_name(from),
1025 unicode_kind_name(to));
Victor Stinnera0702ab2011-09-29 14:14:38 +02001026 return -1;
1027 }
1028 }
1029 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001030}
1031
Victor Stinner17222162011-09-28 22:15:37 +02001032/* Find the maximum code point and count the number of surrogate pairs so a
1033 correct string length can be computed before converting a string to UCS4.
1034 This function counts single surrogates as a character and not as a pair.
1035
1036 Return 0 on success, or -1 on error. */
1037static int
1038find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1039 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001040{
1041 const wchar_t *iter;
1042
Victor Stinnerc53be962011-10-02 21:33:54 +02001043 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001044 if (num_surrogates == NULL || maxchar == NULL) {
1045 PyErr_SetString(PyExc_SystemError,
1046 "unexpected NULL arguments to "
1047 "PyUnicode_FindMaxCharAndNumSurrogatePairs");
1048 return -1;
1049 }
1050
1051 *num_surrogates = 0;
1052 *maxchar = 0;
1053
1054 for (iter = begin; iter < end; ) {
1055 if (*iter > *maxchar)
1056 *maxchar = *iter;
1057#if SIZEOF_WCHAR_T == 2
1058 if (*iter >= 0xD800 && *iter <= 0xDBFF
1059 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
1060 {
1061 Py_UCS4 surrogate_val;
1062 surrogate_val = (((iter[0] & 0x3FF)<<10)
1063 | (iter[1] & 0x3FF)) + 0x10000;
1064 ++(*num_surrogates);
1065 if (surrogate_val > *maxchar)
1066 *maxchar = surrogate_val;
1067 iter += 2;
1068 }
1069 else
1070 iter++;
1071#else
1072 iter++;
1073#endif
1074 }
1075 return 0;
1076}
1077
1078#ifdef Py_DEBUG
1079int unicode_ready_calls = 0;
1080#endif
1081
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001082static int
1083unicode_ready(PyObject **p_obj, int replace)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001084{
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001085 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001086 wchar_t *end;
1087 Py_UCS4 maxchar = 0;
1088 Py_ssize_t num_surrogates;
1089#if SIZEOF_WCHAR_T == 2
1090 Py_ssize_t length_wo_surrogates;
1091#endif
1092
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001093 assert(p_obj != NULL);
1094 unicode = (PyUnicodeObject *)*p_obj;
1095
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001096 /* _PyUnicode_Ready() is only intented for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001097 strings were created using _PyObject_New() and where no canonical
1098 representation (the str field) has been set yet aka strings
1099 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001100 assert(_PyUnicode_CHECK(unicode));
1101 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001102 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001103 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001104 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001105 /* Actually, it should neither be interned nor be anything else: */
1106 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001107
1108#ifdef Py_DEBUG
1109 ++unicode_ready_calls;
1110#endif
1111
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001112#ifdef Py_DEBUG
1113 assert(!replace || Py_REFCNT(unicode) == 1);
1114#else
1115 if (replace && Py_REFCNT(unicode) != 1)
1116 replace = 0;
1117#endif
1118 if (replace) {
1119 Py_ssize_t len = _PyUnicode_WSTR_LENGTH(unicode);
1120 wchar_t *wstr = _PyUnicode_WSTR(unicode);
1121 /* Optimization for empty strings */
1122 if (len == 0) {
1123 Py_INCREF(unicode_empty);
1124 Py_DECREF(*p_obj);
1125 *p_obj = unicode_empty;
1126 return 0;
1127 }
1128 if (len == 1 && wstr[0] < 256) {
1129 PyObject *latin1_char = get_latin1_char((unsigned char)wstr[0]);
1130 if (latin1_char == NULL)
1131 return -1;
1132 Py_DECREF(*p_obj);
1133 *p_obj = latin1_char;
1134 return 0;
1135 }
1136 }
1137
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001138 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001139 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001140 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001141 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001142
1143 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001144 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1145 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001146 PyErr_NoMemory();
1147 return -1;
1148 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001149 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001150 _PyUnicode_WSTR(unicode), end,
1151 PyUnicode_1BYTE_DATA(unicode));
1152 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1153 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1154 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1155 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001156 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001157 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001158 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001159 }
1160 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001161 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001162 _PyUnicode_UTF8(unicode) = NULL;
1163 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001164 }
1165 PyObject_FREE(_PyUnicode_WSTR(unicode));
1166 _PyUnicode_WSTR(unicode) = NULL;
1167 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1168 }
1169 /* In this case we might have to convert down from 4-byte native
1170 wchar_t to 2-byte unicode. */
1171 else if (maxchar < 65536) {
1172 assert(num_surrogates == 0 &&
1173 "FindMaxCharAndNumSurrogatePairs() messed up");
1174
Victor Stinner506f5922011-09-28 22:34:18 +02001175#if SIZEOF_WCHAR_T == 2
1176 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001177 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001178 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1179 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1180 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001181 _PyUnicode_UTF8(unicode) = NULL;
1182 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001183#else
1184 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001185 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001186 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001187 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001188 PyErr_NoMemory();
1189 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001190 }
Victor Stinner506f5922011-09-28 22:34:18 +02001191 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1192 _PyUnicode_WSTR(unicode), end,
1193 PyUnicode_2BYTE_DATA(unicode));
1194 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1195 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1196 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001197 _PyUnicode_UTF8(unicode) = NULL;
1198 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001199 PyObject_FREE(_PyUnicode_WSTR(unicode));
1200 _PyUnicode_WSTR(unicode) = NULL;
1201 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1202#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001203 }
1204 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1205 else {
1206#if SIZEOF_WCHAR_T == 2
1207 /* in case the native representation is 2-bytes, we need to allocate a
1208 new normalized 4-byte version. */
1209 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001210 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1211 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001212 PyErr_NoMemory();
1213 return -1;
1214 }
1215 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1216 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001217 _PyUnicode_UTF8(unicode) = NULL;
1218 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001219 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1220 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001221 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001222 PyObject_FREE(_PyUnicode_WSTR(unicode));
1223 _PyUnicode_WSTR(unicode) = NULL;
1224 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1225#else
1226 assert(num_surrogates == 0);
1227
Victor Stinnerc3c74152011-10-02 20:39:55 +02001228 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001229 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001230 _PyUnicode_UTF8(unicode) = NULL;
1231 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001232 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1233#endif
1234 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1235 }
1236 _PyUnicode_STATE(unicode).ready = 1;
1237 return 0;
1238}
1239
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001240int
1241_PyUnicode_ReadyReplace(PyObject **op)
1242{
1243 return unicode_ready(op, 1);
1244}
1245
1246int
1247_PyUnicode_Ready(PyObject *op)
1248{
1249 return unicode_ready(&op, 0);
1250}
1251
Alexander Belopolsky40018472011-02-26 01:02:56 +00001252static void
1253unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001254{
Walter Dörwald16807132007-05-25 13:52:07 +00001255 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001256 case SSTATE_NOT_INTERNED:
1257 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001258
Benjamin Peterson29060642009-01-31 22:14:21 +00001259 case SSTATE_INTERNED_MORTAL:
1260 /* revive dead object temporarily for DelItem */
1261 Py_REFCNT(unicode) = 3;
1262 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
1263 Py_FatalError(
1264 "deletion of interned string failed");
1265 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001266
Benjamin Peterson29060642009-01-31 22:14:21 +00001267 case SSTATE_INTERNED_IMMORTAL:
1268 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001269
Benjamin Peterson29060642009-01-31 22:14:21 +00001270 default:
1271 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001272 }
1273
Victor Stinner03490912011-10-03 23:45:12 +02001274 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001275 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001276 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001277 PyObject_DEL(_PyUnicode_UTF8(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001278
1279 if (PyUnicode_IS_COMPACT(unicode)) {
1280 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001281 }
1282 else {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001283 if (_PyUnicode_DATA_ANY(unicode))
1284 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Benjamin Peterson29060642009-01-31 22:14:21 +00001285 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001286 }
1287}
1288
Alexander Belopolsky40018472011-02-26 01:02:56 +00001289static int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001290unicode_resizable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001291{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001292 if (Py_REFCNT(unicode) != 1)
1293 return 0;
1294 if (PyUnicode_CHECK_INTERNED(unicode))
1295 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001296 assert (unicode != unicode_empty);
1297#ifdef Py_DEBUG
1298 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND
1299 && PyUnicode_GET_LENGTH(unicode) == 1)
1300 {
1301 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001302 if (ch < 256 && unicode_latin1[ch] == unicode)
1303 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001304 }
Victor Stinner77bb47b2011-10-03 20:06:05 +02001305#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001306 return 1;
1307}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001308
Victor Stinnerfe226c02011-10-03 03:52:20 +02001309static int
1310unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1311{
1312 PyObject *unicode;
1313 Py_ssize_t old_length;
1314
1315 assert(p_unicode != NULL);
1316 unicode = *p_unicode;
1317
1318 assert(unicode != NULL);
1319 assert(PyUnicode_Check(unicode));
1320 assert(0 <= length);
1321
Victor Stinner910337b2011-10-03 03:20:16 +02001322 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001323 old_length = PyUnicode_WSTR_LENGTH(unicode);
1324 else
1325 old_length = PyUnicode_GET_LENGTH(unicode);
1326 if (old_length == length)
1327 return 0;
1328
Victor Stinnerfe226c02011-10-03 03:52:20 +02001329 if (!unicode_resizable(unicode)) {
1330 PyObject *copy = resize_copy(unicode, length);
1331 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001332 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001333 Py_DECREF(*p_unicode);
1334 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001335 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001336 }
1337
Victor Stinnerfe226c02011-10-03 03:52:20 +02001338 if (PyUnicode_IS_COMPACT(unicode)) {
1339 *p_unicode = resize_compact(unicode, length);
1340 if (*p_unicode == NULL)
1341 return -1;
1342 return 0;
1343 } else
1344 return resize_inplace((PyUnicodeObject*)unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001345}
1346
Alexander Belopolsky40018472011-02-26 01:02:56 +00001347int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001348PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001349{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001350 PyObject *unicode;
1351 if (p_unicode == NULL) {
1352 PyErr_BadInternalCall();
1353 return -1;
1354 }
1355 unicode = *p_unicode;
1356 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0
1357 || _PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND)
1358 {
1359 PyErr_BadInternalCall();
1360 return -1;
1361 }
1362 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001363}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001364
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001365static PyObject*
1366get_latin1_char(unsigned char ch)
1367{
Victor Stinnera464fc12011-10-02 20:39:30 +02001368 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001369 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001370 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001371 if (!unicode)
1372 return NULL;
1373 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
1374 unicode_latin1[ch] = unicode;
1375 }
1376 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001377 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001378}
1379
Alexander Belopolsky40018472011-02-26 01:02:56 +00001380PyObject *
1381PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001382{
1383 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001384 Py_UCS4 maxchar = 0;
1385 Py_ssize_t num_surrogates;
1386
1387 if (u == NULL)
1388 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001389
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001390 /* If the Unicode data is known at construction time, we can apply
1391 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001392
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001393 /* Optimization for empty strings */
1394 if (size == 0 && unicode_empty != NULL) {
1395 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001396 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001397 }
Tim Petersced69f82003-09-16 20:30:58 +00001398
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001399 /* Single character Unicode objects in the Latin-1 range are
1400 shared when using this constructor */
1401 if (size == 1 && *u < 256)
1402 return get_latin1_char((unsigned char)*u);
1403
1404 /* If not empty and not single character, copy the Unicode data
1405 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001406 if (find_maxchar_surrogates(u, u + size,
1407 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001408 return NULL;
1409
1410 unicode = (PyUnicodeObject *) PyUnicode_New(size - num_surrogates,
1411 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001412 if (!unicode)
1413 return NULL;
1414
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001415 switch (PyUnicode_KIND(unicode)) {
1416 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001417 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001418 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1419 break;
1420 case PyUnicode_2BYTE_KIND:
1421#if Py_UNICODE_SIZE == 2
1422 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1423#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001424 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001425 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1426#endif
1427 break;
1428 case PyUnicode_4BYTE_KIND:
1429#if SIZEOF_WCHAR_T == 2
1430 /* This is the only case which has to process surrogates, thus
1431 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001432 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001433#else
1434 assert(num_surrogates == 0);
1435 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1436#endif
1437 break;
1438 default:
1439 assert(0 && "Impossible state");
1440 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001441
1442 return (PyObject *)unicode;
1443}
1444
Alexander Belopolsky40018472011-02-26 01:02:56 +00001445PyObject *
1446PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001447{
1448 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +00001449
Benjamin Peterson14339b62009-01-31 16:36:08 +00001450 if (size < 0) {
1451 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001452 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001453 return NULL;
1454 }
Christian Heimes33fe8092008-04-13 13:53:33 +00001455
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001456 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +00001457 some optimizations which share commonly used objects.
1458 Also, this means the input must be UTF-8, so fall back to the
1459 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001460 if (u != NULL) {
1461
Benjamin Peterson29060642009-01-31 22:14:21 +00001462 /* Optimization for empty strings */
1463 if (size == 0 && unicode_empty != NULL) {
1464 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001465 return unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001466 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001467
1468 /* Single characters are shared when using this constructor.
1469 Restrict to ASCII, since the input must be UTF-8. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001470 if (size == 1 && Py_CHARMASK(*u) < 128)
1471 return get_latin1_char(Py_CHARMASK(*u));
Martin v. Löwis9c121062007-08-05 20:26:11 +00001472
1473 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001474 }
1475
Walter Dörwald55507312007-05-18 13:12:10 +00001476 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001477 if (!unicode)
1478 return NULL;
1479
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001480 return (PyObject *)unicode;
1481}
1482
Alexander Belopolsky40018472011-02-26 01:02:56 +00001483PyObject *
1484PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001485{
1486 size_t size = strlen(u);
1487 if (size > PY_SSIZE_T_MAX) {
1488 PyErr_SetString(PyExc_OverflowError, "input too long");
1489 return NULL;
1490 }
1491
1492 return PyUnicode_FromStringAndSize(u, size);
1493}
1494
Victor Stinnere57b1c02011-09-28 22:20:48 +02001495static PyObject*
1496_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001497{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001498 PyObject *res;
1499 unsigned char max = 127;
1500 Py_ssize_t i;
1501 for (i = 0; i < size; i++) {
1502 if (u[i] & 0x80) {
1503 max = 255;
1504 break;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001505 }
1506 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001507 res = PyUnicode_New(size, max);
1508 if (!res)
1509 return NULL;
1510 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
1511 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001512}
1513
Victor Stinnere57b1c02011-09-28 22:20:48 +02001514static PyObject*
1515_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001516{
1517 PyObject *res;
1518 Py_UCS2 max = 0;
1519 Py_ssize_t i;
1520 for (i = 0; i < size; i++)
1521 if (u[i] > max)
1522 max = u[i];
1523 res = PyUnicode_New(size, max);
1524 if (!res)
1525 return NULL;
1526 if (max >= 256)
1527 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
1528 else
1529 for (i = 0; i < size; i++)
1530 PyUnicode_1BYTE_DATA(res)[i] = (Py_UCS1)u[i];
1531 return res;
1532}
1533
Victor Stinnere57b1c02011-09-28 22:20:48 +02001534static PyObject*
1535_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001536{
1537 PyObject *res;
1538 Py_UCS4 max = 0;
1539 Py_ssize_t i;
1540 for (i = 0; i < size; i++)
1541 if (u[i] > max)
1542 max = u[i];
1543 res = PyUnicode_New(size, max);
1544 if (!res)
1545 return NULL;
1546 if (max >= 0x10000)
1547 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
1548 else {
1549 int kind = PyUnicode_KIND(res);
1550 void *data = PyUnicode_DATA(res);
1551 for (i = 0; i < size; i++)
1552 PyUnicode_WRITE(kind, data, i, u[i]);
1553 }
1554 return res;
1555}
1556
1557PyObject*
1558PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1559{
1560 switch(kind) {
1561 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001562 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001563 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001564 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001565 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001566 return _PyUnicode_FromUCS4(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001567 }
Victor Stinner01698042011-10-04 00:04:26 +02001568 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001569 return NULL;
1570}
1571
Victor Stinner034f6cf2011-09-30 02:26:44 +02001572PyObject*
1573PyUnicode_Copy(PyObject *unicode)
1574{
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001575 Py_ssize_t size;
1576 PyObject *copy;
1577 void *data;
1578
Victor Stinner034f6cf2011-09-30 02:26:44 +02001579 if (!PyUnicode_Check(unicode)) {
1580 PyErr_BadInternalCall();
1581 return NULL;
1582 }
1583 if (PyUnicode_READY(unicode))
1584 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001585
1586 size = PyUnicode_GET_LENGTH(unicode);
1587 copy = PyUnicode_New(size, PyUnicode_MAX_CHAR_VALUE(unicode));
1588 if (!copy)
1589 return NULL;
1590 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1591
1592 data = PyUnicode_DATA(unicode);
1593 switch (PyUnicode_KIND(unicode))
1594 {
1595 case PyUnicode_1BYTE_KIND:
1596 memcpy(PyUnicode_1BYTE_DATA(copy), data, size);
1597 break;
1598 case PyUnicode_2BYTE_KIND:
1599 memcpy(PyUnicode_2BYTE_DATA(copy), data, sizeof(Py_UCS2) * size);
1600 break;
1601 case PyUnicode_4BYTE_KIND:
1602 memcpy(PyUnicode_4BYTE_DATA(copy), data, sizeof(Py_UCS4) * size);
1603 break;
1604 default:
1605 assert(0);
1606 break;
1607 }
1608 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02001609}
1610
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001611
Victor Stinnerbc603d12011-10-02 01:00:40 +02001612/* Widen Unicode objects to larger buffers. Don't write terminating null
1613 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001614
1615void*
1616_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1617{
Victor Stinnerbc603d12011-10-02 01:00:40 +02001618 Py_ssize_t len;
1619 void *result;
1620 unsigned int skind;
1621
1622 if (PyUnicode_READY(s))
1623 return NULL;
1624
1625 len = PyUnicode_GET_LENGTH(s);
1626 skind = PyUnicode_KIND(s);
1627 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02001628 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001629 return NULL;
1630 }
1631 switch(kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02001632 case PyUnicode_2BYTE_KIND:
1633 result = PyMem_Malloc(len * sizeof(Py_UCS2));
1634 if (!result)
1635 return PyErr_NoMemory();
1636 assert(skind == PyUnicode_1BYTE_KIND);
1637 _PyUnicode_CONVERT_BYTES(
1638 Py_UCS1, Py_UCS2,
1639 PyUnicode_1BYTE_DATA(s),
1640 PyUnicode_1BYTE_DATA(s) + len,
1641 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001642 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001643 case PyUnicode_4BYTE_KIND:
1644 result = PyMem_Malloc(len * sizeof(Py_UCS4));
1645 if (!result)
1646 return PyErr_NoMemory();
1647 if (skind == PyUnicode_2BYTE_KIND) {
1648 _PyUnicode_CONVERT_BYTES(
1649 Py_UCS2, Py_UCS4,
1650 PyUnicode_2BYTE_DATA(s),
1651 PyUnicode_2BYTE_DATA(s) + len,
1652 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001653 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02001654 else {
1655 assert(skind == PyUnicode_1BYTE_KIND);
1656 _PyUnicode_CONVERT_BYTES(
1657 Py_UCS1, Py_UCS4,
1658 PyUnicode_1BYTE_DATA(s),
1659 PyUnicode_1BYTE_DATA(s) + len,
1660 result);
1661 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001662 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001663 default:
1664 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001665 }
Victor Stinner01698042011-10-04 00:04:26 +02001666 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001667 return NULL;
1668}
1669
1670static Py_UCS4*
1671as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1672 int copy_null)
1673{
1674 int kind;
1675 void *data;
1676 Py_ssize_t len, targetlen;
1677 if (PyUnicode_READY(string) == -1)
1678 return NULL;
1679 kind = PyUnicode_KIND(string);
1680 data = PyUnicode_DATA(string);
1681 len = PyUnicode_GET_LENGTH(string);
1682 targetlen = len;
1683 if (copy_null)
1684 targetlen++;
1685 if (!target) {
1686 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
1687 PyErr_NoMemory();
1688 return NULL;
1689 }
1690 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
1691 if (!target) {
1692 PyErr_NoMemory();
1693 return NULL;
1694 }
1695 }
1696 else {
1697 if (targetsize < targetlen) {
1698 PyErr_Format(PyExc_SystemError,
1699 "string is longer than the buffer");
1700 if (copy_null && 0 < targetsize)
1701 target[0] = 0;
1702 return NULL;
1703 }
1704 }
1705 if (kind != PyUnicode_4BYTE_KIND) {
1706 Py_ssize_t i;
1707 for (i = 0; i < len; i++)
1708 target[i] = PyUnicode_READ(kind, data, i);
1709 }
1710 else
1711 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
1712 if (copy_null)
1713 target[len] = 0;
1714 return target;
1715}
1716
1717Py_UCS4*
1718PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1719 int copy_null)
1720{
1721 if (target == NULL || targetsize < 1) {
1722 PyErr_BadInternalCall();
1723 return NULL;
1724 }
1725 return as_ucs4(string, target, targetsize, copy_null);
1726}
1727
1728Py_UCS4*
1729PyUnicode_AsUCS4Copy(PyObject *string)
1730{
1731 return as_ucs4(string, NULL, 0, 1);
1732}
1733
#ifdef HAVE_WCHAR_H

/* Create a Unicode object from the wchar_t buffer `w` of `size` elements.

   A `size` of -1 means "measure the buffer with wcslen()".  The conversion
   itself is delegated to PyUnicode_FromUnicode().

   A NULL `w` is accepted only together with a `size` of 0, in which case
   the result of PyUnicode_New(0, 0) is returned; any other size raises
   PyErr_BadInternalCall() and returns NULL. */
PyObject *
PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
{
    if (w != NULL) {
        if (size == -1)
            size = wcslen(w);
        return PyUnicode_FromUnicode(w, size);
    }

    if (size == 0)
        return PyUnicode_New(0, 0);

    PyErr_BadInternalCall();
    return NULL;
}

#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00001754
Walter Dörwald346737f2007-05-31 10:44:43 +00001755static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001756makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
1757 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00001758{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001759 *fmt++ = '%';
1760 if (width) {
1761 if (zeropad)
1762 *fmt++ = '0';
1763 fmt += sprintf(fmt, "%d", width);
1764 }
1765 if (precision)
1766 fmt += sprintf(fmt, ".%d", precision);
1767 if (longflag)
1768 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001769 else if (longlongflag) {
1770 /* longlongflag should only ever be nonzero on machines with
1771 HAVE_LONG_LONG defined */
1772#ifdef HAVE_LONG_LONG
1773 char *f = PY_FORMAT_LONG_LONG;
1774 while (*f)
1775 *fmt++ = *f++;
1776#else
1777 /* we shouldn't ever get here */
1778 assert(0);
1779 *fmt++ = 'l';
1780#endif
1781 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001782 else if (size_tflag) {
1783 char *f = PY_FORMAT_SIZE_T;
1784 while (*f)
1785 *fmt++ = *f++;
1786 }
1787 *fmt++ = c;
1788 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00001789}
1790
Victor Stinner96865452011-03-01 23:44:09 +00001791/* helper for PyUnicode_FromFormatV() */
1792
1793static const char*
1794parse_format_flags(const char *f,
1795 int *p_width, int *p_precision,
1796 int *p_longflag, int *p_longlongflag, int *p_size_tflag)
1797{
1798 int width, precision, longflag, longlongflag, size_tflag;
1799
1800 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
1801 f++;
1802 width = 0;
1803 while (Py_ISDIGIT((unsigned)*f))
1804 width = (width*10) + *f++ - '0';
1805 precision = 0;
1806 if (*f == '.') {
1807 f++;
1808 while (Py_ISDIGIT((unsigned)*f))
1809 precision = (precision*10) + *f++ - '0';
1810 if (*f == '%') {
1811 /* "%.3%s" => f points to "3" */
1812 f--;
1813 }
1814 }
1815 if (*f == '\0') {
1816 /* bogus format "%.1" => go backward, f points to "1" */
1817 f--;
1818 }
1819 if (p_width != NULL)
1820 *p_width = width;
1821 if (p_precision != NULL)
1822 *p_precision = precision;
1823
1824 /* Handle %ld, %lu, %lld and %llu. */
1825 longflag = 0;
1826 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00001827 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00001828
1829 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00001830 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00001831 longflag = 1;
1832 ++f;
1833 }
1834#ifdef HAVE_LONG_LONG
1835 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00001836 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00001837 longlongflag = 1;
1838 f += 2;
1839 }
1840#endif
1841 }
1842 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00001843 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00001844 size_tflag = 1;
1845 ++f;
1846 }
1847 if (p_longflag != NULL)
1848 *p_longflag = longflag;
1849 if (p_longlongflag != NULL)
1850 *p_longlongflag = longlongflag;
1851 if (p_size_tflag != NULL)
1852 *p_size_tflag = size_tflag;
1853 return f;
1854}
1855
/* Upper bound on the number of characters produced by a %ld conversion:
   21 is enough for a 64-bit integer printed in decimal together with an
   optional sign. */
#define MAX_LONG_CHARS 21
/* Upper bound on the number of characters produced by a %lld conversion.
   At most ceil(log10(256)*SIZEOF_LONG_LONG) digits are needed, plus one
   character for the sign; 53/22 is used as an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
1863
Walter Dörwaldd2034312007-05-18 16:29:38 +00001864PyObject *
1865PyUnicode_FromFormatV(const char *format, va_list vargs)
1866{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001867 va_list count;
1868 Py_ssize_t callcount = 0;
1869 PyObject **callresults = NULL;
1870 PyObject **callresult = NULL;
1871 Py_ssize_t n = 0;
1872 int width = 0;
1873 int precision = 0;
1874 int zeropad;
1875 const char* f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001876 PyUnicodeObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001877 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001878 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001879 Py_UCS4 maxchar = 127; /* result is ASCII by default */
1880 Py_UCS4 argmaxchar;
1881 Py_ssize_t numbersize = 0;
1882 char *numberresults = NULL;
1883 char *numberresult = NULL;
1884 Py_ssize_t i;
1885 int kind;
1886 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001887
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001888 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001889 /* step 1: count the number of %S/%R/%A/%s format specifications
1890 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
1891 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001892 * result in an array)
     * also estimate an upper bound for all the number formats in the string,
     * numbers will be formatted in step 3 and be kept in a '\0'-separated
     * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00001896 for (f = format; *f; f++) {
1897 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00001898 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001899 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
1900 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
1901 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
1902 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001903
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001904 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001905#ifdef HAVE_LONG_LONG
1906 if (longlongflag) {
1907 if (width < MAX_LONG_LONG_CHARS)
1908 width = MAX_LONG_LONG_CHARS;
1909 }
1910 else
1911#endif
1912 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
1913 including sign. Decimal takes the most space. This
1914 isn't enough for octal. If a width is specified we
1915 need more (which we allocate later). */
1916 if (width < MAX_LONG_CHARS)
1917 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001918
1919 /* account for the size + '\0' to separate numbers
1920 inside of the numberresults buffer */
1921 numbersize += (width + 1);
1922 }
1923 }
1924 else if ((unsigned char)*f > 127) {
1925 PyErr_Format(PyExc_ValueError,
1926 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
1927 "string, got a non-ASCII byte: 0x%02x",
1928 (unsigned char)*f);
1929 return NULL;
1930 }
1931 }
1932 /* step 2: allocate memory for the results of
1933 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
1934 if (callcount) {
1935 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
1936 if (!callresults) {
1937 PyErr_NoMemory();
1938 return NULL;
1939 }
1940 callresult = callresults;
1941 }
1942 /* step 2.5: allocate memory for the results of formating numbers */
1943 if (numbersize) {
1944 numberresults = PyObject_Malloc(numbersize);
1945 if (!numberresults) {
1946 PyErr_NoMemory();
1947 goto fail;
1948 }
1949 numberresult = numberresults;
1950 }
1951
1952 /* step 3: format numbers and figure out how large a buffer we need */
1953 for (f = format; *f; f++) {
1954 if (*f == '%') {
1955 const char* p;
1956 int longflag;
1957 int longlongflag;
1958 int size_tflag;
1959 int numprinted;
1960
1961 p = f;
1962 zeropad = (f[1] == '0');
1963 f = parse_format_flags(f, &width, &precision,
1964 &longflag, &longlongflag, &size_tflag);
1965 switch (*f) {
1966 case 'c':
1967 {
1968 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001969 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001970 n++;
1971 break;
1972 }
1973 case '%':
1974 n++;
1975 break;
1976 case 'i':
1977 case 'd':
1978 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1979 width, precision, *f);
1980 if (longflag)
1981 numprinted = sprintf(numberresult, fmt,
1982 va_arg(count, long));
1983#ifdef HAVE_LONG_LONG
1984 else if (longlongflag)
1985 numprinted = sprintf(numberresult, fmt,
1986 va_arg(count, PY_LONG_LONG));
1987#endif
1988 else if (size_tflag)
1989 numprinted = sprintf(numberresult, fmt,
1990 va_arg(count, Py_ssize_t));
1991 else
1992 numprinted = sprintf(numberresult, fmt,
1993 va_arg(count, int));
1994 n += numprinted;
1995 /* advance by +1 to skip over the '\0' */
1996 numberresult += (numprinted + 1);
1997 assert(*(numberresult - 1) == '\0');
1998 assert(*(numberresult - 2) != '\0');
1999 assert(numprinted >= 0);
2000 assert(numberresult <= numberresults + numbersize);
2001 break;
2002 case 'u':
2003 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2004 width, precision, 'u');
2005 if (longflag)
2006 numprinted = sprintf(numberresult, fmt,
2007 va_arg(count, unsigned long));
2008#ifdef HAVE_LONG_LONG
2009 else if (longlongflag)
2010 numprinted = sprintf(numberresult, fmt,
2011 va_arg(count, unsigned PY_LONG_LONG));
2012#endif
2013 else if (size_tflag)
2014 numprinted = sprintf(numberresult, fmt,
2015 va_arg(count, size_t));
2016 else
2017 numprinted = sprintf(numberresult, fmt,
2018 va_arg(count, unsigned int));
2019 n += numprinted;
2020 numberresult += (numprinted + 1);
2021 assert(*(numberresult - 1) == '\0');
2022 assert(*(numberresult - 2) != '\0');
2023 assert(numprinted >= 0);
2024 assert(numberresult <= numberresults + numbersize);
2025 break;
2026 case 'x':
2027 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2028 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2029 n += numprinted;
2030 numberresult += (numprinted + 1);
2031 assert(*(numberresult - 1) == '\0');
2032 assert(*(numberresult - 2) != '\0');
2033 assert(numprinted >= 0);
2034 assert(numberresult <= numberresults + numbersize);
2035 break;
2036 case 'p':
2037 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2038 /* %p is ill-defined: ensure leading 0x. */
2039 if (numberresult[1] == 'X')
2040 numberresult[1] = 'x';
2041 else if (numberresult[1] != 'x') {
2042 memmove(numberresult + 2, numberresult,
2043 strlen(numberresult) + 1);
2044 numberresult[0] = '0';
2045 numberresult[1] = 'x';
2046 numprinted += 2;
2047 }
2048 n += numprinted;
2049 numberresult += (numprinted + 1);
2050 assert(*(numberresult - 1) == '\0');
2051 assert(*(numberresult - 2) != '\0');
2052 assert(numprinted >= 0);
2053 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002054 break;
2055 case 's':
2056 {
2057 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002058 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002059 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
2060 if (!str)
2061 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002062 /* since PyUnicode_DecodeUTF8 returns already flexible
2063 unicode objects, there is no need to call ready on them */
2064 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002065 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002066 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002067 /* Remember the str and switch to the next slot */
2068 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002069 break;
2070 }
2071 case 'U':
2072 {
2073 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002074 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002075 if (PyUnicode_READY(obj) == -1)
2076 goto fail;
2077 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002078 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002079 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002080 break;
2081 }
2082 case 'V':
2083 {
2084 PyObject *obj = va_arg(count, PyObject *);
2085 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002086 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002087 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002088 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002089 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002090 if (PyUnicode_READY(obj) == -1)
2091 goto fail;
2092 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002093 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002094 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002095 *callresult++ = NULL;
2096 }
2097 else {
2098 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
2099 if (!str_obj)
2100 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002101 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002102 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002103 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002104 *callresult++ = str_obj;
2105 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002106 break;
2107 }
2108 case 'S':
2109 {
2110 PyObject *obj = va_arg(count, PyObject *);
2111 PyObject *str;
2112 assert(obj);
2113 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002114 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002115 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002116 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002117 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002118 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002119 /* Remember the str and switch to the next slot */
2120 *callresult++ = str;
2121 break;
2122 }
2123 case 'R':
2124 {
2125 PyObject *obj = va_arg(count, PyObject *);
2126 PyObject *repr;
2127 assert(obj);
2128 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002129 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002130 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002131 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002132 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002133 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002134 /* Remember the repr and switch to the next slot */
2135 *callresult++ = repr;
2136 break;
2137 }
2138 case 'A':
2139 {
2140 PyObject *obj = va_arg(count, PyObject *);
2141 PyObject *ascii;
2142 assert(obj);
2143 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002144 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002145 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002146 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002147 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002148 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002149 /* Remember the repr and switch to the next slot */
2150 *callresult++ = ascii;
2151 break;
2152 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002153 default:
2154 /* if we stumble upon an unknown
2155 formatting code, copy the rest of
2156 the format string to the output
2157 string. (we cannot just skip the
2158 code, since there's no way to know
2159 what's in the argument list) */
2160 n += strlen(p);
2161 goto expand;
2162 }
2163 } else
2164 n++;
2165 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002166 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002167 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002168 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002169 we don't have to resize the string.
2170 There can be no errors beyond this point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002171 string = (PyUnicodeObject *)PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002172 if (!string)
2173 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002174 kind = PyUnicode_KIND(string);
2175 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002176 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002177 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002178
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002179 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002180 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002181 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002182
2183 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002184 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2185 /* checking for == because the last argument could be a empty
2186 string, which causes i to point to end, the assert at the end of
2187 the loop */
2188 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002189
Benjamin Peterson14339b62009-01-31 16:36:08 +00002190 switch (*f) {
2191 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002192 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002193 const int ordinal = va_arg(vargs, int);
2194 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002195 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002196 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002197 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002198 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002199 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002200 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002201 case 'p':
2202 /* unused, since we already have the result */
2203 if (*f == 'p')
2204 (void) va_arg(vargs, void *);
2205 else
2206 (void) va_arg(vargs, int);
2207 /* extract the result from numberresults and append. */
2208 for (; *numberresult; ++i, ++numberresult)
2209 PyUnicode_WRITE(kind, data, i, *numberresult);
2210 /* skip over the separating '\0' */
2211 assert(*numberresult == '\0');
2212 numberresult++;
2213 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002214 break;
2215 case 's':
2216 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002217 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002218 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002219 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002220 size = PyUnicode_GET_LENGTH(*callresult);
2221 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002222 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2223 *callresult, 0,
2224 size) < 0)
2225 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002226 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002227 /* We're done with the unicode()/repr() => forget it */
2228 Py_DECREF(*callresult);
2229 /* switch to next unicode()/repr() result */
2230 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002231 break;
2232 }
2233 case 'U':
2234 {
2235 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002236 Py_ssize_t size;
2237 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2238 size = PyUnicode_GET_LENGTH(obj);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002239 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2240 obj, 0,
2241 size) < 0)
2242 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002243 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002244 break;
2245 }
2246 case 'V':
2247 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002248 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002249 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002250 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002251 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002252 size = PyUnicode_GET_LENGTH(obj);
2253 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002254 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2255 obj, 0,
2256 size) < 0)
2257 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002258 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002259 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002260 size = PyUnicode_GET_LENGTH(*callresult);
2261 assert(PyUnicode_KIND(*callresult) <=
2262 PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002263 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2264 *callresult,
2265 0, size) < 0)
2266 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002267 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002268 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002269 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002270 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002271 break;
2272 }
2273 case 'S':
2274 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002275 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002276 {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002277 /* unused, since we already have the result */
2278 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002279 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002280 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2281 *callresult, 0,
2282 PyUnicode_GET_LENGTH(*callresult)) < 0)
2283 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002284 i += PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002285 /* We're done with the unicode()/repr() => forget it */
2286 Py_DECREF(*callresult);
2287 /* switch to next unicode()/repr() result */
2288 ++callresult;
2289 break;
2290 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002291 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002292 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002293 break;
2294 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002295 for (; *p; ++p, ++i)
2296 PyUnicode_WRITE(kind, data, i, *p);
2297 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002298 goto end;
2299 }
Victor Stinner1205f272010-09-11 00:54:47 +00002300 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002301 else {
2302 assert(i < PyUnicode_GET_LENGTH(string));
2303 PyUnicode_WRITE(kind, data, i++, *f);
2304 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002305 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002306 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002307
Benjamin Peterson29060642009-01-31 22:14:21 +00002308 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002309 if (callresults)
2310 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002311 if (numberresults)
2312 PyObject_Free(numberresults);
2313 return (PyObject *)string;
Benjamin Peterson29060642009-01-31 22:14:21 +00002314 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002315 if (callresults) {
2316 PyObject **callresult2 = callresults;
2317 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002318 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002319 ++callresult2;
2320 }
2321 PyObject_Free(callresults);
2322 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002323 if (numberresults)
2324 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002325 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002326}
2327
Walter Dörwaldd2034312007-05-18 16:29:38 +00002328PyObject *
2329PyUnicode_FromFormat(const char *format, ...)
2330{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002331 PyObject* ret;
2332 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002333
2334#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002335 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002336#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002337 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002338#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002339 ret = PyUnicode_FromFormatV(format, vargs);
2340 va_end(vargs);
2341 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002342}
2343
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002344#ifdef HAVE_WCHAR_H
2345
Victor Stinner5593d8a2010-10-02 11:11:27 +00002346/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2347 convert a Unicode object to a wide character string.
2348
Victor Stinnerd88d9832011-09-06 02:00:05 +02002349 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002350 character) required to convert the unicode object. Ignore size argument.
2351
Victor Stinnerd88d9832011-09-06 02:00:05 +02002352 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002353 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002354 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002355static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00002356unicode_aswidechar(PyUnicodeObject *unicode,
2357 wchar_t *w,
2358 Py_ssize_t size)
2359{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002360 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002361 const wchar_t *wstr;
2362
2363 wstr = PyUnicode_AsUnicodeAndSize((PyObject *)unicode, &res);
2364 if (wstr == NULL)
2365 return -1;
2366
Victor Stinner5593d8a2010-10-02 11:11:27 +00002367 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002368 if (size > res)
2369 size = res + 1;
2370 else
2371 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002372 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002373 return res;
2374 }
2375 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002376 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002377}
2378
2379Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002380PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002381 wchar_t *w,
2382 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002383{
2384 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002385 PyErr_BadInternalCall();
2386 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002387 }
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002388 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002389}
2390
Victor Stinner137c34c2010-09-29 10:25:54 +00002391wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002392PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002393 Py_ssize_t *size)
2394{
2395 wchar_t* buffer;
2396 Py_ssize_t buflen;
2397
2398 if (unicode == NULL) {
2399 PyErr_BadInternalCall();
2400 return NULL;
2401 }
2402
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002403 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002404 if (buflen == -1)
2405 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002406 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002407 PyErr_NoMemory();
2408 return NULL;
2409 }
2410
Victor Stinner137c34c2010-09-29 10:25:54 +00002411 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2412 if (buffer == NULL) {
2413 PyErr_NoMemory();
2414 return NULL;
2415 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002416 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002417 if (buflen == -1)
2418 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002419 if (size != NULL)
2420 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002421 return buffer;
2422}
2423
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002424#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002425
Alexander Belopolsky40018472011-02-26 01:02:56 +00002426PyObject *
2427PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002428{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002429 PyObject *v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002430 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002431 PyErr_SetString(PyExc_ValueError,
2432 "chr() arg not in range(0x110000)");
2433 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002434 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002435
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002436 if (ordinal < 256)
2437 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002438
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002439 v = PyUnicode_New(1, ordinal);
2440 if (v == NULL)
2441 return NULL;
2442 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
2443 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002444}
2445
Alexander Belopolsky40018472011-02-26 01:02:56 +00002446PyObject *
2447PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002448{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002449 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002450 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002451 if (PyUnicode_CheckExact(obj)) {
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002452 if (PyUnicode_READY(obj))
2453 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002454 Py_INCREF(obj);
2455 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002456 }
2457 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002458 /* For a Unicode subtype that's not a Unicode object,
2459 return a true Unicode object with the same data. */
Victor Stinner2219e0a2011-10-01 01:16:59 +02002460 return PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002461 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002462 PyErr_Format(PyExc_TypeError,
2463 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002464 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002465 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002466}
2467
Alexander Belopolsky40018472011-02-26 01:02:56 +00002468PyObject *
2469PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002470 const char *encoding,
2471 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002472{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002473 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002474 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002475
Guido van Rossumd57fd912000-03-10 22:53:23 +00002476 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002477 PyErr_BadInternalCall();
2478 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002479 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002480
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002481 /* Decoding bytes objects is the most common case and should be fast */
2482 if (PyBytes_Check(obj)) {
2483 if (PyBytes_GET_SIZE(obj) == 0) {
2484 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002485 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002486 }
2487 else {
2488 v = PyUnicode_Decode(
2489 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2490 encoding, errors);
2491 }
2492 return v;
2493 }
2494
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002495 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002496 PyErr_SetString(PyExc_TypeError,
2497 "decoding str is not supported");
2498 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002499 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002500
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002501 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2502 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2503 PyErr_Format(PyExc_TypeError,
2504 "coercing to str: need bytes, bytearray "
2505 "or buffer-like object, %.80s found",
2506 Py_TYPE(obj)->tp_name);
2507 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002508 }
Tim Petersced69f82003-09-16 20:30:58 +00002509
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002510 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002511 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002512 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002513 }
Tim Petersced69f82003-09-16 20:30:58 +00002514 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002515 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002516
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002517 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002518 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002519}
2520
/* Copy `encoding` into `lower`, folding upper case letters to lower case
   and turning '_' into '-', so that spellings such as UTF_8 are
   recognized.

   lower_len is the capacity of `lower` in bytes.  Returns 1 on success
   (`lower` is then null-terminated) and 0 when the encoding name needs
   more than lower_len-1 characters; in that case `lower` holds a partial,
   unterminated copy. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const size_t capacity = lower_len - 1;  /* last slot is for '\0' */
    size_t used = 0;
    const char *src;

    for (src = encoding; *src != '\0'; src++) {
        char ch;

        if (used == capacity)
            return 0;

        ch = *src;
        if (Py_ISUPPER(ch))
            ch = Py_TOLOWER(ch);
        else if (ch == '_')
            ch = '-';
        lower[used++] = ch;
    }
    lower[used] = '\0';
    return 1;
}
2553
Alexander Belopolsky40018472011-02-26 01:02:56 +00002554PyObject *
2555PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002556 Py_ssize_t size,
2557 const char *encoding,
2558 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002559{
2560 PyObject *buffer = NULL, *unicode;
2561 Py_buffer info;
2562 char lower[11]; /* Enough for any encoding shortcut */
2563
2564 if (encoding == NULL)
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002565 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00002566
2567 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002568 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002569 if ((strcmp(lower, "utf-8") == 0) ||
2570 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002571 return PyUnicode_DecodeUTF8(s, size, errors);
2572 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002573 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002574 (strcmp(lower, "iso-8859-1") == 0))
2575 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002576#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002577 else if (strcmp(lower, "mbcs") == 0)
2578 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002579#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002580 else if (strcmp(lower, "ascii") == 0)
2581 return PyUnicode_DecodeASCII(s, size, errors);
2582 else if (strcmp(lower, "utf-16") == 0)
2583 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2584 else if (strcmp(lower, "utf-32") == 0)
2585 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2586 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002587
2588 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002589 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002590 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002591 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002592 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002593 if (buffer == NULL)
2594 goto onError;
2595 unicode = PyCodec_Decode(buffer, encoding, errors);
2596 if (unicode == NULL)
2597 goto onError;
2598 if (!PyUnicode_Check(unicode)) {
2599 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002600 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002601 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002602 Py_DECREF(unicode);
2603 goto onError;
2604 }
2605 Py_DECREF(buffer);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02002606 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002607 Py_DECREF(unicode);
2608 return NULL;
2609 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002610 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002611
Benjamin Peterson29060642009-01-31 22:14:21 +00002612 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002613 Py_XDECREF(buffer);
2614 return NULL;
2615}
2616
Alexander Belopolsky40018472011-02-26 01:02:56 +00002617PyObject *
2618PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002619 const char *encoding,
2620 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002621{
2622 PyObject *v;
2623
2624 if (!PyUnicode_Check(unicode)) {
2625 PyErr_BadArgument();
2626 goto onError;
2627 }
2628
2629 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002630 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002631
2632 /* Decode via the codec registry */
2633 v = PyCodec_Decode(unicode, encoding, errors);
2634 if (v == NULL)
2635 goto onError;
2636 return v;
2637
Benjamin Peterson29060642009-01-31 22:14:21 +00002638 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002639 return NULL;
2640}
2641
Alexander Belopolsky40018472011-02-26 01:02:56 +00002642PyObject *
2643PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002644 const char *encoding,
2645 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002646{
2647 PyObject *v;
2648
2649 if (!PyUnicode_Check(unicode)) {
2650 PyErr_BadArgument();
2651 goto onError;
2652 }
2653
2654 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002655 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002656
2657 /* Decode via the codec registry */
2658 v = PyCodec_Decode(unicode, encoding, errors);
2659 if (v == NULL)
2660 goto onError;
2661 if (!PyUnicode_Check(v)) {
2662 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002663 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002664 Py_TYPE(v)->tp_name);
2665 Py_DECREF(v);
2666 goto onError;
2667 }
2668 return v;
2669
Benjamin Peterson29060642009-01-31 22:14:21 +00002670 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002671 return NULL;
2672}
2673
Alexander Belopolsky40018472011-02-26 01:02:56 +00002674PyObject *
2675PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002676 Py_ssize_t size,
2677 const char *encoding,
2678 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002679{
2680 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002681
Guido van Rossumd57fd912000-03-10 22:53:23 +00002682 unicode = PyUnicode_FromUnicode(s, size);
2683 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002684 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002685 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
2686 Py_DECREF(unicode);
2687 return v;
2688}
2689
Alexander Belopolsky40018472011-02-26 01:02:56 +00002690PyObject *
2691PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002692 const char *encoding,
2693 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002694{
2695 PyObject *v;
2696
2697 if (!PyUnicode_Check(unicode)) {
2698 PyErr_BadArgument();
2699 goto onError;
2700 }
2701
2702 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002703 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002704
2705 /* Encode via the codec registry */
2706 v = PyCodec_Encode(unicode, encoding, errors);
2707 if (v == NULL)
2708 goto onError;
2709 return v;
2710
Benjamin Peterson29060642009-01-31 22:14:21 +00002711 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002712 return NULL;
2713}
2714
Victor Stinnerad158722010-10-27 00:25:46 +00002715PyObject *
2716PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00002717{
Victor Stinner99b95382011-07-04 14:23:54 +02002718#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002719 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2720 PyUnicode_GET_SIZE(unicode),
2721 NULL);
2722#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002723 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00002724#else
Victor Stinner793b5312011-04-27 00:24:21 +02002725 PyInterpreterState *interp = PyThreadState_GET()->interp;
2726 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2727 cannot use it to encode and decode filenames before it is loaded. Load
2728 the Python codec requires to encode at least its own filename. Use the C
2729 version of the locale codec until the codec registry is initialized and
2730 the Python codec is loaded.
2731
2732 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2733 cannot only rely on it: check also interp->fscodec_initialized for
2734 subinterpreters. */
2735 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00002736 return PyUnicode_AsEncodedString(unicode,
2737 Py_FileSystemDefaultEncoding,
2738 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00002739 }
2740 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002741 /* locale encoding with surrogateescape */
2742 wchar_t *wchar;
2743 char *bytes;
2744 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00002745 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002746
2747 wchar = PyUnicode_AsWideCharString(unicode, NULL);
2748 if (wchar == NULL)
2749 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002750 bytes = _Py_wchar2char(wchar, &error_pos);
2751 if (bytes == NULL) {
2752 if (error_pos != (size_t)-1) {
2753 char *errmsg = strerror(errno);
2754 PyObject *exc = NULL;
2755 if (errmsg == NULL)
2756 errmsg = "Py_wchar2char() failed";
2757 raise_encode_exception(&exc,
2758 "filesystemencoding",
2759 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
2760 error_pos, error_pos+1,
2761 errmsg);
2762 Py_XDECREF(exc);
2763 }
2764 else
2765 PyErr_NoMemory();
2766 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002767 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002768 }
2769 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002770
2771 bytes_obj = PyBytes_FromString(bytes);
2772 PyMem_Free(bytes);
2773 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00002774 }
Victor Stinnerad158722010-10-27 00:25:46 +00002775#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00002776}
2777
Alexander Belopolsky40018472011-02-26 01:02:56 +00002778PyObject *
2779PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002780 const char *encoding,
2781 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002782{
2783 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00002784 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00002785
Guido van Rossumd57fd912000-03-10 22:53:23 +00002786 if (!PyUnicode_Check(unicode)) {
2787 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002788 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002789 }
Fred Drakee4315f52000-05-09 19:53:39 +00002790
Victor Stinner2f283c22011-03-02 01:21:46 +00002791 if (encoding == NULL) {
2792 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002793 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002794 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002795 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner2f283c22011-03-02 01:21:46 +00002796 }
Fred Drakee4315f52000-05-09 19:53:39 +00002797
2798 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002799 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002800 if ((strcmp(lower, "utf-8") == 0) ||
2801 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00002802 {
Victor Stinner2f283c22011-03-02 01:21:46 +00002803 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002804 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002805 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002806 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00002807 }
Victor Stinner37296e82010-06-10 13:36:23 +00002808 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002809 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002810 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002811 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002812#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002813 else if (strcmp(lower, "mbcs") == 0)
2814 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2815 PyUnicode_GET_SIZE(unicode),
2816 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002817#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002818 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002819 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00002820 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002821
2822 /* Encode via the codec registry */
2823 v = PyCodec_Encode(unicode, encoding, errors);
2824 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002825 return NULL;
2826
2827 /* The normal path */
2828 if (PyBytes_Check(v))
2829 return v;
2830
2831 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002832 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002833 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002834 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002835
2836 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
2837 "encoder %s returned bytearray instead of bytes",
2838 encoding);
2839 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002840 Py_DECREF(v);
2841 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002842 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002843
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002844 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
2845 Py_DECREF(v);
2846 return b;
2847 }
2848
2849 PyErr_Format(PyExc_TypeError,
2850 "encoder did not return a bytes object (type=%.400s)",
2851 Py_TYPE(v)->tp_name);
2852 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002853 return NULL;
2854}
2855
Alexander Belopolsky40018472011-02-26 01:02:56 +00002856PyObject *
2857PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002858 const char *encoding,
2859 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002860{
2861 PyObject *v;
2862
2863 if (!PyUnicode_Check(unicode)) {
2864 PyErr_BadArgument();
2865 goto onError;
2866 }
2867
2868 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002869 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002870
2871 /* Encode via the codec registry */
2872 v = PyCodec_Encode(unicode, encoding, errors);
2873 if (v == NULL)
2874 goto onError;
2875 if (!PyUnicode_Check(v)) {
2876 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002877 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002878 Py_TYPE(v)->tp_name);
2879 Py_DECREF(v);
2880 goto onError;
2881 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002882 return v;
Tim Petersced69f82003-09-16 20:30:58 +00002883
Benjamin Peterson29060642009-01-31 22:14:21 +00002884 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002885 return NULL;
2886}
2887
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002888PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00002889PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002890 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00002891 return PyUnicode_DecodeFSDefaultAndSize(s, size);
2892}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002893
Christian Heimes5894ba72007-11-04 11:43:14 +00002894PyObject*
2895PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
2896{
Victor Stinner99b95382011-07-04 14:23:54 +02002897#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002898 return PyUnicode_DecodeMBCS(s, size, NULL);
2899#elif defined(__APPLE__)
2900 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
2901#else
Victor Stinner793b5312011-04-27 00:24:21 +02002902 PyInterpreterState *interp = PyThreadState_GET()->interp;
2903 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2904 cannot use it to encode and decode filenames before it is loaded. Load
2905 the Python codec requires to encode at least its own filename. Use the C
2906 version of the locale codec until the codec registry is initialized and
2907 the Python codec is loaded.
2908
2909 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2910 cannot only rely on it: check also interp->fscodec_initialized for
2911 subinterpreters. */
2912 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002913 return PyUnicode_Decode(s, size,
2914 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00002915 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002916 }
2917 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002918 /* locale encoding with surrogateescape */
2919 wchar_t *wchar;
2920 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00002921 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002922
2923 if (s[size] != '\0' || size != strlen(s)) {
2924 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
2925 return NULL;
2926 }
2927
Victor Stinner168e1172010-10-16 23:16:16 +00002928 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002929 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00002930 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002931
Victor Stinner168e1172010-10-16 23:16:16 +00002932 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002933 PyMem_Free(wchar);
2934 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002935 }
Victor Stinnerad158722010-10-27 00:25:46 +00002936#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002937}
2938
Martin v. Löwis011e8422009-05-05 04:43:17 +00002939
2940int
2941PyUnicode_FSConverter(PyObject* arg, void* addr)
2942{
2943 PyObject *output = NULL;
2944 Py_ssize_t size;
2945 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00002946 if (arg == NULL) {
2947 Py_DECREF(*(PyObject**)addr);
2948 return 1;
2949 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00002950 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00002951 output = arg;
2952 Py_INCREF(output);
2953 }
2954 else {
2955 arg = PyUnicode_FromObject(arg);
2956 if (!arg)
2957 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00002958 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00002959 Py_DECREF(arg);
2960 if (!output)
2961 return 0;
2962 if (!PyBytes_Check(output)) {
2963 Py_DECREF(output);
2964 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
2965 return 0;
2966 }
2967 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00002968 size = PyBytes_GET_SIZE(output);
2969 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00002970 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05002971 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00002972 Py_DECREF(output);
2973 return 0;
2974 }
2975 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00002976 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00002977}
2978
2979
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002980int
2981PyUnicode_FSDecoder(PyObject* arg, void* addr)
2982{
2983 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002984 if (arg == NULL) {
2985 Py_DECREF(*(PyObject**)addr);
2986 return 1;
2987 }
2988 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002989 if (PyUnicode_READY(arg))
2990 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002991 output = arg;
2992 Py_INCREF(output);
2993 }
2994 else {
2995 arg = PyBytes_FromObject(arg);
2996 if (!arg)
2997 return 0;
2998 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
2999 PyBytes_GET_SIZE(arg));
3000 Py_DECREF(arg);
3001 if (!output)
3002 return 0;
3003 if (!PyUnicode_Check(output)) {
3004 Py_DECREF(output);
3005 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3006 return 0;
3007 }
3008 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003009 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
3010 PyUnicode_GET_LENGTH(output), 0, 1)) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003011 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3012 Py_DECREF(output);
3013 return 0;
3014 }
3015 *(PyObject**)addr = output;
3016 return Py_CLEANUP_SUPPORTED;
3017}
3018
3019
Martin v. Löwis5b222132007-06-10 09:51:05 +00003020char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003021PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003022{
Christian Heimesf3863112007-11-22 07:46:41 +00003023 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003024 PyUnicodeObject *u = (PyUnicodeObject *)unicode;
3025
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003026 if (!PyUnicode_Check(unicode)) {
3027 PyErr_BadArgument();
3028 return NULL;
3029 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003030 if (PyUnicode_READY(u) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003031 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003032
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003033 if (PyUnicode_UTF8(unicode) == NULL) {
3034 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003035 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3036 if (bytes == NULL)
3037 return NULL;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003038 _PyUnicode_UTF8(u) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3039 if (_PyUnicode_UTF8(u) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003040 Py_DECREF(bytes);
3041 return NULL;
3042 }
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003043 _PyUnicode_UTF8_LENGTH(u) = PyBytes_GET_SIZE(bytes);
3044 Py_MEMCPY(_PyUnicode_UTF8(u), PyBytes_AS_STRING(bytes), _PyUnicode_UTF8_LENGTH(u) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003045 Py_DECREF(bytes);
3046 }
3047
3048 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003049 *psize = PyUnicode_UTF8_LENGTH(unicode);
3050 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003051}
3052
3053char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003054PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003055{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003056 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3057}
3058
#ifdef Py_DEBUG
/* Debug builds only: number of times PyUnicode_AsUnicodeAndSize() had to
   build the wchar_t representation of a string. */
int unicode_as_unicode_calls = 0;
#endif


/* Return the wchar_t (Py_UNICODE) representation of `unicode`, building
   and caching it in the object when it does not exist yet.

   unicode: must be a str object, otherwise PyErr_BadArgument() is raised.
   size:    if not NULL, receives PyUnicode_WSTR_LENGTH() of the object.

   Returns the cached wstr pointer, or NULL with an exception set when the
   argument is not a str or the buffer cannot be allocated.  The two
   "impossible state" branches end in Py_FatalError(). */
Py_UNICODE *
PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
{
    PyUnicodeObject *u;
    const unsigned char *one_byte;
    /* Only the source pointer types that actually need a conversion for
       this wchar_t width are declared. */
#if SIZEOF_WCHAR_T == 4
    const Py_UCS2 *two_bytes;
#else
    const Py_UCS4 *four_bytes;
    const Py_UCS4 *ucs4_end;
    Py_ssize_t num_surrogates;
#endif
    wchar_t *w;
    wchar_t *wchar_end;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }
    u = (PyUnicodeObject*)unicode;
    if (_PyUnicode_WSTR(u) == NULL) {
        /* No wstr cached yet: build it from the object's character data.
           The object is expected to be ready at this point. */
        assert(_PyUnicode_KIND(u) != 0);
        assert(PyUnicode_IS_READY(u));

#ifdef Py_DEBUG
        ++unicode_as_unicode_calls;
#endif

        if (PyUnicode_KIND(u) == PyUnicode_4BYTE_KIND) {
#if SIZEOF_WCHAR_T == 2
            /* UCS4 data, 16-bit wchar_t: code points above 0xFFFF need a
               surrogate pair, so count them first to size the buffer. */
            four_bytes = PyUnicode_4BYTE_DATA(u);
            ucs4_end = four_bytes + _PyUnicode_LENGTH(u);
            num_surrogates = 0;

            for (; four_bytes < ucs4_end; ++four_bytes) {
                if (*four_bytes > 0xFFFF)
                    ++num_surrogates;
            }

            /* One extra slot per surrogate pair plus one for the
               terminating zero. */
            _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(
                    sizeof(wchar_t) * (_PyUnicode_LENGTH(u) + 1 + num_surrogates));
            if (!_PyUnicode_WSTR(u)) {
                PyErr_NoMemory();
                return NULL;
            }
            _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u) + num_surrogates;

            w = _PyUnicode_WSTR(u);
            wchar_end = w + _PyUnicode_WSTR_LENGTH(u);
            four_bytes = PyUnicode_4BYTE_DATA(u);
            for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
                if (*four_bytes > 0xFFFF) {
                    /* encode surrogate pair in this case */
                    *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10);
                    *w   = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF);
                }
                else
                    *w = *four_bytes;

                /* Sanity check (debug builds): the write position must
                   not pass the end computed from the surrogate count. */
                if (w > wchar_end) {
                    assert(0 && "Miscalculated string end");
                }
            }
            *w = 0;
#else
            /* sizeof(wchar_t) == 4 */
            Py_FatalError("Impossible unicode object state, wstr and str "
                          "should share memory already.");
            return NULL;
#endif
        }
        else {
            /* 1-byte or 2-byte data: exactly one wchar_t per character,
               plus one for the terminating zero. */
            _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
                                                  (_PyUnicode_LENGTH(u) + 1));
            if (!_PyUnicode_WSTR(u)) {
                PyErr_NoMemory();
                return NULL;
            }
            /* The wstr length is only stored for objects that are not
               compact ASCII. */
            if (!PyUnicode_IS_COMPACT_ASCII(u))
                _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u);
            w = _PyUnicode_WSTR(u);
            wchar_end = w + _PyUnicode_LENGTH(u);

            if (PyUnicode_KIND(u) == PyUnicode_1BYTE_KIND) {
                /* Widen each byte to a wchar_t. */
                one_byte = PyUnicode_1BYTE_DATA(u);
                for (; w < wchar_end; ++one_byte, ++w)
                    *w = *one_byte;
                /* null-terminate the wstr */
                *w = 0;
            }
            else if (PyUnicode_KIND(u) == PyUnicode_2BYTE_KIND) {
#if SIZEOF_WCHAR_T == 4
                /* Widen each UCS2 unit to a 32-bit wchar_t. */
                two_bytes = PyUnicode_2BYTE_DATA(u);
                for (; w < wchar_end; ++two_bytes, ++w)
                    *w = *two_bytes;
                /* null-terminate the wstr */
                *w = 0;
#else
                /* sizeof(wchar_t) == 2 */
                /* Release the buffer allocated above before aborting. */
                PyObject_FREE(_PyUnicode_WSTR(u));
                _PyUnicode_WSTR(u) = NULL;
                Py_FatalError("Impossible unicode object state, wstr "
                              "and str should share memory already.");
                return NULL;
#endif
            }
            else {
                assert(0 && "This should never happen.");
            }
        }
    }
    if (size != NULL)
        *size = PyUnicode_WSTR_LENGTH(u);
    return _PyUnicode_WSTR(u);
}
3180
Alexander Belopolsky40018472011-02-26 01:02:56 +00003181Py_UNICODE *
3182PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003183{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003184 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003185}
3186
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003187
Alexander Belopolsky40018472011-02-26 01:02:56 +00003188Py_ssize_t
3189PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003190{
3191 if (!PyUnicode_Check(unicode)) {
3192 PyErr_BadArgument();
3193 goto onError;
3194 }
3195 return PyUnicode_GET_SIZE(unicode);
3196
Benjamin Peterson29060642009-01-31 22:14:21 +00003197 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003198 return -1;
3199}
3200
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003201Py_ssize_t
3202PyUnicode_GetLength(PyObject *unicode)
3203{
Victor Stinner5a706cf2011-10-02 00:36:53 +02003204 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003205 PyErr_BadArgument();
3206 return -1;
3207 }
3208
3209 return PyUnicode_GET_LENGTH(unicode);
3210}
3211
3212Py_UCS4
3213PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3214{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003215 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3216 PyErr_BadArgument();
3217 return (Py_UCS4)-1;
3218 }
3219 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3220 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003221 return (Py_UCS4)-1;
3222 }
3223 return PyUnicode_READ_CHAR(unicode, index);
3224}
3225
3226int
3227PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3228{
3229 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003230 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003231 return -1;
3232 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02003233 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3234 PyErr_SetString(PyExc_IndexError, "string index out of range");
3235 return -1;
3236 }
3237 if (_PyUnicode_Dirty(unicode))
3238 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003239 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3240 index, ch);
3241 return 0;
3242}
3243
/* Return the name of the default encoding, which is always "utf-8".
   The returned string has static storage and must not be modified. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3249
Victor Stinner554f3f02010-06-16 23:33:54 +00003250/* create or adjust a UnicodeDecodeError */
3251static void
3252make_decode_exception(PyObject **exceptionObject,
3253 const char *encoding,
3254 const char *input, Py_ssize_t length,
3255 Py_ssize_t startpos, Py_ssize_t endpos,
3256 const char *reason)
3257{
3258 if (*exceptionObject == NULL) {
3259 *exceptionObject = PyUnicodeDecodeError_Create(
3260 encoding, input, length, startpos, endpos, reason);
3261 }
3262 else {
3263 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3264 goto onError;
3265 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3266 goto onError;
3267 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3268 goto onError;
3269 }
3270 return;
3271
3272onError:
3273 Py_DECREF(*exceptionObject);
3274 *exceptionObject = NULL;
3275}
3276
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003277/* error handling callback helper:
3278 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00003279 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003280 and adjust various state variables.
3281 return 0 on success, -1 on error
3282*/
3283
Alexander Belopolsky40018472011-02-26 01:02:56 +00003284static int
3285unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003286 const char *encoding, const char *reason,
3287 const char **input, const char **inend, Py_ssize_t *startinpos,
3288 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
3289 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003290{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003291 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003292
3293 PyObject *restuple = NULL;
3294 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003295 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003296 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003297 Py_ssize_t requiredsize;
3298 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003299 const Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003300 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003301 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003302 int res = -1;
3303
3304 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003305 *errorHandler = PyCodec_LookupError(errors);
3306 if (*errorHandler == NULL)
3307 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003308 }
3309
Victor Stinner554f3f02010-06-16 23:33:54 +00003310 make_decode_exception(exceptionObject,
3311 encoding,
3312 *input, *inend - *input,
3313 *startinpos, *endinpos,
3314 reason);
3315 if (*exceptionObject == NULL)
3316 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003317
3318 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3319 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003320 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003321 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00003322 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00003323 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003324 }
3325 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003326 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003327
3328 /* Copy back the bytes variables, which might have been modified by the
3329 callback */
3330 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3331 if (!inputobj)
3332 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00003333 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003334 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00003335 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003336 *input = PyBytes_AS_STRING(inputobj);
3337 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003338 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00003339 /* we can DECREF safely, as the exception has another reference,
3340 so the object won't go away. */
3341 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003342
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003343 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003344 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003345 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003346 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3347 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003348 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003349
3350 /* need more space? (at least enough for what we
3351 have+the replacement+the rest of the string (starting
3352 at the new input position), so we won't have to check space
3353 when there are no errors in the rest of the string) */
3354 repptr = PyUnicode_AS_UNICODE(repunicode);
3355 repsize = PyUnicode_GET_SIZE(repunicode);
3356 requiredsize = *outpos + repsize + insize-newpos;
3357 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003358 if (requiredsize<2*outsize)
3359 requiredsize = 2*outsize;
Victor Stinnerfe226c02011-10-03 03:52:20 +02003360 if (PyUnicode_Resize((PyObject**)output, requiredsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003361 goto onError;
3362 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003363 }
3364 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003365 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003366 Py_UNICODE_COPY(*outptr, repptr, repsize);
3367 *outptr += repsize;
3368 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003369
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003370 /* we made it! */
3371 res = 0;
3372
Benjamin Peterson29060642009-01-31 22:14:21 +00003373 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003374 Py_XDECREF(restuple);
3375 return res;
3376}
3377
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003378/* --- UTF-7 Codec -------------------------------------------------------- */
3379
Antoine Pitrou244651a2009-05-04 18:56:13 +00003380/* See RFC2152 for details. We encode conservatively and decode liberally. */
3381
/* Three simple macros defining base-64. */

/* Is c one of the 64 base-64 characters (A-Z, a-z, 0-9, '+', '/')? */

#define IS_BASE64(c) \
    ((c) == '+' || (c) == '/' ||       \
     ('0' <= (c) && (c) <= '9') ||     \
     ('A' <= (c) && (c) <= 'Z') ||     \
     ('a' <= (c) && (c) <= 'z'))

/* Base-64 value of c, assuming that c is a base-64 character
   (any other input yields 63, the value of '/'). */

#define FROM_BASE64(c) \
    (((c) == '+') ? 62 :                              \
     ('0' <= (c) && (c) <= '9') ? (c) - '0' + 52 :    \
     ('a' <= (c) && (c) <= 'z') ? (c) - 'a' + 26 :    \
     ('A' <= (c) && (c) <= 'Z') ? (c) - 'A' : 63)

/* Base-64 character for the bottom 6 bits of n. */

#define TO_BASE64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"      \
     "abcdefghijklmnopqrstuvwxyz"      \
     "0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: does this byte, encountered in a UTF-7 string, decode
 * as itself?  Decoding is permissive: every ASCII byte decodes to
 * itself except '+', which begins a base64 string. */

#define DECODE_DIRECT(c) \
    ((c) != '+' && (c) < 128)
3412
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, so it is declared const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c:        character to classify; only 1..127 can ever be direct.
 * directO:  non-zero if Set O characters are encoded as themselves.
 * directWS: non-zero if whitespace is encoded as itself.
 *
 * The flag arguments are parenthesized so that compound expressions
 * (e.g. "a || b") are evaluated as a unit. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003458
Alexander Belopolsky40018472011-02-26 01:02:56 +00003459PyObject *
3460PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003461 Py_ssize_t size,
3462 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003463{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003464 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3465}
3466
Antoine Pitrou244651a2009-05-04 18:56:13 +00003467/* The decoder. The only state we preserve is our read position,
3468 * i.e. how many characters we have consumed. So if we end in the
3469 * middle of a shift sequence we have to back off the read position
3470 * and the output to the beginning of the sequence, otherwise we lose
3471 * all the shift state (seen bits, number of bits seen, high
3472 * surrogate). */
3473
Alexander Belopolsky40018472011-02-26 01:02:56 +00003474PyObject *
3475PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003476 Py_ssize_t size,
3477 const char *errors,
3478 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003479{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003480 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003481 Py_ssize_t startinpos;
3482 Py_ssize_t endinpos;
3483 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003484 const char *e;
3485 PyUnicodeObject *unicode;
3486 Py_UNICODE *p;
3487 const char *errmsg = "";
3488 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003489 Py_UNICODE *shiftOutStart;
3490 unsigned int base64bits = 0;
3491 unsigned long base64buffer = 0;
3492 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003493 PyObject *errorHandler = NULL;
3494 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003495
3496 unicode = _PyUnicode_New(size);
3497 if (!unicode)
3498 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003499 if (size == 0) {
3500 if (consumed)
3501 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003502 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003503 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003504
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003505 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitrou244651a2009-05-04 18:56:13 +00003506 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003507 e = s + size;
3508
3509 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003510 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00003511 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00003512 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003513
Antoine Pitrou244651a2009-05-04 18:56:13 +00003514 if (inShift) { /* in a base-64 section */
3515 if (IS_BASE64(ch)) { /* consume a base-64 character */
3516 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3517 base64bits += 6;
3518 s++;
3519 if (base64bits >= 16) {
3520 /* we have enough bits for a UTF-16 value */
3521 Py_UNICODE outCh = (Py_UNICODE)
3522 (base64buffer >> (base64bits-16));
3523 base64bits -= 16;
3524 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3525 if (surrogate) {
3526 /* expecting a second surrogate */
3527 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3528#ifdef Py_UNICODE_WIDE
3529 *p++ = (((surrogate & 0x3FF)<<10)
3530 | (outCh & 0x3FF)) + 0x10000;
3531#else
3532 *p++ = surrogate;
3533 *p++ = outCh;
3534#endif
3535 surrogate = 0;
3536 }
3537 else {
3538 surrogate = 0;
3539 errmsg = "second surrogate missing";
3540 goto utf7Error;
3541 }
3542 }
3543 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
3544 /* first surrogate */
3545 surrogate = outCh;
3546 }
3547 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3548 errmsg = "unexpected second surrogate";
3549 goto utf7Error;
3550 }
3551 else {
3552 *p++ = outCh;
3553 }
3554 }
3555 }
3556 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003557 inShift = 0;
3558 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003559 if (surrogate) {
3560 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00003561 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003562 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003563 if (base64bits > 0) { /* left-over bits */
3564 if (base64bits >= 6) {
3565 /* We've seen at least one base-64 character */
3566 errmsg = "partial character in shift sequence";
3567 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003568 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003569 else {
3570 /* Some bits remain; they should be zero */
3571 if (base64buffer != 0) {
3572 errmsg = "non-zero padding bits in shift sequence";
3573 goto utf7Error;
3574 }
3575 }
3576 }
3577 if (ch != '-') {
3578 /* '-' is absorbed; other terminating
3579 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003580 *p++ = ch;
3581 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003582 }
3583 }
3584 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003585 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003586 s++; /* consume '+' */
3587 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003588 s++;
3589 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00003590 }
3591 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003592 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003593 shiftOutStart = p;
3594 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003595 }
3596 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003597 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003598 *p++ = ch;
3599 s++;
3600 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003601 else {
3602 startinpos = s-starts;
3603 s++;
3604 errmsg = "unexpected special character";
3605 goto utf7Error;
3606 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003607 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003608utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003609 outpos = p-PyUnicode_AS_UNICODE(unicode);
3610 endinpos = s-starts;
3611 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003612 errors, &errorHandler,
3613 "utf7", errmsg,
3614 &starts, &e, &startinpos, &endinpos, &exc, &s,
3615 &unicode, &outpos, &p))
3616 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003617 }
3618
Antoine Pitrou244651a2009-05-04 18:56:13 +00003619 /* end of string */
3620
3621 if (inShift && !consumed) { /* in shift sequence, no more to follow */
3622 /* if we're in an inconsistent state, that's an error */
3623 if (surrogate ||
3624 (base64bits >= 6) ||
3625 (base64bits > 0 && base64buffer != 0)) {
3626 outpos = p-PyUnicode_AS_UNICODE(unicode);
3627 endinpos = size;
3628 if (unicode_decode_call_errorhandler(
3629 errors, &errorHandler,
3630 "utf7", "unterminated shift sequence",
3631 &starts, &e, &startinpos, &endinpos, &exc, &s,
3632 &unicode, &outpos, &p))
3633 goto onError;
3634 if (s < e)
3635 goto restart;
3636 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003637 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003638
3639 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003640 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003641 if (inShift) {
3642 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003643 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003644 }
3645 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003646 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003647 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003648 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003649
Victor Stinnerfe226c02011-10-03 03:52:20 +02003650 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003651 goto onError;
3652
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003653 Py_XDECREF(errorHandler);
3654 Py_XDECREF(exc);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02003655 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003656 Py_DECREF(unicode);
3657 return NULL;
3658 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003659 return (PyObject *)unicode;
3660
Benjamin Peterson29060642009-01-31 22:14:21 +00003661 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003662 Py_XDECREF(errorHandler);
3663 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003664 Py_DECREF(unicode);
3665 return NULL;
3666}
3667
3668
Alexander Belopolsky40018472011-02-26 01:02:56 +00003669PyObject *
3670PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003671 Py_ssize_t size,
3672 int base64SetO,
3673 int base64WhiteSpace,
3674 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003675{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003676 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003677 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003678 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003679 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003680 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003681 unsigned int base64bits = 0;
3682 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003683 char * out;
3684 char * start;
3685
3686 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003687 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003688
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003689 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003690 return PyErr_NoMemory();
3691
Antoine Pitrou244651a2009-05-04 18:56:13 +00003692 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003693 if (v == NULL)
3694 return NULL;
3695
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003696 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003697 for (;i < size; ++i) {
3698 Py_UNICODE ch = s[i];
3699
Antoine Pitrou244651a2009-05-04 18:56:13 +00003700 if (inShift) {
3701 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3702 /* shifting out */
3703 if (base64bits) { /* output remaining bits */
3704 *out++ = TO_BASE64(base64buffer << (6-base64bits));
3705 base64buffer = 0;
3706 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003707 }
3708 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003709 /* Characters not in the BASE64 set implicitly unshift the sequence
3710 so no '-' is required, except if the character is itself a '-' */
3711 if (IS_BASE64(ch) || ch == '-') {
3712 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003713 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003714 *out++ = (char) ch;
3715 }
3716 else {
3717 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00003718 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003719 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003720 else { /* not in a shift sequence */
3721 if (ch == '+') {
3722 *out++ = '+';
3723 *out++ = '-';
3724 }
3725 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3726 *out++ = (char) ch;
3727 }
3728 else {
3729 *out++ = '+';
3730 inShift = 1;
3731 goto encode_char;
3732 }
3733 }
3734 continue;
3735encode_char:
3736#ifdef Py_UNICODE_WIDE
3737 if (ch >= 0x10000) {
3738 /* code first surrogate */
3739 base64bits += 16;
3740 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
3741 while (base64bits >= 6) {
3742 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3743 base64bits -= 6;
3744 }
3745 /* prepare second surrogate */
3746 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
3747 }
3748#endif
3749 base64bits += 16;
3750 base64buffer = (base64buffer << 16) | ch;
3751 while (base64bits >= 6) {
3752 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3753 base64bits -= 6;
3754 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00003755 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003756 if (base64bits)
3757 *out++= TO_BASE64(base64buffer << (6-base64bits) );
3758 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003759 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003760 if (_PyBytes_Resize(&v, out - start) < 0)
3761 return NULL;
3762 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003763}
3764
Antoine Pitrou244651a2009-05-04 18:56:13 +00003765#undef IS_BASE64
3766#undef FROM_BASE64
3767#undef TO_BASE64
3768#undef DECODE_DIRECT
3769#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003770
Guido van Rossumd57fd912000-03-10 22:53:23 +00003771/* --- UTF-8 Codec -------------------------------------------------------- */
3772
/* Total length, in bytes, of the UTF-8 sequence introduced by each possible
   lead byte.  An entry of 0 marks a byte that cannot start a sequence.
   See RFC 3629 for details. */
static char utf8_code_length[256] = {
    /* 00-7F: single-byte (ASCII) */
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,   /* 00-0F */
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,   /* 10-1F */
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,   /* 20-2F */
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,   /* 30-3F */
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,   /* 40-4F */
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,   /* 50-5F */
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,   /* 60-6F */
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,   /* 70-7F */
    /* 80-BF: continuation bytes, never valid as a lead byte */
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,   /* 80-8F */
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,   /* 90-9F */
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,   /* A0-AF */
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,   /* B0-BF */
    /* C0-C1 are illegal; C2-DF start two-byte sequences */
    0,0,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,   /* C0-CF */
    2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,   /* D0-DF */
    /* E0-EF start three-byte sequences */
    3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3,   /* E0-EF */
    /* F0-F4 start four-byte sequences; F5-FF are illegal */
    4,4,4,4,4,0,0,0, 0,0,0,0,0,0,0,0    /* F0-FF */
};
3794
Alexander Belopolsky40018472011-02-26 01:02:56 +00003795PyObject *
3796PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003797 Py_ssize_t size,
3798 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003799{
Walter Dörwald69652032004-09-07 20:24:22 +00003800 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3801}
3802
Antoine Pitrouab868312009-01-10 15:40:25 +00003803/* Mask to check or force alignment of a pointer to C 'long' boundaries */
3804#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
3805
3806/* Mask to quickly check whether a C 'long' contains a
3807 non-ASCII, UTF8-encoded char. */
3808#if (SIZEOF_LONG == 8)
3809# define ASCII_CHAR_MASK 0x8080808080808080L
3810#elif (SIZEOF_LONG == 4)
3811# define ASCII_CHAR_MASK 0x80808080L
3812#else
3813# error C 'long' size should be either 4 or 8!
3814#endif
3815
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003816/* Scans a UTF-8 string and returns the maximum character to be expected,
3817 the size of the decoded unicode string and if any major errors were
3818 encountered.
3819
3820 This function does check basic UTF-8 sanity, it does however NOT CHECK
3821 if the string contains surrogates, and if all continuation bytes are
3822 within the correct ranges, these checks are performed in
3823 PyUnicode_DecodeUTF8Stateful.
3824
3825 If it sets has_errors to 1, it means the value of unicode_size and max_char
3826 will be bogus and you should not rely on useful information in them.
3827 */
3828static Py_UCS4
3829utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
3830 Py_ssize_t *unicode_size, Py_ssize_t* consumed,
3831 int *has_errors)
3832{
3833 Py_ssize_t n;
3834 Py_ssize_t char_count = 0;
3835 Py_UCS4 max_char = 127, new_max;
3836 Py_UCS4 upper_bound;
3837 const unsigned char *p = (const unsigned char *)s;
3838 const unsigned char *end = p + string_size;
3839 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
3840 int err = 0;
3841
3842 for (; p < end && !err; ++p, ++char_count) {
3843 /* Only check value if it's not a ASCII char... */
3844 if (*p < 0x80) {
3845 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
3846 an explanation. */
3847 if (!((size_t) p & LONG_PTR_MASK)) {
3848 /* Help register allocation */
3849 register const unsigned char *_p = p;
3850 while (_p < aligned_end) {
3851 unsigned long value = *(unsigned long *) _p;
3852 if (value & ASCII_CHAR_MASK)
3853 break;
3854 _p += SIZEOF_LONG;
3855 char_count += SIZEOF_LONG;
3856 }
3857 p = _p;
3858 if (p == end)
3859 break;
3860 }
3861 }
3862 if (*p >= 0x80) {
3863 n = utf8_code_length[*p];
3864 new_max = max_char;
3865 switch (n) {
3866 /* invalid start byte */
3867 case 0:
3868 err = 1;
3869 break;
3870 case 2:
3871 /* Code points between 0x00FF and 0x07FF inclusive.
3872 Approximate the upper bound of the code point,
3873 if this flips over 255 we can be sure it will be more
3874 than 255 and the string will need 2 bytes per code coint,
3875 if it stays under or equal to 255, we can be sure 1 byte
3876 is enough.
3877 ((*p & 0b00011111) << 6) | 0b00111111 */
3878 upper_bound = ((*p & 0x1F) << 6) | 0x3F;
3879 if (max_char < upper_bound)
3880 new_max = upper_bound;
3881 /* Ensure we track at least that we left ASCII space. */
3882 if (new_max < 128)
3883 new_max = 128;
3884 break;
3885 case 3:
3886 /* Between 0x0FFF and 0xFFFF inclusive, so values are
3887 always > 255 and <= 65535 and will always need 2 bytes. */
3888 if (max_char < 65535)
3889 new_max = 65535;
3890 break;
3891 case 4:
3892 /* Code point will be above 0xFFFF for sure in this case. */
3893 new_max = 65537;
3894 break;
3895 /* Internal error, this should be caught by the first if */
3896 case 1:
3897 default:
3898 assert(0 && "Impossible case in utf8_max_char_and_size");
3899 err = 1;
3900 }
3901 /* Instead of number of overall bytes for this code point,
3902 n containts the number of following bytes: */
3903 --n;
3904 /* Check if the follow up chars are all valid continuation bytes */
3905 if (n >= 1) {
3906 const unsigned char *cont;
3907 if ((p + n) >= end) {
3908 if (consumed == 0)
3909 /* incomplete data, non-incremental decoding */
3910 err = 1;
3911 break;
3912 }
3913 for (cont = p + 1; cont < (p + n); ++cont) {
3914 if ((*cont & 0xc0) != 0x80) {
3915 err = 1;
3916 break;
3917 }
3918 }
3919 p += n;
3920 }
3921 else
3922 err = 1;
3923 max_char = new_max;
3924 }
3925 }
3926
3927 if (unicode_size)
3928 *unicode_size = char_count;
3929 if (has_errors)
3930 *has_errors = err;
3931 return max_char;
3932}
3933
/* Store `value` at position `index` of `buf`, picking the element type from
   `kind`.  Works like PyUnicode_WRITE, but additionally understands
   PyUnicode_WCHAR_KIND, in which case `buf` is the wstr (Py_UNICODE) buffer
   of the legacy unicode representation.  `kind` is evaluated once, and
   `index` and `value` are evaluated once in whichever branch is taken. */
#define WRITE_FLEXIBLE_OR_WSTR(kind, buf, index, value)                     \
    do {                                                                    \
        const int kind_ = (kind);                                           \
        if (kind_ == PyUnicode_WCHAR_KIND)                                  \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value);           \
        else if (kind_ == PyUnicode_1BYTE_KIND)                             \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value);     \
        else if (kind_ == PyUnicode_2BYTE_KIND)                             \
            ((Py_UCS2 *)(buf))[(index)] = (Py_UCS2)(value);                 \
        else                                                                \
            ((Py_UCS4 *)(buf))[(index)] = (Py_UCS4)(value);                 \
    } while (0)
3948
Alexander Belopolsky40018472011-02-26 01:02:56 +00003949PyObject *
3950PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003951 Py_ssize_t size,
3952 const char *errors,
3953 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00003954{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003955 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003956 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00003957 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003958 Py_ssize_t startinpos;
3959 Py_ssize_t endinpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00003960 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003961 PyUnicodeObject *unicode;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003962 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003963 PyObject *errorHandler = NULL;
3964 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003965 Py_UCS4 maxchar = 0;
3966 Py_ssize_t unicode_size;
3967 Py_ssize_t i;
3968 int kind;
3969 void *data;
3970 int has_errors;
3971 Py_UNICODE *error_outptr;
3972#if SIZEOF_WCHAR_T == 2
3973 Py_ssize_t wchar_offset = 0;
3974#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003975
Walter Dörwald69652032004-09-07 20:24:22 +00003976 if (size == 0) {
3977 if (consumed)
3978 *consumed = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003979 return (PyObject *)PyUnicode_New(0, 0);
Walter Dörwald69652032004-09-07 20:24:22 +00003980 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003981 maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size,
3982 consumed, &has_errors);
3983 if (has_errors) {
3984 unicode = _PyUnicode_New(size);
3985 if (!unicode)
3986 return NULL;
3987 kind = PyUnicode_WCHAR_KIND;
3988 data = PyUnicode_AS_UNICODE(unicode);
3989 assert(data != NULL);
3990 }
3991 else {
3992 unicode = (PyUnicodeObject *)PyUnicode_New(unicode_size, maxchar);
3993 if (!unicode)
3994 return NULL;
3995 /* When the string is ASCII only, just use memcpy and return.
3996 unicode_size may be != size if there is an incomplete UTF-8
3997 sequence at the end of the ASCII block. */
3998 if (maxchar < 128 && size == unicode_size) {
3999 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
4000 return (PyObject *)unicode;
4001 }
4002 kind = PyUnicode_KIND(unicode);
4003 data = PyUnicode_DATA(unicode);
4004 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004005 /* Unpack UTF-8 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004006 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004007 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00004008 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004009
4010 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004011 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004012
4013 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00004014 /* Fast path for runs of ASCII characters. Given that common UTF-8
4015 input will consist of an overwhelming majority of ASCII
4016 characters, we try to optimize for this case by checking
4017 as many characters as a C 'long' can contain.
4018 First, check if we can do an aligned read, as most CPUs have
4019 a penalty for unaligned reads.
4020 */
4021 if (!((size_t) s & LONG_PTR_MASK)) {
4022 /* Help register allocation */
4023 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004024 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004025 while (_s < aligned_end) {
4026 /* Read a whole long at a time (either 4 or 8 bytes),
4027 and do a fast unrolled copy if it only contains ASCII
4028 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004029 unsigned long value = *(unsigned long *) _s;
4030 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00004031 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004032 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+0, _s[0]);
4033 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+1, _s[1]);
4034 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+2, _s[2]);
4035 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004036#if (SIZEOF_LONG == 8)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004037 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+4, _s[4]);
4038 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+5, _s[5]);
4039 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+6, _s[6]);
4040 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004041#endif
4042 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004043 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00004044 }
4045 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004046 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004047 if (s == e)
4048 break;
4049 ch = (unsigned char)*s;
4050 }
4051 }
4052
4053 if (ch < 0x80) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004054 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004055 s++;
4056 continue;
4057 }
4058
4059 n = utf8_code_length[ch];
4060
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004061 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004062 if (consumed)
4063 break;
4064 else {
4065 errmsg = "unexpected end of data";
4066 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004067 endinpos = startinpos+1;
4068 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
4069 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004070 goto utf8Error;
4071 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004072 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004073
4074 switch (n) {
4075
4076 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00004077 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004078 startinpos = s-starts;
4079 endinpos = startinpos+1;
4080 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004081
4082 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004083 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00004084 startinpos = s-starts;
4085 endinpos = startinpos+1;
4086 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004087
4088 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004089 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00004090 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004091 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004092 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00004093 goto utf8Error;
4094 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004095 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004096 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004097 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004098 break;
4099
4100 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00004101 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4102 will result in surrogates in range d800-dfff. Surrogates are
4103 not valid UTF-8 so they are rejected.
4104 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4105 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00004106 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004107 (s[2] & 0xc0) != 0x80 ||
4108 ((unsigned char)s[0] == 0xE0 &&
4109 (unsigned char)s[1] < 0xA0) ||
4110 ((unsigned char)s[0] == 0xED &&
4111 (unsigned char)s[1] > 0x9F)) {
4112 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004113 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004114 endinpos = startinpos + 1;
4115
4116 /* if s[1] first two bits are 1 and 0, then the invalid
4117 continuation byte is s[2], so increment endinpos by 1,
4118 if not, s[1] is invalid and endinpos doesn't need to
4119 be incremented. */
4120 if ((s[1] & 0xC0) == 0x80)
4121 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004122 goto utf8Error;
4123 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004124 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004125 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004126 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004127 break;
4128
4129 case 4:
4130 if ((s[1] & 0xc0) != 0x80 ||
4131 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004132 (s[3] & 0xc0) != 0x80 ||
4133 ((unsigned char)s[0] == 0xF0 &&
4134 (unsigned char)s[1] < 0x90) ||
4135 ((unsigned char)s[0] == 0xF4 &&
4136 (unsigned char)s[1] > 0x8F)) {
4137 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004138 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004139 endinpos = startinpos + 1;
4140 if ((s[1] & 0xC0) == 0x80) {
4141 endinpos++;
4142 if ((s[2] & 0xC0) == 0x80)
4143 endinpos++;
4144 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004145 goto utf8Error;
4146 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004147 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00004148 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4149 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4150
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004151 /* If the string is flexible or we have native UCS-4, write
4152 directly.. */
4153 if (sizeof(Py_UNICODE) > 2 || kind != PyUnicode_WCHAR_KIND)
4154 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Tim Petersced69f82003-09-16 20:30:58 +00004155
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004156 else {
4157 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00004158
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004159 /* translate from 10000..10FFFF to 0..FFFF */
4160 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00004161
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004162 /* high surrogate = top 10 bits added to D800 */
4163 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4164 (Py_UNICODE)(0xD800 + (ch >> 10)));
4165
4166 /* low surrogate = bottom 10 bits added to DC00 */
4167 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4168 (Py_UNICODE)(0xDC00 + (ch & 0x03FF)));
4169 }
4170#if SIZEOF_WCHAR_T == 2
4171 wchar_offset++;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004172#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00004173 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004174 }
4175 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00004176 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004177
Benjamin Peterson29060642009-01-31 22:14:21 +00004178 utf8Error:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004179 /* If this is not yet a resizable string, make it one.. */
4180 if (kind != PyUnicode_WCHAR_KIND) {
4181 const Py_UNICODE *u;
4182 PyUnicodeObject *new_unicode = _PyUnicode_New(size);
4183 if (!new_unicode)
4184 goto onError;
4185 u = PyUnicode_AsUnicode((PyObject *)unicode);
4186 if (!u)
4187 goto onError;
4188#if SIZEOF_WCHAR_T == 2
4189 i += wchar_offset;
4190#endif
4191 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(new_unicode), u, i);
4192 Py_DECREF(unicode);
4193 unicode = new_unicode;
4194 kind = 0;
4195 data = PyUnicode_AS_UNICODE(new_unicode);
4196 assert(data != NULL);
4197 }
4198 error_outptr = PyUnicode_AS_UNICODE(unicode) + i;
Benjamin Peterson29060642009-01-31 22:14:21 +00004199 if (unicode_decode_call_errorhandler(
4200 errors, &errorHandler,
4201 "utf8", errmsg,
4202 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004203 &unicode, &i, &error_outptr))
Benjamin Peterson29060642009-01-31 22:14:21 +00004204 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004205 /* Update data because unicode_decode_call_errorhandler might have
4206 re-created or resized the unicode object. */
4207 data = PyUnicode_AS_UNICODE(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00004208 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004209 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004210 /* Ensure the unicode_size calculation above was correct: */
4211 assert(kind == PyUnicode_WCHAR_KIND || i == unicode_size);
4212
Walter Dörwald69652032004-09-07 20:24:22 +00004213 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004214 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004215
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004216 /* Adjust length and ready string when it contained errors and
4217 is of the old resizable kind. */
4218 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004219 if (PyUnicode_Resize((PyObject**)&unicode, i) < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004220 goto onError;
4221 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004222
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004223 Py_XDECREF(errorHandler);
4224 Py_XDECREF(exc);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004225 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004226 Py_DECREF(unicode);
4227 return NULL;
4228 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004229 return (PyObject *)unicode;
4230
Benjamin Peterson29060642009-01-31 22:14:21 +00004231 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004232 Py_XDECREF(errorHandler);
4233 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004234 Py_DECREF(unicode);
4235 return NULL;
4236}
4237
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004238#undef WRITE_FLEXIBLE_OR_WSTR
Antoine Pitrouab868312009-01-10 15:40:25 +00004239
#ifdef __APPLE__

/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   Decodes `size` bytes at `s` into a freshly PyMem_Malloc'ed,
   NUL-terminated wchar_t array.  Whenever the bytes at the current position
   do not form a valid sequence, the single byte b at that position is
   emitted as 0xDC00 + b and decoding resumes at the next byte.
   Returns NULL if the buffer size would overflow or the allocation fails. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    const char *end;
    wchar_t *buffer, *out;

    /* Note: size will always be longer than the resulting Unicode
       character count */
    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
        PyErr_NoMemory();
        return NULL;
    }
    buffer = PyMem_Malloc((size + 1) * sizeof(wchar_t));
    if (!buffer)
        return NULL;

    /* Unpack UTF-8 encoded data */
    out = buffer;
    for (end = s + size; s < end; ) {
        /* first byte of the sequence; kept unmodified for the escape path */
        const Py_UCS4 lead = (unsigned char)*s;
        Py_UCS4 ch;
        int seqlen;
        int valid = 0;

        if (lead < 0x80) {
            *out++ = (wchar_t)lead;
            s++;
            continue;
        }

        /* For lead >= 0x80 the table yields 0, 2, 3 or 4. */
        seqlen = utf8_code_length[lead];

        /* Only look at the following bytes when the whole sequence lies
           inside the input; a truncated sequence is escaped. */
        if (s + seqlen <= end) {
            switch (seqlen) {
            case 2:
                if ((s[1] & 0xc0) == 0x80) {
                    ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
                    assert ((ch > 0x007F) && (ch <= 0x07FF));
                    *out++ = (wchar_t)ch;
                    valid = 1;
                }
                break;

            case 3:
                /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
                   will result in surrogates in range d800-dfff. Surrogates are
                   not valid UTF-8 so they are rejected.
                   See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
                   (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
                if ((s[1] & 0xc0) == 0x80 &&
                    (s[2] & 0xc0) == 0x80 &&
                    !((unsigned char)s[0] == 0xE0 &&
                      (unsigned char)s[1] < 0xA0) &&
                    !((unsigned char)s[0] == 0xED &&
                      (unsigned char)s[1] > 0x9F)) {
                    ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
                    assert ((ch > 0x07FF) && (ch <= 0xFFFF));
                    *out++ = (wchar_t)ch;
                    valid = 1;
                }
                break;

            case 4:
                if ((s[1] & 0xc0) == 0x80 &&
                    (s[2] & 0xc0) == 0x80 &&
                    (s[3] & 0xc0) == 0x80 &&
                    !((unsigned char)s[0] == 0xF0 &&
                      (unsigned char)s[1] < 0x90) &&
                    !((unsigned char)s[0] == 0xF4 &&
                      (unsigned char)s[1] > 0x8F)) {
                    ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
                         ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
                    assert ((ch > 0xFFFF) && (ch <= 0x10ffff));

#if SIZEOF_WCHAR_T == 4
                    *out++ = (wchar_t)ch;
#else
                    /* compute and append the two surrogates: */

                    /* translate from 10000..10FFFF to 0..FFFF */
                    ch -= 0x10000;

                    /* high surrogate = top 10 bits added to D800 */
                    *out++ = (wchar_t)(0xD800 + (ch >> 10));

                    /* low surrogate = bottom 10 bits added to DC00 */
                    *out++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
#endif
                    valid = 1;
                }
                break;

            default:
                /* 0 (and 1): not a legal start byte, escape it */
                break;
            }
        }

        if (valid) {
            s += seqlen;
        }
        else {
            /* surrogateescape: map the offending byte to U+DC80..U+DCFF */
            *out++ = 0xDC00 + lead;
            s++;
        }
    }
    *out = L'\0';
    return buffer;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004354
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004355/* Primary internal function which creates utf8 encoded bytes objects.
4356
4357 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004358 and allocate exactly as much space needed at the end. Else allocate the
4359 maximum possible needed (4 result bytes per Unicode character), and return
4360 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004361*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004362PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004363_PyUnicode_AsUTF8String(PyObject *obj, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004364{
Tim Peters602f7402002-04-27 18:03:26 +00004365#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00004366
Guido van Rossum98297ee2007-11-06 21:34:58 +00004367 Py_ssize_t i; /* index into s of next input byte */
4368 PyObject *result; /* result string object */
4369 char *p; /* next free byte in output buffer */
4370 Py_ssize_t nallocated; /* number of result bytes allocated */
4371 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00004372 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004373 PyObject *errorHandler = NULL;
4374 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004375 int kind;
4376 void *data;
4377 Py_ssize_t size;
4378 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
4379#if SIZEOF_WCHAR_T == 2
4380 Py_ssize_t wchar_offset = 0;
4381#endif
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004382
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004383 if (!PyUnicode_Check(unicode)) {
4384 PyErr_BadArgument();
4385 return NULL;
4386 }
4387
4388 if (PyUnicode_READY(unicode) == -1)
4389 return NULL;
4390
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004391 if (PyUnicode_UTF8(unicode))
4392 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4393 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004394
4395 kind = PyUnicode_KIND(unicode);
4396 data = PyUnicode_DATA(unicode);
4397 size = PyUnicode_GET_LENGTH(unicode);
4398
Tim Peters602f7402002-04-27 18:03:26 +00004399 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004400
Tim Peters602f7402002-04-27 18:03:26 +00004401 if (size <= MAX_SHORT_UNICHARS) {
4402 /* Write into the stack buffer; nallocated can't overflow.
4403 * At the end, we'll allocate exactly as much heap space as it
4404 * turns out we need.
4405 */
4406 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004407 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00004408 p = stackbuf;
4409 }
4410 else {
4411 /* Overallocate on the heap, and give the excess back at the end. */
4412 nallocated = size * 4;
4413 if (nallocated / 4 != size) /* overflow! */
4414 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00004415 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004416 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00004417 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00004418 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004419 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004420
Tim Peters602f7402002-04-27 18:03:26 +00004421 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004422 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004423
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004424 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00004425 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004426 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004427
Guido van Rossumd57fd912000-03-10 22:53:23 +00004428 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00004429 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00004430 *p++ = (char)(0xc0 | (ch >> 6));
4431 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00004432 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004433 Py_ssize_t newpos;
4434 PyObject *rep;
4435 Py_ssize_t repsize, k, startpos;
4436 startpos = i-1;
4437#if SIZEOF_WCHAR_T == 2
4438 startpos += wchar_offset;
Victor Stinner445a6232010-04-22 20:01:57 +00004439#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004440 rep = unicode_encode_call_errorhandler(
4441 errors, &errorHandler, "utf-8", "surrogates not allowed",
4442 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
4443 &exc, startpos, startpos+1, &newpos);
4444 if (!rep)
4445 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004446
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004447 if (PyBytes_Check(rep))
4448 repsize = PyBytes_GET_SIZE(rep);
4449 else
4450 repsize = PyUnicode_GET_SIZE(rep);
4451
4452 if (repsize > 4) {
4453 Py_ssize_t offset;
4454
4455 if (result == NULL)
4456 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004457 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004458 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004459
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004460 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4461 /* integer overflow */
4462 PyErr_NoMemory();
4463 goto error;
4464 }
4465 nallocated += repsize - 4;
4466 if (result != NULL) {
4467 if (_PyBytes_Resize(&result, nallocated) < 0)
4468 goto error;
4469 } else {
4470 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004471 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004472 goto error;
4473 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4474 }
4475 p = PyBytes_AS_STRING(result) + offset;
4476 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004477
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004478 if (PyBytes_Check(rep)) {
4479 char *prep = PyBytes_AS_STRING(rep);
4480 for(k = repsize; k > 0; k--)
4481 *p++ = *prep++;
4482 } else /* rep is unicode */ {
4483 const Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
4484 Py_UNICODE c;
4485
4486 for(k=0; k<repsize; k++) {
4487 c = prep[k];
4488 if (0x80 <= c) {
4489 raise_encode_exception(&exc, "utf-8",
4490 PyUnicode_AS_UNICODE(unicode),
4491 size, i-1, i,
4492 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004493 goto error;
4494 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004495 *p++ = (char)prep[k];
Victor Stinner31be90b2010-04-22 19:38:16 +00004496 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004497 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004498 Py_DECREF(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004499 } else if (ch < 0x10000) {
4500 *p++ = (char)(0xe0 | (ch >> 12));
4501 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4502 *p++ = (char)(0x80 | (ch & 0x3f));
4503 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00004504 /* Encode UCS4 Unicode ordinals */
4505 *p++ = (char)(0xf0 | (ch >> 18));
4506 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4507 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4508 *p++ = (char)(0x80 | (ch & 0x3f));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004509#if SIZEOF_WCHAR_T == 2
4510 wchar_offset++;
4511#endif
Tim Peters602f7402002-04-27 18:03:26 +00004512 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004513 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004514
Guido van Rossum98297ee2007-11-06 21:34:58 +00004515 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004516 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004517 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004518 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004519 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004520 }
4521 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004522 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004523 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004524 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004525 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004526 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004527
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004528 Py_XDECREF(errorHandler);
4529 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004530 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004531 error:
4532 Py_XDECREF(errorHandler);
4533 Py_XDECREF(exc);
4534 Py_XDECREF(result);
4535 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004536
Tim Peters602f7402002-04-27 18:03:26 +00004537#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004538}
4539
Alexander Belopolsky40018472011-02-26 01:02:56 +00004540PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004541PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4542 Py_ssize_t size,
4543 const char *errors)
4544{
4545 PyObject *v, *unicode;
4546
4547 unicode = PyUnicode_FromUnicode(s, size);
4548 if (unicode == NULL)
4549 return NULL;
4550 v = _PyUnicode_AsUTF8String(unicode, errors);
4551 Py_DECREF(unicode);
4552 return v;
4553}
4554
4555PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004556PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004557{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004558 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004559}
4560
Walter Dörwald41980ca2007-08-16 21:55:45 +00004561/* --- UTF-32 Codec ------------------------------------------------------- */
4562
4563PyObject *
4564PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004565 Py_ssize_t size,
4566 const char *errors,
4567 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004568{
4569 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4570}
4571
4572PyObject *
4573PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004574 Py_ssize_t size,
4575 const char *errors,
4576 int *byteorder,
4577 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004578{
4579 const char *starts = s;
4580 Py_ssize_t startinpos;
4581 Py_ssize_t endinpos;
4582 Py_ssize_t outpos;
4583 PyUnicodeObject *unicode;
4584 Py_UNICODE *p;
4585#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004586 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004587 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004588#else
4589 const int pairs = 0;
4590#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00004591 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004592 int bo = 0; /* assume native ordering by default */
4593 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004594 /* Offsets from q for retrieving bytes in the right order. */
4595#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4596 int iorder[] = {0, 1, 2, 3};
4597#else
4598 int iorder[] = {3, 2, 1, 0};
4599#endif
4600 PyObject *errorHandler = NULL;
4601 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004602
Walter Dörwald41980ca2007-08-16 21:55:45 +00004603 q = (unsigned char *)s;
4604 e = q + size;
4605
4606 if (byteorder)
4607 bo = *byteorder;
4608
4609 /* Check for BOM marks (U+FEFF) in the input and adjust current
4610 byte order setting accordingly. In native mode, the leading BOM
4611 mark is skipped, in all other modes, it is copied to the output
4612 stream as-is (giving a ZWNBSP character). */
4613 if (bo == 0) {
4614 if (size >= 4) {
4615 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004616 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004617#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004618 if (bom == 0x0000FEFF) {
4619 q += 4;
4620 bo = -1;
4621 }
4622 else if (bom == 0xFFFE0000) {
4623 q += 4;
4624 bo = 1;
4625 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004626#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004627 if (bom == 0x0000FEFF) {
4628 q += 4;
4629 bo = 1;
4630 }
4631 else if (bom == 0xFFFE0000) {
4632 q += 4;
4633 bo = -1;
4634 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004635#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004636 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004637 }
4638
4639 if (bo == -1) {
4640 /* force LE */
4641 iorder[0] = 0;
4642 iorder[1] = 1;
4643 iorder[2] = 2;
4644 iorder[3] = 3;
4645 }
4646 else if (bo == 1) {
4647 /* force BE */
4648 iorder[0] = 3;
4649 iorder[1] = 2;
4650 iorder[2] = 1;
4651 iorder[3] = 0;
4652 }
4653
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004654 /* On narrow builds we split characters outside the BMP into two
4655 codepoints => count how much extra space we need. */
4656#ifndef Py_UNICODE_WIDE
4657 for (qq = q; qq < e; qq += 4)
4658 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
4659 pairs++;
4660#endif
4661
4662 /* This might be one to much, because of a BOM */
4663 unicode = _PyUnicode_New((size+3)/4+pairs);
4664 if (!unicode)
4665 return NULL;
4666 if (size == 0)
4667 return (PyObject *)unicode;
4668
4669 /* Unpack UTF-32 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004670 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004671
Walter Dörwald41980ca2007-08-16 21:55:45 +00004672 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004673 Py_UCS4 ch;
4674 /* remaining bytes at the end? (size should be divisible by 4) */
4675 if (e-q<4) {
4676 if (consumed)
4677 break;
4678 errmsg = "truncated data";
4679 startinpos = ((const char *)q)-starts;
4680 endinpos = ((const char *)e)-starts;
4681 goto utf32Error;
4682 /* The remaining input chars are ignored if the callback
4683 chooses to skip the input */
4684 }
4685 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4686 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004687
Benjamin Peterson29060642009-01-31 22:14:21 +00004688 if (ch >= 0x110000)
4689 {
4690 errmsg = "codepoint not in range(0x110000)";
4691 startinpos = ((const char *)q)-starts;
4692 endinpos = startinpos+4;
4693 goto utf32Error;
4694 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004695#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004696 if (ch >= 0x10000)
4697 {
4698 *p++ = 0xD800 | ((ch-0x10000) >> 10);
4699 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
4700 }
4701 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00004702#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004703 *p++ = ch;
4704 q += 4;
4705 continue;
4706 utf32Error:
4707 outpos = p-PyUnicode_AS_UNICODE(unicode);
4708 if (unicode_decode_call_errorhandler(
4709 errors, &errorHandler,
4710 "utf32", errmsg,
4711 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
4712 &unicode, &outpos, &p))
4713 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004714 }
4715
4716 if (byteorder)
4717 *byteorder = bo;
4718
4719 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004720 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004721
4722 /* Adjust length */
Victor Stinnerfe226c02011-10-03 03:52:20 +02004723 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004724 goto onError;
4725
4726 Py_XDECREF(errorHandler);
4727 Py_XDECREF(exc);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004728 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004729 Py_DECREF(unicode);
4730 return NULL;
4731 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004732 return (PyObject *)unicode;
4733
Benjamin Peterson29060642009-01-31 22:14:21 +00004734 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00004735 Py_DECREF(unicode);
4736 Py_XDECREF(errorHandler);
4737 Py_XDECREF(exc);
4738 return NULL;
4739}
4740
4741PyObject *
4742PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004743 Py_ssize_t size,
4744 const char *errors,
4745 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004746{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004747 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004748 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004749 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004750#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004751 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004752#else
4753 const int pairs = 0;
4754#endif
4755 /* Offsets from p for storing byte pairs in the right order. */
4756#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4757 int iorder[] = {0, 1, 2, 3};
4758#else
4759 int iorder[] = {3, 2, 1, 0};
4760#endif
4761
Benjamin Peterson29060642009-01-31 22:14:21 +00004762#define STORECHAR(CH) \
4763 do { \
4764 p[iorder[3]] = ((CH) >> 24) & 0xff; \
4765 p[iorder[2]] = ((CH) >> 16) & 0xff; \
4766 p[iorder[1]] = ((CH) >> 8) & 0xff; \
4767 p[iorder[0]] = (CH) & 0xff; \
4768 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00004769 } while(0)
4770
4771 /* In narrow builds we can output surrogate pairs as one codepoint,
4772 so we need less space. */
4773#ifndef Py_UNICODE_WIDE
4774 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00004775 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
4776 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
4777 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004778#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004779 nsize = (size - pairs + (byteorder == 0));
4780 bytesize = nsize * 4;
4781 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004782 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004783 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004784 if (v == NULL)
4785 return NULL;
4786
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004787 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004788 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004789 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004790 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004791 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004792
4793 if (byteorder == -1) {
4794 /* force LE */
4795 iorder[0] = 0;
4796 iorder[1] = 1;
4797 iorder[2] = 2;
4798 iorder[3] = 3;
4799 }
4800 else if (byteorder == 1) {
4801 /* force BE */
4802 iorder[0] = 3;
4803 iorder[1] = 2;
4804 iorder[2] = 1;
4805 iorder[3] = 0;
4806 }
4807
4808 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004809 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004810#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004811 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
4812 Py_UCS4 ch2 = *s;
4813 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
4814 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
4815 s++;
4816 size--;
4817 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004818 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004819#endif
4820 STORECHAR(ch);
4821 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004822
4823 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004824 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004825#undef STORECHAR
4826}
4827
Alexander Belopolsky40018472011-02-26 01:02:56 +00004828PyObject *
4829PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004830{
4831 if (!PyUnicode_Check(unicode)) {
4832 PyErr_BadArgument();
4833 return NULL;
4834 }
4835 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004836 PyUnicode_GET_SIZE(unicode),
4837 NULL,
4838 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004839}
4840
Guido van Rossumd57fd912000-03-10 22:53:23 +00004841/* --- UTF-16 Codec ------------------------------------------------------- */
4842
Tim Peters772747b2001-08-09 22:21:55 +00004843PyObject *
4844PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004845 Py_ssize_t size,
4846 const char *errors,
4847 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004848{
Walter Dörwald69652032004-09-07 20:24:22 +00004849 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
4850}
4851
Antoine Pitrouab868312009-01-10 15:40:25 +00004852/* Two masks for fast checking of whether a C 'long' may contain
4853 UTF16-encoded surrogate characters. This is an efficient heuristic,
4854 assuming that non-surrogate characters with a code point >= 0x8000 are
4855 rare in most input.
4856 FAST_CHAR_MASK is used when the input is in native byte ordering,
4857 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00004858*/
Antoine Pitrouab868312009-01-10 15:40:25 +00004859#if (SIZEOF_LONG == 8)
4860# define FAST_CHAR_MASK 0x8000800080008000L
4861# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
4862#elif (SIZEOF_LONG == 4)
4863# define FAST_CHAR_MASK 0x80008000L
4864# define SWAPPED_FAST_CHAR_MASK 0x00800080L
4865#else
4866# error C 'long' size should be either 4 or 8!
4867#endif
4868
Walter Dörwald69652032004-09-07 20:24:22 +00004869PyObject *
4870PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004871 Py_ssize_t size,
4872 const char *errors,
4873 int *byteorder,
4874 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00004875{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004876 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004877 Py_ssize_t startinpos;
4878 Py_ssize_t endinpos;
4879 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004880 PyUnicodeObject *unicode;
4881 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004882 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00004883 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00004884 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004885 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00004886 /* Offsets from q for retrieving byte pairs in the right order. */
4887#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4888 int ihi = 1, ilo = 0;
4889#else
4890 int ihi = 0, ilo = 1;
4891#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004892 PyObject *errorHandler = NULL;
4893 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004894
4895 /* Note: size will always be longer than the resulting Unicode
4896 character count */
4897 unicode = _PyUnicode_New(size);
4898 if (!unicode)
4899 return NULL;
4900 if (size == 0)
4901 return (PyObject *)unicode;
4902
4903 /* Unpack UTF-16 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004904 p = PyUnicode_AS_UNICODE(unicode);
Tim Peters772747b2001-08-09 22:21:55 +00004905 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00004906 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004907
4908 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00004909 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004910
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004911 /* Check for BOM marks (U+FEFF) in the input and adjust current
4912 byte order setting accordingly. In native mode, the leading BOM
4913 mark is skipped, in all other modes, it is copied to the output
4914 stream as-is (giving a ZWNBSP character). */
4915 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00004916 if (size >= 2) {
4917 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004918#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004919 if (bom == 0xFEFF) {
4920 q += 2;
4921 bo = -1;
4922 }
4923 else if (bom == 0xFFFE) {
4924 q += 2;
4925 bo = 1;
4926 }
Tim Petersced69f82003-09-16 20:30:58 +00004927#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004928 if (bom == 0xFEFF) {
4929 q += 2;
4930 bo = 1;
4931 }
4932 else if (bom == 0xFFFE) {
4933 q += 2;
4934 bo = -1;
4935 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004936#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004937 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004938 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004939
Tim Peters772747b2001-08-09 22:21:55 +00004940 if (bo == -1) {
4941 /* force LE */
4942 ihi = 1;
4943 ilo = 0;
4944 }
4945 else if (bo == 1) {
4946 /* force BE */
4947 ihi = 0;
4948 ilo = 1;
4949 }
Antoine Pitrouab868312009-01-10 15:40:25 +00004950#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4951 native_ordering = ilo < ihi;
4952#else
4953 native_ordering = ilo > ihi;
4954#endif
Tim Peters772747b2001-08-09 22:21:55 +00004955
Antoine Pitrouab868312009-01-10 15:40:25 +00004956 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00004957 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004958 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00004959 /* First check for possible aligned read of a C 'long'. Unaligned
4960 reads are more expensive, better to defer to another iteration. */
4961 if (!((size_t) q & LONG_PTR_MASK)) {
4962 /* Fast path for runs of non-surrogate chars. */
4963 register const unsigned char *_q = q;
4964 Py_UNICODE *_p = p;
4965 if (native_ordering) {
4966 /* Native ordering is simple: as long as the input cannot
4967 possibly contain a surrogate char, do an unrolled copy
4968 of several 16-bit code points to the target object.
4969 The non-surrogate check is done on several input bytes
4970 at a time (as many as a C 'long' can contain). */
4971 while (_q < aligned_end) {
4972 unsigned long data = * (unsigned long *) _q;
4973 if (data & FAST_CHAR_MASK)
4974 break;
4975 _p[0] = ((unsigned short *) _q)[0];
4976 _p[1] = ((unsigned short *) _q)[1];
4977#if (SIZEOF_LONG == 8)
4978 _p[2] = ((unsigned short *) _q)[2];
4979 _p[3] = ((unsigned short *) _q)[3];
4980#endif
4981 _q += SIZEOF_LONG;
4982 _p += SIZEOF_LONG / 2;
4983 }
4984 }
4985 else {
4986 /* Byteswapped ordering is similar, but we must decompose
4987 the copy bytewise, and take care of zero'ing out the
4988 upper bytes if the target object is in 32-bit units
4989 (that is, in UCS-4 builds). */
4990 while (_q < aligned_end) {
4991 unsigned long data = * (unsigned long *) _q;
4992 if (data & SWAPPED_FAST_CHAR_MASK)
4993 break;
4994 /* Zero upper bytes in UCS-4 builds */
4995#if (Py_UNICODE_SIZE > 2)
4996 _p[0] = 0;
4997 _p[1] = 0;
4998#if (SIZEOF_LONG == 8)
4999 _p[2] = 0;
5000 _p[3] = 0;
5001#endif
5002#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00005003 /* Issue #4916; UCS-4 builds on big endian machines must
5004 fill the two last bytes of each 4-byte unit. */
5005#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
5006# define OFF 2
5007#else
5008# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00005009#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00005010 ((unsigned char *) _p)[OFF + 1] = _q[0];
5011 ((unsigned char *) _p)[OFF + 0] = _q[1];
5012 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
5013 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
5014#if (SIZEOF_LONG == 8)
5015 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
5016 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
5017 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
5018 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
5019#endif
5020#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00005021 _q += SIZEOF_LONG;
5022 _p += SIZEOF_LONG / 2;
5023 }
5024 }
5025 p = _p;
5026 q = _q;
5027 if (q >= e)
5028 break;
5029 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005030 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005031
Benjamin Peterson14339b62009-01-31 16:36:08 +00005032 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00005033
5034 if (ch < 0xD800 || ch > 0xDFFF) {
5035 *p++ = ch;
5036 continue;
5037 }
5038
5039 /* UTF-16 code pair: */
5040 if (q > e) {
5041 errmsg = "unexpected end of data";
5042 startinpos = (((const char *)q) - 2) - starts;
5043 endinpos = ((const char *)e) + 1 - starts;
5044 goto utf16Error;
5045 }
5046 if (0xD800 <= ch && ch <= 0xDBFF) {
5047 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
5048 q += 2;
5049 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00005050#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005051 *p++ = ch;
5052 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005053#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005054 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005055#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005056 continue;
5057 }
5058 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005059 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00005060 startinpos = (((const char *)q)-4)-starts;
5061 endinpos = startinpos+2;
5062 goto utf16Error;
5063 }
5064
Benjamin Peterson14339b62009-01-31 16:36:08 +00005065 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005066 errmsg = "illegal encoding";
5067 startinpos = (((const char *)q)-2)-starts;
5068 endinpos = startinpos+2;
5069 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005070
Benjamin Peterson29060642009-01-31 22:14:21 +00005071 utf16Error:
5072 outpos = p - PyUnicode_AS_UNICODE(unicode);
5073 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005074 errors,
5075 &errorHandler,
5076 "utf16", errmsg,
5077 &starts,
5078 (const char **)&e,
5079 &startinpos,
5080 &endinpos,
5081 &exc,
5082 (const char **)&q,
5083 &unicode,
5084 &outpos,
5085 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00005086 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005087 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005088 /* remaining byte at the end? (size should be even) */
5089 if (e == q) {
5090 if (!consumed) {
5091 errmsg = "truncated data";
5092 startinpos = ((const char *)q) - starts;
5093 endinpos = ((const char *)e) + 1 - starts;
5094 outpos = p - PyUnicode_AS_UNICODE(unicode);
5095 if (unicode_decode_call_errorhandler(
5096 errors,
5097 &errorHandler,
5098 "utf16", errmsg,
5099 &starts,
5100 (const char **)&e,
5101 &startinpos,
5102 &endinpos,
5103 &exc,
5104 (const char **)&q,
5105 &unicode,
5106 &outpos,
5107 &p))
5108 goto onError;
5109 /* The remaining input chars are ignored if the callback
5110 chooses to skip the input */
5111 }
5112 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005113
5114 if (byteorder)
5115 *byteorder = bo;
5116
Walter Dörwald69652032004-09-07 20:24:22 +00005117 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005118 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005119
Guido van Rossumd57fd912000-03-10 22:53:23 +00005120 /* Adjust length */
Victor Stinnerfe226c02011-10-03 03:52:20 +02005121 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005122 goto onError;
5123
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005124 Py_XDECREF(errorHandler);
5125 Py_XDECREF(exc);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005126 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005127 Py_DECREF(unicode);
5128 return NULL;
5129 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005130 return (PyObject *)unicode;
5131
Benjamin Peterson29060642009-01-31 22:14:21 +00005132 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005133 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005134 Py_XDECREF(errorHandler);
5135 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005136 return NULL;
5137}
5138
Antoine Pitrouab868312009-01-10 15:40:25 +00005139#undef FAST_CHAR_MASK
5140#undef SWAPPED_FAST_CHAR_MASK
5141
Tim Peters772747b2001-08-09 22:21:55 +00005142PyObject *
5143PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005144 Py_ssize_t size,
5145 const char *errors,
5146 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005147{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005148 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00005149 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005150 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005151#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005152 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005153#else
5154 const int pairs = 0;
5155#endif
Tim Peters772747b2001-08-09 22:21:55 +00005156 /* Offsets from p for storing byte pairs in the right order. */
5157#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5158 int ihi = 1, ilo = 0;
5159#else
5160 int ihi = 0, ilo = 1;
5161#endif
5162
Benjamin Peterson29060642009-01-31 22:14:21 +00005163#define STORECHAR(CH) \
5164 do { \
5165 p[ihi] = ((CH) >> 8) & 0xff; \
5166 p[ilo] = (CH) & 0xff; \
5167 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00005168 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005169
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005170#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005171 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00005172 if (s[i] >= 0x10000)
5173 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005174#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005175 /* 2 * (size + pairs + (byteorder == 0)) */
5176 if (size > PY_SSIZE_T_MAX ||
5177 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005178 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005179 nsize = size + pairs + (byteorder == 0);
5180 bytesize = nsize * 2;
5181 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005182 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005183 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005184 if (v == NULL)
5185 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005186
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005187 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005188 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005189 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005190 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005191 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005192
5193 if (byteorder == -1) {
5194 /* force LE */
5195 ihi = 1;
5196 ilo = 0;
5197 }
5198 else if (byteorder == 1) {
5199 /* force BE */
5200 ihi = 0;
5201 ilo = 1;
5202 }
5203
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005204 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005205 Py_UNICODE ch = *s++;
5206 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005207#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005208 if (ch >= 0x10000) {
5209 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
5210 ch = 0xD800 | ((ch-0x10000) >> 10);
5211 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005212#endif
Tim Peters772747b2001-08-09 22:21:55 +00005213 STORECHAR(ch);
5214 if (ch2)
5215 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005216 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005217
5218 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005219 return v;
Tim Peters772747b2001-08-09 22:21:55 +00005220#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005221}
5222
Alexander Belopolsky40018472011-02-26 01:02:56 +00005223PyObject *
5224PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005225{
5226 if (!PyUnicode_Check(unicode)) {
5227 PyErr_BadArgument();
5228 return NULL;
5229 }
5230 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005231 PyUnicode_GET_SIZE(unicode),
5232 NULL,
5233 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005234}
5235
5236/* --- Unicode Escape Codec ----------------------------------------------- */
5237
/* Helper function for PyUnicode_DecodeUnicodeEscape: determines whether
   all the escapes in the string still leave it a valid ASCII string.

   s    -- the escaped input bytes (only the first `size` bytes are read)
   size -- number of bytes in s

   Returns the exact number of characters needed to hold the decoded
   string when every byte is ASCII and only escapes that produce ASCII
   are used.  Returns -1 when:
     - size is negative,
     - a byte above 127 is found,
     - a backslash is the last byte of the input,
     - an octal, \x, \u, \U or \N escape is found (these do not
       guarantee an ASCII result).
 */
Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *p;
    const unsigned char *end;
    Py_ssize_t length = 0;

    /* Reject a negative size before it is used in pointer arithmetic. */
    if (size < 0)
        return -1;

    p = (const unsigned char *)s;
    end = p + size;

    for (; p < end; ++p) {
        if (*p > 127) {
            /* Non-ASCII */
            return -1;
        }
        else if (*p != '\\') {
            /* Normal character */
            ++length;
        }
        else {
            /* Backslash-escape, check next char */
            ++p;
            /* Escape sequence reaches till end of string or
               non-ASCII follow-up. */
            if (p >= end || *p > 127)
                return -1;
            switch (*p) {
            case '\n':
                /* backslash + \n result in zero characters */
                break;
            case '\\': case '\'': case '\"':
            case 'b': case 'f': case 't':
            case 'n': case 'r': case 'v': case 'a':
                /* two input bytes collapse into one character */
                ++length;
                break;
            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7':
            case 'x': case 'u': case 'U': case 'N':
                /* these do not guarantee ASCII characters */
                return -1;
            default:
                /* unknown escape: count the backslash + the other
                   character */
                length += 2;
                break;
            }
        }
    }
    return length;
}
5292
/* Similar to PyUnicode_WRITE but either write into wstr field
   or treat string as ASCII.

   kind  -- PyUnicode_WCHAR_KIND selects a Py_UNICODE store; any other
            value selects an unsigned char store
   buf   -- start of the output buffer
   index -- element index to write (expanded in both branches, but only
            one branch executes, so an argument such as i++ is applied
            once)
   value -- value to store, cast to the element type */
#define WRITE_ASCII_OR_WSTR(kind, buf, index, value) \
    do { \
        if ((kind) != PyUnicode_WCHAR_KIND) \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
        else \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \
    } while (0)

/* Store `value` as a Py_UNICODE at buf[index].

   NOTE: the assertion reads a variable named `kind` from the scope in
   which the macro is expanded; it is not a macro argument.  The whole
   expansion is parenthesized so the comma expression keeps its meaning
   when the macro is embedded in a larger expression. */
#define WRITE_WSTR(buf, index, value) \
    (assert(kind == PyUnicode_WCHAR_KIND), \
     ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value))
5306
5307
Fredrik Lundh06d12682001-01-24 07:59:11 +00005308static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005309
Alexander Belopolsky40018472011-02-26 01:02:56 +00005310PyObject *
5311PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005312 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005313 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005314{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005315 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005316 Py_ssize_t startinpos;
5317 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005318 int j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005319 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005320 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005321 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005322 char* message;
5323 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005324 PyObject *errorHandler = NULL;
5325 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005326 Py_ssize_t ascii_length;
5327 Py_ssize_t i;
5328 int kind;
5329 void *data;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005330
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005331 ascii_length = length_of_escaped_ascii_string(s, size);
5332
5333 /* After length_of_escaped_ascii_string() there are two alternatives,
5334 either the string is pure ASCII with named escapes like \n, etc.
5335 and we determined it's exact size (common case)
5336 or it contains \x, \u, ... escape sequences. then we create a
5337 legacy wchar string and resize it at the end of this function. */
5338 if (ascii_length >= 0) {
5339 v = (PyUnicodeObject *)PyUnicode_New(ascii_length, 127);
5340 if (!v)
5341 goto onError;
5342 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
5343 kind = PyUnicode_1BYTE_KIND;
5344 data = PyUnicode_DATA(v);
5345 }
5346 else {
5347 /* Escaped strings will always be longer than the resulting
5348 Unicode string, so we start with size here and then reduce the
5349 length after conversion to the true value.
5350 (but if the error callback returns a long replacement string
5351 we'll have to allocate more space) */
5352 v = _PyUnicode_New(size);
5353 if (!v)
5354 goto onError;
5355 kind = PyUnicode_WCHAR_KIND;
5356 data = PyUnicode_AS_UNICODE(v);
5357 }
5358
Guido van Rossumd57fd912000-03-10 22:53:23 +00005359 if (size == 0)
5360 return (PyObject *)v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005361 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005362 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005363
Guido van Rossumd57fd912000-03-10 22:53:23 +00005364 while (s < end) {
5365 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005366 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005367 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005368
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005369 if (kind == PyUnicode_WCHAR_KIND) {
5370 assert(i < _PyUnicode_WSTR_LENGTH(v));
5371 }
5372 else {
5373 /* The only case in which i == ascii_length is a backslash
5374 followed by a newline. */
5375 assert(i <= ascii_length);
5376 }
5377
Guido van Rossumd57fd912000-03-10 22:53:23 +00005378 /* Non-escape characters are interpreted as Unicode ordinals */
5379 if (*s != '\\') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005380 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char) *s++);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005381 continue;
5382 }
5383
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005384 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005385 /* \ - Escapes */
5386 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005387 c = *s++;
5388 if (s > end)
5389 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005390
5391 if (kind == PyUnicode_WCHAR_KIND) {
5392 assert(i < _PyUnicode_WSTR_LENGTH(v));
5393 }
5394 else {
5395 /* The only case in which i == ascii_length is a backslash
5396 followed by a newline. */
5397 assert(i < ascii_length || (i == ascii_length && c == '\n'));
5398 }
5399
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005400 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005401
Benjamin Peterson29060642009-01-31 22:14:21 +00005402 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005403 case '\n': break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005404 case '\\': WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); break;
5405 case '\'': WRITE_ASCII_OR_WSTR(kind, data, i++, '\''); break;
5406 case '\"': WRITE_ASCII_OR_WSTR(kind, data, i++, '\"'); break;
5407 case 'b': WRITE_ASCII_OR_WSTR(kind, data, i++, '\b'); break;
5408 /* FF */
5409 case 'f': WRITE_ASCII_OR_WSTR(kind, data, i++, '\014'); break;
5410 case 't': WRITE_ASCII_OR_WSTR(kind, data, i++, '\t'); break;
5411 case 'n': WRITE_ASCII_OR_WSTR(kind, data, i++, '\n'); break;
5412 case 'r': WRITE_ASCII_OR_WSTR(kind, data, i++, '\r'); break;
5413 /* VT */
5414 case 'v': WRITE_ASCII_OR_WSTR(kind, data, i++, '\013'); break;
5415 /* BEL, not classic C */
5416 case 'a': WRITE_ASCII_OR_WSTR(kind, data, i++, '\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005417
Benjamin Peterson29060642009-01-31 22:14:21 +00005418 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005419 case '0': case '1': case '2': case '3':
5420 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005421 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005422 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005423 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005424 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005425 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005426 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005427 WRITE_WSTR(data, i++, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005428 break;
5429
Benjamin Peterson29060642009-01-31 22:14:21 +00005430 /* hex escapes */
5431 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005432 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005433 digits = 2;
5434 message = "truncated \\xXX escape";
5435 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005436
Benjamin Peterson29060642009-01-31 22:14:21 +00005437 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005438 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005439 digits = 4;
5440 message = "truncated \\uXXXX escape";
5441 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005442
Benjamin Peterson29060642009-01-31 22:14:21 +00005443 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005444 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005445 digits = 8;
5446 message = "truncated \\UXXXXXXXX escape";
5447 hexescape:
5448 chr = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005449 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005450 if (s+digits>end) {
5451 endinpos = size;
5452 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005453 errors, &errorHandler,
5454 "unicodeescape", "end of string in escape sequence",
5455 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005456 &v, &i, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005457 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005458 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005459 goto nextByte;
5460 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005461 for (j = 0; j < digits; ++j) {
5462 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005463 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005464 endinpos = (s+j+1)-starts;
5465 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005466 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005467 errors, &errorHandler,
5468 "unicodeescape", message,
5469 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005470 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005471 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005472 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005473 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005474 }
5475 chr = (chr<<4) & ~0xF;
5476 if (c >= '0' && c <= '9')
5477 chr += c - '0';
5478 else if (c >= 'a' && c <= 'f')
5479 chr += 10 + c - 'a';
5480 else
5481 chr += 10 + c - 'A';
5482 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005483 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005484 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005485 /* _decoding_error will have already written into the
5486 target buffer. */
5487 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005488 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005489 /* when we get here, chr is a 32-bit unicode character */
5490 if (chr <= 0xffff)
5491 /* UCS-2 character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005492 WRITE_WSTR(data, i++, chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005493 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005494 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00005495 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00005496#ifdef Py_UNICODE_WIDE
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005497 WRITE_WSTR(data, i++, chr);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005498#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00005499 chr -= 0x10000L;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005500 WRITE_WSTR(data, i++, 0xD800 + (Py_UNICODE) (chr >> 10));
5501 WRITE_WSTR(data, i++, 0xDC00 + (Py_UNICODE) (chr & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005502#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00005503 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005504 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005505 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005506 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005507 errors, &errorHandler,
5508 "unicodeescape", "illegal Unicode character",
5509 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005510 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005511 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005512 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005513 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005514 break;
5515
Benjamin Peterson29060642009-01-31 22:14:21 +00005516 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005517 case 'N':
5518 message = "malformed \\N character escape";
5519 if (ucnhash_CAPI == NULL) {
5520 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005521 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5522 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005523 if (ucnhash_CAPI == NULL)
5524 goto ucnhashError;
5525 }
5526 if (*s == '{') {
5527 const char *start = s+1;
5528 /* look for the closing brace */
5529 while (*s != '}' && s < end)
5530 s++;
5531 if (s > start && s < end && *s == '}') {
5532 /* found a name. look it up in the unicode database */
5533 message = "unknown Unicode character name";
5534 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005535 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
5536 &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005537 goto store;
5538 }
5539 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005540 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005541 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005542 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005543 errors, &errorHandler,
5544 "unicodeescape", message,
5545 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005546 &v, &i, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005547 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005548 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005549 break;
5550
5551 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005552 if (s > end) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005553 assert(kind == PyUnicode_WCHAR_KIND);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005554 message = "\\ at end of string";
5555 s--;
5556 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005557 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005558 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005559 errors, &errorHandler,
5560 "unicodeescape", message,
5561 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005562 &v, &i, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00005563 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005564 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005565 }
5566 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005567 WRITE_ASCII_OR_WSTR(kind, data, i++, '\\');
5568 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005569 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005570 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005571 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005572 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005573 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005574 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005575 /* Ensure the length prediction worked in case of ASCII strings */
5576 assert(kind == PyUnicode_WCHAR_KIND || i == ascii_length);
5577
Victor Stinnerfe226c02011-10-03 03:52:20 +02005578 if (kind == PyUnicode_WCHAR_KIND)
5579 {
5580 if (PyUnicode_Resize((PyObject**)&v, i) < 0)
5581 goto onError;
Victor Stinnerfe226c02011-10-03 03:52:20 +02005582 }
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005583 Py_XDECREF(errorHandler);
5584 Py_XDECREF(exc);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005585 if (_PyUnicode_READY_REPLACE(&v)) {
5586 Py_DECREF(v);
5587 return NULL;
5588 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005589 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00005590
Benjamin Peterson29060642009-01-31 22:14:21 +00005591 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005592 PyErr_SetString(
5593 PyExc_UnicodeError,
5594 "\\N escapes not supported (can't load unicodedata module)"
5595 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005596 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005597 Py_XDECREF(errorHandler);
5598 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005599 return NULL;
5600
Benjamin Peterson29060642009-01-31 22:14:21 +00005601 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005602 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005603 Py_XDECREF(errorHandler);
5604 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005605 return NULL;
5606}
5607
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005608#undef WRITE_ASCII_OR_WSTR
5609#undef WRITE_WSTR
5610
Guido van Rossumd57fd912000-03-10 22:53:23 +00005611/* Return a Unicode-Escape string version of the Unicode object.
5612
5613 If quotes is true, the string is enclosed in u"" or u'' quotes as
5614 appropriate.
5615
5616*/
5617
Walter Dörwald79e913e2007-05-12 11:08:06 +00005618static const char *hexdigits = "0123456789abcdef";
5619
Alexander Belopolsky40018472011-02-26 01:02:56 +00005620PyObject *
5621PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005622 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005623{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005624 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005625 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005626
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005627#ifdef Py_UNICODE_WIDE
5628 const Py_ssize_t expandsize = 10;
5629#else
5630 const Py_ssize_t expandsize = 6;
5631#endif
5632
Thomas Wouters89f507f2006-12-13 04:49:30 +00005633 /* XXX(nnorwitz): rather than over-allocating, it would be
5634 better to choose a different scheme. Perhaps scan the
5635 first N-chars of the string and allocate based on that size.
5636 */
5637 /* Initial allocation is based on the longest-possible unichr
5638 escape.
5639
5640 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5641 unichr, so in this case it's the longest unichr escape. In
5642 narrow (UTF-16) builds this is five chars per source unichr
5643 since there are two unichrs in the surrogate pair, so in narrow
5644 (UTF-16) builds it's not the longest unichr escape.
5645
5646 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5647 so in the narrow (UTF-16) build case it's the longest unichr
5648 escape.
5649 */
5650
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005651 if (size == 0)
5652 return PyBytes_FromStringAndSize(NULL, 0);
5653
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005654 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005655 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005656
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005657 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005658 2
5659 + expandsize*size
5660 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005661 if (repr == NULL)
5662 return NULL;
5663
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005664 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005665
Guido van Rossumd57fd912000-03-10 22:53:23 +00005666 while (size-- > 0) {
5667 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005668
Walter Dörwald79e913e2007-05-12 11:08:06 +00005669 /* Escape backslashes */
5670 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005671 *p++ = '\\';
5672 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005673 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005674 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005675
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00005676#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005677 /* Map 21-bit characters to '\U00xxxxxx' */
5678 else if (ch >= 0x10000) {
5679 *p++ = '\\';
5680 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005681 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
5682 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
5683 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
5684 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
5685 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
5686 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
5687 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
5688 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005689 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005690 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005691#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005692 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5693 else if (ch >= 0xD800 && ch < 0xDC00) {
5694 Py_UNICODE ch2;
5695 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00005696
Benjamin Peterson29060642009-01-31 22:14:21 +00005697 ch2 = *s++;
5698 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005699 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005700 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5701 *p++ = '\\';
5702 *p++ = 'U';
5703 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
5704 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
5705 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
5706 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
5707 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
5708 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
5709 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
5710 *p++ = hexdigits[ucs & 0x0000000F];
5711 continue;
5712 }
5713 /* Fall through: isolated surrogates are copied as-is */
5714 s--;
5715 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005716 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005717#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005718
Guido van Rossumd57fd912000-03-10 22:53:23 +00005719 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005720 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005721 *p++ = '\\';
5722 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005723 *p++ = hexdigits[(ch >> 12) & 0x000F];
5724 *p++ = hexdigits[(ch >> 8) & 0x000F];
5725 *p++ = hexdigits[(ch >> 4) & 0x000F];
5726 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005727 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005728
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005729 /* Map special whitespace to '\t', \n', '\r' */
5730 else if (ch == '\t') {
5731 *p++ = '\\';
5732 *p++ = 't';
5733 }
5734 else if (ch == '\n') {
5735 *p++ = '\\';
5736 *p++ = 'n';
5737 }
5738 else if (ch == '\r') {
5739 *p++ = '\\';
5740 *p++ = 'r';
5741 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005742
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005743 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005744 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005745 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005746 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005747 *p++ = hexdigits[(ch >> 4) & 0x000F];
5748 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005749 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005750
Guido van Rossumd57fd912000-03-10 22:53:23 +00005751 /* Copy everything else as-is */
5752 else
5753 *p++ = (char) ch;
5754 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005755
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005756 assert(p - PyBytes_AS_STRING(repr) > 0);
5757 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5758 return NULL;
5759 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005760}
5761
Alexander Belopolsky40018472011-02-26 01:02:56 +00005762PyObject *
5763PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005764{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005765 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005766 if (!PyUnicode_Check(unicode)) {
5767 PyErr_BadArgument();
5768 return NULL;
5769 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00005770 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5771 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005772 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005773}
5774
5775/* --- Raw Unicode Escape Codec ------------------------------------------- */
5776
Alexander Belopolsky40018472011-02-26 01:02:56 +00005777PyObject *
5778PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005779 Py_ssize_t size,
5780 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005781{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005782 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005783 Py_ssize_t startinpos;
5784 Py_ssize_t endinpos;
5785 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005786 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005787 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005788 const char *end;
5789 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005790 PyObject *errorHandler = NULL;
5791 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005792
Guido van Rossumd57fd912000-03-10 22:53:23 +00005793 /* Escaped strings will always be longer than the resulting
5794 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005795 length after conversion to the true value. (But decoding error
5796 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005797 v = _PyUnicode_New(size);
5798 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005799 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005800 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005801 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005802 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005803 end = s + size;
5804 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005805 unsigned char c;
5806 Py_UCS4 x;
5807 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005808 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005809
Benjamin Peterson29060642009-01-31 22:14:21 +00005810 /* Non-escape characters are interpreted as Unicode ordinals */
5811 if (*s != '\\') {
5812 *p++ = (unsigned char)*s++;
5813 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005814 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005815 startinpos = s-starts;
5816
5817 /* \u-escapes are only interpreted iff the number of leading
5818 backslashes if odd */
5819 bs = s;
5820 for (;s < end;) {
5821 if (*s != '\\')
5822 break;
5823 *p++ = (unsigned char)*s++;
5824 }
5825 if (((s - bs) & 1) == 0 ||
5826 s >= end ||
5827 (*s != 'u' && *s != 'U')) {
5828 continue;
5829 }
5830 p--;
5831 count = *s=='u' ? 4 : 8;
5832 s++;
5833
5834 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
5835 outpos = p-PyUnicode_AS_UNICODE(v);
5836 for (x = 0, i = 0; i < count; ++i, ++s) {
5837 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00005838 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005839 endinpos = s-starts;
5840 if (unicode_decode_call_errorhandler(
5841 errors, &errorHandler,
5842 "rawunicodeescape", "truncated \\uXXXX",
5843 &starts, &end, &startinpos, &endinpos, &exc, &s,
5844 &v, &outpos, &p))
5845 goto onError;
5846 goto nextByte;
5847 }
5848 x = (x<<4) & ~0xF;
5849 if (c >= '0' && c <= '9')
5850 x += c - '0';
5851 else if (c >= 'a' && c <= 'f')
5852 x += 10 + c - 'a';
5853 else
5854 x += 10 + c - 'A';
5855 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00005856 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00005857 /* UCS-2 character */
5858 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005859 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005860 /* UCS-4 character. Either store directly, or as
5861 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00005862#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005863 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005864#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005865 x -= 0x10000L;
5866 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
5867 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00005868#endif
5869 } else {
5870 endinpos = s-starts;
5871 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005872 if (unicode_decode_call_errorhandler(
5873 errors, &errorHandler,
5874 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00005875 &starts, &end, &startinpos, &endinpos, &exc, &s,
5876 &v, &outpos, &p))
5877 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005878 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005879 nextByte:
5880 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005881 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02005882 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005883 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005884 Py_XDECREF(errorHandler);
5885 Py_XDECREF(exc);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005886 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005887 Py_DECREF(v);
5888 return NULL;
5889 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005890 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00005891
Benjamin Peterson29060642009-01-31 22:14:21 +00005892 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005893 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005894 Py_XDECREF(errorHandler);
5895 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005896 return NULL;
5897}
5898
Alexander Belopolsky40018472011-02-26 01:02:56 +00005899PyObject *
5900PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005901 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005902{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005903 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005904 char *p;
5905 char *q;
5906
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005907#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005908 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005909#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005910 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005911#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00005912
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005913 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005914 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00005915
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005916 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005917 if (repr == NULL)
5918 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005919 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005920 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005921
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005922 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005923 while (size-- > 0) {
5924 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005925#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005926 /* Map 32-bit characters to '\Uxxxxxxxx' */
5927 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005928 *p++ = '\\';
5929 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00005930 *p++ = hexdigits[(ch >> 28) & 0xf];
5931 *p++ = hexdigits[(ch >> 24) & 0xf];
5932 *p++ = hexdigits[(ch >> 20) & 0xf];
5933 *p++ = hexdigits[(ch >> 16) & 0xf];
5934 *p++ = hexdigits[(ch >> 12) & 0xf];
5935 *p++ = hexdigits[(ch >> 8) & 0xf];
5936 *p++ = hexdigits[(ch >> 4) & 0xf];
5937 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00005938 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005939 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00005940#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005941 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5942 if (ch >= 0xD800 && ch < 0xDC00) {
5943 Py_UNICODE ch2;
5944 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005945
Benjamin Peterson29060642009-01-31 22:14:21 +00005946 ch2 = *s++;
5947 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005948 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005949 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5950 *p++ = '\\';
5951 *p++ = 'U';
5952 *p++ = hexdigits[(ucs >> 28) & 0xf];
5953 *p++ = hexdigits[(ucs >> 24) & 0xf];
5954 *p++ = hexdigits[(ucs >> 20) & 0xf];
5955 *p++ = hexdigits[(ucs >> 16) & 0xf];
5956 *p++ = hexdigits[(ucs >> 12) & 0xf];
5957 *p++ = hexdigits[(ucs >> 8) & 0xf];
5958 *p++ = hexdigits[(ucs >> 4) & 0xf];
5959 *p++ = hexdigits[ucs & 0xf];
5960 continue;
5961 }
5962 /* Fall through: isolated surrogates are copied as-is */
5963 s--;
5964 size++;
5965 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005966#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005967 /* Map 16-bit characters to '\uxxxx' */
5968 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005969 *p++ = '\\';
5970 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00005971 *p++ = hexdigits[(ch >> 12) & 0xf];
5972 *p++ = hexdigits[(ch >> 8) & 0xf];
5973 *p++ = hexdigits[(ch >> 4) & 0xf];
5974 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005975 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005976 /* Copy everything else as-is */
5977 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00005978 *p++ = (char) ch;
5979 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005980 size = p - q;
5981
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005982 assert(size > 0);
5983 if (_PyBytes_Resize(&repr, size) < 0)
5984 return NULL;
5985 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005986}
5987
Alexander Belopolsky40018472011-02-26 01:02:56 +00005988PyObject *
5989PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005990{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005991 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005992 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00005993 PyErr_BadArgument();
5994 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005995 }
Walter Dörwald711005d2007-05-12 12:03:26 +00005996 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5997 PyUnicode_GET_SIZE(unicode));
5998
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005999 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006000}
6001
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006002/* --- Unicode Internal Codec ------------------------------------------- */
6003
Alexander Belopolsky40018472011-02-26 01:02:56 +00006004PyObject *
6005_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006006 Py_ssize_t size,
6007 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006008{
6009 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006010 Py_ssize_t startinpos;
6011 Py_ssize_t endinpos;
6012 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006013 PyUnicodeObject *v;
6014 Py_UNICODE *p;
6015 const char *end;
6016 const char *reason;
6017 PyObject *errorHandler = NULL;
6018 PyObject *exc = NULL;
6019
Neal Norwitzd43069c2006-01-08 01:12:10 +00006020#ifdef Py_UNICODE_WIDE
6021 Py_UNICODE unimax = PyUnicode_GetMax();
6022#endif
6023
Thomas Wouters89f507f2006-12-13 04:49:30 +00006024 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006025 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
6026 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006027 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006028 /* Intentionally PyUnicode_GET_SIZE instead of PyUnicode_GET_LENGTH
6029 as string was created with the old API. */
6030 if (PyUnicode_GET_SIZE(v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006031 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006032 p = PyUnicode_AS_UNICODE(v);
6033 end = s + size;
6034
6035 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006036 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006037 /* We have to sanity check the raw data, otherwise doom looms for
6038 some malformed UCS-4 data. */
6039 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006040#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006041 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006042#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006043 end-s < Py_UNICODE_SIZE
6044 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006045 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006046 startinpos = s - starts;
6047 if (end-s < Py_UNICODE_SIZE) {
6048 endinpos = end-starts;
6049 reason = "truncated input";
6050 }
6051 else {
6052 endinpos = s - starts + Py_UNICODE_SIZE;
6053 reason = "illegal code point (> 0x10FFFF)";
6054 }
6055 outpos = p - PyUnicode_AS_UNICODE(v);
6056 if (unicode_decode_call_errorhandler(
6057 errors, &errorHandler,
6058 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006059 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00006060 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006061 goto onError;
6062 }
6063 }
6064 else {
6065 p++;
6066 s += Py_UNICODE_SIZE;
6067 }
6068 }
6069
Victor Stinnerfe226c02011-10-03 03:52:20 +02006070 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006071 goto onError;
6072 Py_XDECREF(errorHandler);
6073 Py_XDECREF(exc);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006074 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006075 Py_DECREF(v);
6076 return NULL;
6077 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006078 return (PyObject *)v;
6079
Benjamin Peterson29060642009-01-31 22:14:21 +00006080 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006081 Py_XDECREF(v);
6082 Py_XDECREF(errorHandler);
6083 Py_XDECREF(exc);
6084 return NULL;
6085}
6086
Guido van Rossumd57fd912000-03-10 22:53:23 +00006087/* --- Latin-1 Codec ------------------------------------------------------ */
6088
Alexander Belopolsky40018472011-02-26 01:02:56 +00006089PyObject *
6090PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006091 Py_ssize_t size,
6092 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006093{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006094 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006095 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006096}
6097
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006098/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006099static void
6100make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006101 const char *encoding,
6102 const Py_UNICODE *unicode, Py_ssize_t size,
6103 Py_ssize_t startpos, Py_ssize_t endpos,
6104 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006105{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006106 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006107 *exceptionObject = PyUnicodeEncodeError_Create(
6108 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006109 }
6110 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006111 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6112 goto onError;
6113 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6114 goto onError;
6115 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6116 goto onError;
6117 return;
6118 onError:
6119 Py_DECREF(*exceptionObject);
6120 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006121 }
6122}
6123
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006124/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006125static void
6126raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006127 const char *encoding,
6128 const Py_UNICODE *unicode, Py_ssize_t size,
6129 Py_ssize_t startpos, Py_ssize_t endpos,
6130 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006131{
6132 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006133 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006134 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006135 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006136}
6137
6138/* error handling callback helper:
6139 build arguments, call the callback and check the arguments,
6140 put the result into newpos and return the replacement string, which
6141 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006142static PyObject *
6143unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006144 PyObject **errorHandler,
6145 const char *encoding, const char *reason,
6146 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
6147 Py_ssize_t startpos, Py_ssize_t endpos,
6148 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006149{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006150 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006151
6152 PyObject *restuple;
6153 PyObject *resunicode;
6154
6155 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006156 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006157 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006158 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006159 }
6160
6161 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006162 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006163 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006164 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006165
6166 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006167 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006168 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006169 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006170 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006171 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006172 Py_DECREF(restuple);
6173 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006174 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006175 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006176 &resunicode, newpos)) {
6177 Py_DECREF(restuple);
6178 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006179 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006180 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6181 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6182 Py_DECREF(restuple);
6183 return NULL;
6184 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006185 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006186 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006187 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006188 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6189 Py_DECREF(restuple);
6190 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006191 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006192 Py_INCREF(resunicode);
6193 Py_DECREF(restuple);
6194 return resunicode;
6195}
6196
Alexander Belopolsky40018472011-02-26 01:02:56 +00006197static PyObject *
6198unicode_encode_ucs1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006199 Py_ssize_t size,
6200 const char *errors,
6201 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006202{
6203 /* output object */
6204 PyObject *res;
6205 /* pointers to the beginning and end+1 of input */
6206 const Py_UNICODE *startp = p;
6207 const Py_UNICODE *endp = p + size;
6208 /* pointer to the beginning of the unencodable characters */
6209 /* const Py_UNICODE *badp = NULL; */
6210 /* pointer into the output */
6211 char *str;
6212 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006213 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006214 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6215 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006216 PyObject *errorHandler = NULL;
6217 PyObject *exc = NULL;
6218 /* the following variable is used for caching string comparisons
6219 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6220 int known_errorHandler = -1;
6221
6222 /* allocate enough for a simple encoding without
6223 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006224 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006225 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006226 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006227 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006228 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006229 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006230 ressize = size;
6231
6232 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006233 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006234
Benjamin Peterson29060642009-01-31 22:14:21 +00006235 /* can we encode this? */
6236 if (c<limit) {
6237 /* no overflow check, because we know that the space is enough */
6238 *str++ = (char)c;
6239 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006240 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006241 else {
6242 Py_ssize_t unicodepos = p-startp;
6243 Py_ssize_t requiredsize;
6244 PyObject *repunicode;
6245 Py_ssize_t repsize;
6246 Py_ssize_t newpos;
6247 Py_ssize_t respos;
6248 Py_UNICODE *uni2;
6249 /* startpos for collecting unencodable chars */
6250 const Py_UNICODE *collstart = p;
6251 const Py_UNICODE *collend = p;
6252 /* find all unecodable characters */
6253 while ((collend < endp) && ((*collend)>=limit))
6254 ++collend;
6255 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6256 if (known_errorHandler==-1) {
6257 if ((errors==NULL) || (!strcmp(errors, "strict")))
6258 known_errorHandler = 1;
6259 else if (!strcmp(errors, "replace"))
6260 known_errorHandler = 2;
6261 else if (!strcmp(errors, "ignore"))
6262 known_errorHandler = 3;
6263 else if (!strcmp(errors, "xmlcharrefreplace"))
6264 known_errorHandler = 4;
6265 else
6266 known_errorHandler = 0;
6267 }
6268 switch (known_errorHandler) {
6269 case 1: /* strict */
6270 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
6271 goto onError;
6272 case 2: /* replace */
6273 while (collstart++<collend)
6274 *str++ = '?'; /* fall through */
6275 case 3: /* ignore */
6276 p = collend;
6277 break;
6278 case 4: /* xmlcharrefreplace */
6279 respos = str - PyBytes_AS_STRING(res);
6280 /* determine replacement size (temporarily (mis)uses p) */
6281 for (p = collstart, repsize = 0; p < collend; ++p) {
6282 if (*p<10)
6283 repsize += 2+1+1;
6284 else if (*p<100)
6285 repsize += 2+2+1;
6286 else if (*p<1000)
6287 repsize += 2+3+1;
6288 else if (*p<10000)
6289 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006290#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006291 else
6292 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006293#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006294 else if (*p<100000)
6295 repsize += 2+5+1;
6296 else if (*p<1000000)
6297 repsize += 2+6+1;
6298 else
6299 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00006300#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00006301 }
6302 requiredsize = respos+repsize+(endp-collend);
6303 if (requiredsize > ressize) {
6304 if (requiredsize<2*ressize)
6305 requiredsize = 2*ressize;
6306 if (_PyBytes_Resize(&res, requiredsize))
6307 goto onError;
6308 str = PyBytes_AS_STRING(res) + respos;
6309 ressize = requiredsize;
6310 }
6311 /* generate replacement (temporarily (mis)uses p) */
6312 for (p = collstart; p < collend; ++p) {
6313 str += sprintf(str, "&#%d;", (int)*p);
6314 }
6315 p = collend;
6316 break;
6317 default:
6318 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6319 encoding, reason, startp, size, &exc,
6320 collstart-startp, collend-startp, &newpos);
6321 if (repunicode == NULL)
6322 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006323 if (PyBytes_Check(repunicode)) {
6324 /* Directly copy bytes result to output. */
6325 repsize = PyBytes_Size(repunicode);
6326 if (repsize > 1) {
6327 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006328 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006329 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6330 Py_DECREF(repunicode);
6331 goto onError;
6332 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006333 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006334 ressize += repsize-1;
6335 }
6336 memcpy(str, PyBytes_AsString(repunicode), repsize);
6337 str += repsize;
6338 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006339 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006340 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006341 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006342 /* need more space? (at least enough for what we
6343 have+the replacement+the rest of the string, so
6344 we won't have to check space for encodable characters) */
6345 respos = str - PyBytes_AS_STRING(res);
6346 repsize = PyUnicode_GET_SIZE(repunicode);
6347 requiredsize = respos+repsize+(endp-collend);
6348 if (requiredsize > ressize) {
6349 if (requiredsize<2*ressize)
6350 requiredsize = 2*ressize;
6351 if (_PyBytes_Resize(&res, requiredsize)) {
6352 Py_DECREF(repunicode);
6353 goto onError;
6354 }
6355 str = PyBytes_AS_STRING(res) + respos;
6356 ressize = requiredsize;
6357 }
6358 /* check if there is anything unencodable in the replacement
6359 and copy it to the output */
6360 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
6361 c = *uni2;
6362 if (c >= limit) {
6363 raise_encode_exception(&exc, encoding, startp, size,
6364 unicodepos, unicodepos+1, reason);
6365 Py_DECREF(repunicode);
6366 goto onError;
6367 }
6368 *str = (char)c;
6369 }
6370 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006371 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006372 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006373 }
6374 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006375 /* Resize if we allocated to much */
6376 size = str - PyBytes_AS_STRING(res);
6377 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006378 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006379 if (_PyBytes_Resize(&res, size) < 0)
6380 goto onError;
6381 }
6382
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006383 Py_XDECREF(errorHandler);
6384 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006385 return res;
6386
6387 onError:
6388 Py_XDECREF(res);
6389 Py_XDECREF(errorHandler);
6390 Py_XDECREF(exc);
6391 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006392}
6393
Alexander Belopolsky40018472011-02-26 01:02:56 +00006394PyObject *
6395PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006396 Py_ssize_t size,
6397 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006398{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006399 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006400}
6401
Alexander Belopolsky40018472011-02-26 01:02:56 +00006402PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006403_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006404{
6405 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006406 PyErr_BadArgument();
6407 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006408 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006409 if (PyUnicode_READY(unicode) == -1)
6410 return NULL;
6411 /* Fast path: if it is a one-byte string, construct
6412 bytes object directly. */
6413 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6414 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6415 PyUnicode_GET_LENGTH(unicode));
6416 /* Non-Latin-1 characters present. Defer to above function to
6417 raise the exception. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006418 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006419 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006420 errors);
6421}
6422
6423PyObject*
6424PyUnicode_AsLatin1String(PyObject *unicode)
6425{
6426 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006427}
6428
6429/* --- 7-bit ASCII Codec -------------------------------------------------- */
6430
Alexander Belopolsky40018472011-02-26 01:02:56 +00006431PyObject *
6432PyUnicode_DecodeASCII(const char *s,
6433 Py_ssize_t size,
6434 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006435{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006436 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006437 PyUnicodeObject *v;
6438 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006439 Py_ssize_t startinpos;
6440 Py_ssize_t endinpos;
6441 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006442 const char *e;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006443 unsigned char* d;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006444 PyObject *errorHandler = NULL;
6445 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006446 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00006447
Guido van Rossumd57fd912000-03-10 22:53:23 +00006448 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006449 if (size == 1 && *(unsigned char*)s < 128)
6450 return PyUnicode_FromOrdinal(*(unsigned char*)s);
6451
6452 /* Fast path. Assume the input actually *is* ASCII, and allocate
6453 a single-block Unicode object with that assumption. If there is
6454 an error, drop the object and start over. */
6455 v = (PyUnicodeObject*)PyUnicode_New(size, 127);
6456 if (v == NULL)
6457 goto onError;
6458 d = PyUnicode_1BYTE_DATA(v);
6459 for (i = 0; i < size; i++) {
6460 unsigned char ch = ((unsigned char*)s)[i];
6461 if (ch < 128)
6462 d[i] = ch;
6463 else
6464 break;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006465 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006466 if (i == size)
6467 return (PyObject*)v;
6468 Py_DECREF(v); /* start over */
Tim Petersced69f82003-09-16 20:30:58 +00006469
Guido van Rossumd57fd912000-03-10 22:53:23 +00006470 v = _PyUnicode_New(size);
6471 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006472 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006473 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006474 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006475 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006476 e = s + size;
6477 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006478 register unsigned char c = (unsigned char)*s;
6479 if (c < 128) {
6480 *p++ = c;
6481 ++s;
6482 }
6483 else {
6484 startinpos = s-starts;
6485 endinpos = startinpos + 1;
6486 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
6487 if (unicode_decode_call_errorhandler(
6488 errors, &errorHandler,
6489 "ascii", "ordinal not in range(128)",
6490 &starts, &e, &startinpos, &endinpos, &exc, &s,
6491 &v, &outpos, &p))
6492 goto onError;
6493 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006494 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00006495 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Victor Stinnerfe226c02011-10-03 03:52:20 +02006496 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006497 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006498 Py_XDECREF(errorHandler);
6499 Py_XDECREF(exc);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006500 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006501 Py_DECREF(v);
6502 return NULL;
6503 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006504 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006505
Benjamin Peterson29060642009-01-31 22:14:21 +00006506 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006507 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006508 Py_XDECREF(errorHandler);
6509 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006510 return NULL;
6511}
6512
Alexander Belopolsky40018472011-02-26 01:02:56 +00006513PyObject *
6514PyUnicode_EncodeASCII(const Py_UNICODE *p,
6515 Py_ssize_t size,
6516 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006517{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006518 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006519}
6520
Alexander Belopolsky40018472011-02-26 01:02:56 +00006521PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006522_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006523{
6524 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006525 PyErr_BadArgument();
6526 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006527 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006528 if (PyUnicode_READY(unicode) == -1)
6529 return NULL;
6530 /* Fast path: if it is an ASCII-only string, construct bytes object
6531 directly. Else defer to above function to raise the exception. */
6532 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6533 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6534 PyUnicode_GET_LENGTH(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006535 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006536 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006537 errors);
6538}
6539
6540PyObject *
6541PyUnicode_AsASCIIString(PyObject *unicode)
6542{
6543 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006544}
6545
Victor Stinner99b95382011-07-04 14:23:54 +02006546#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006547
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006548/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006549
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006550#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006551#define NEED_RETRY
6552#endif
6553
6554/* XXX This code is limited to "true" double-byte encodings, as
6555 a) it assumes an incomplete character consists of a single byte, and
6556 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00006557 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006558
/* Return non-zero when the byte at s[offset] should be treated as a DBCS
   lead byte.

   The byte must be flagged by IsDBCSLeadByte().  In addition, the character
   that CharPrev() reports before it must either be the byte itself, not
   start with a lead byte, or start exactly two bytes earlier -- presumably
   to rule out a trail byte that merely has a lead-byte value (NOTE(review):
   confirm against the CharPrev documentation). */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
6570
6571/*
6572 * Decode MBCS string into unicode object. If 'final' is set, converts
6573 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
6574 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006575static int
6576decode_mbcs(PyUnicodeObject **v,
6577 const char *s, /* MBCS string */
6578 int size, /* sizeof MBCS string */
6579 int final,
6580 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006581{
6582 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00006583 Py_ssize_t n;
6584 DWORD usize;
6585 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006586
6587 assert(size >= 0);
6588
Victor Stinner554f3f02010-06-16 23:33:54 +00006589 /* check and handle 'errors' arg */
6590 if (errors==NULL || strcmp(errors, "strict")==0)
6591 flags = MB_ERR_INVALID_CHARS;
6592 else if (strcmp(errors, "ignore")==0)
6593 flags = 0;
6594 else {
6595 PyErr_Format(PyExc_ValueError,
6596 "mbcs encoding does not support errors='%s'",
6597 errors);
6598 return -1;
6599 }
6600
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006601 /* Skip trailing lead-byte unless 'final' is set */
6602 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006603 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006604
6605 /* First get the size of the result */
6606 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006607 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
6608 if (usize==0)
6609 goto mbcs_decode_error;
6610 } else
6611 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006612
6613 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006614 /* Create unicode object */
6615 *v = _PyUnicode_New(usize);
6616 if (*v == NULL)
6617 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006618 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006619 }
6620 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006621 /* Extend unicode object */
6622 n = PyUnicode_GET_SIZE(*v);
Victor Stinner2fd82272011-10-03 04:06:05 +02006623 if (PyUnicode_Resize((PyObject**)v, n + usize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006624 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006625 }
6626
6627 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00006628 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006629 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006630 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
6631 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00006632 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006633 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006634 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00006635
6636mbcs_decode_error:
6637 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
6638 we raise a UnicodeDecodeError - else it is a 'generic'
6639 windows error
6640 */
6641 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
6642 /* Ideally, we should get reason from FormatMessage - this
6643 is the Windows 2000 English version of the message
6644 */
6645 PyObject *exc = NULL;
6646 const char *reason = "No mapping for the Unicode character exists "
6647 "in the target multi-byte code page.";
6648 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
6649 if (exc != NULL) {
6650 PyCodec_StrictErrors(exc);
6651 Py_DECREF(exc);
6652 }
6653 } else {
6654 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6655 }
6656 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006657}
6658
Alexander Belopolsky40018472011-02-26 01:02:56 +00006659PyObject *
6660PyUnicode_DecodeMBCSStateful(const char *s,
6661 Py_ssize_t size,
6662 const char *errors,
6663 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006664{
6665 PyUnicodeObject *v = NULL;
6666 int done;
6667
6668 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006669 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006670
6671#ifdef NEED_RETRY
6672 retry:
6673 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006674 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006675 else
6676#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006677 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006678
6679 if (done < 0) {
6680 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006681 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006682 }
6683
6684 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006685 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006686
6687#ifdef NEED_RETRY
6688 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006689 s += done;
6690 size -= done;
6691 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006692 }
6693#endif
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006694 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006695 Py_DECREF(v);
6696 return NULL;
6697 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006698 return (PyObject *)v;
6699}
6700
Alexander Belopolsky40018472011-02-26 01:02:56 +00006701PyObject *
6702PyUnicode_DecodeMBCS(const char *s,
6703 Py_ssize_t size,
6704 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006705{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006706 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
6707}
6708
6709/*
6710 * Convert unicode into string object (MBCS).
6711 * Returns 0 if succeed, -1 otherwise.
6712 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006713static int
6714encode_mbcs(PyObject **repr,
6715 const Py_UNICODE *p, /* unicode */
6716 int size, /* size of unicode */
6717 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006718{
Victor Stinner554f3f02010-06-16 23:33:54 +00006719 BOOL usedDefaultChar = FALSE;
6720 BOOL *pusedDefaultChar;
6721 int mbcssize;
6722 Py_ssize_t n;
6723 PyObject *exc = NULL;
6724 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006725
6726 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006727
Victor Stinner554f3f02010-06-16 23:33:54 +00006728 /* check and handle 'errors' arg */
6729 if (errors==NULL || strcmp(errors, "strict")==0) {
6730 flags = WC_NO_BEST_FIT_CHARS;
6731 pusedDefaultChar = &usedDefaultChar;
6732 } else if (strcmp(errors, "replace")==0) {
6733 flags = 0;
6734 pusedDefaultChar = NULL;
6735 } else {
6736 PyErr_Format(PyExc_ValueError,
6737 "mbcs encoding does not support errors='%s'",
6738 errors);
6739 return -1;
6740 }
6741
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006742 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006743 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006744 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
6745 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00006746 if (mbcssize == 0) {
6747 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6748 return -1;
6749 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006750 /* If we used a default char, then we failed! */
6751 if (pusedDefaultChar && *pusedDefaultChar)
6752 goto mbcs_encode_error;
6753 } else {
6754 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006755 }
6756
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006757 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006758 /* Create string object */
6759 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
6760 if (*repr == NULL)
6761 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006762 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006763 }
6764 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006765 /* Extend string object */
6766 n = PyBytes_Size(*repr);
6767 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
6768 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006769 }
6770
6771 /* Do the conversion */
6772 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006773 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006774 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
6775 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006776 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6777 return -1;
6778 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006779 if (pusedDefaultChar && *pusedDefaultChar)
6780 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006781 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006782 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00006783
6784mbcs_encode_error:
6785 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
6786 Py_XDECREF(exc);
6787 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006788}
6789
Alexander Belopolsky40018472011-02-26 01:02:56 +00006790PyObject *
6791PyUnicode_EncodeMBCS(const Py_UNICODE *p,
6792 Py_ssize_t size,
6793 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006794{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006795 PyObject *repr = NULL;
6796 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00006797
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006798#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00006799 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006800 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006801 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006802 else
6803#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006804 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006805
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006806 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006807 Py_XDECREF(repr);
6808 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006809 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006810
6811#ifdef NEED_RETRY
6812 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006813 p += INT_MAX;
6814 size -= INT_MAX;
6815 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006816 }
6817#endif
6818
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006819 return repr;
6820}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006821
Alexander Belopolsky40018472011-02-26 01:02:56 +00006822PyObject *
6823PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006824{
6825 if (!PyUnicode_Check(unicode)) {
6826 PyErr_BadArgument();
6827 return NULL;
6828 }
6829 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006830 PyUnicode_GET_SIZE(unicode),
6831 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006832}
6833
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006834#undef NEED_RETRY
6835
Victor Stinner99b95382011-07-04 14:23:54 +02006836#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006837
Guido van Rossumd57fd912000-03-10 22:53:23 +00006838/* --- Character Mapping Codec -------------------------------------------- */
6839
Alexander Belopolsky40018472011-02-26 01:02:56 +00006840PyObject *
6841PyUnicode_DecodeCharmap(const char *s,
6842 Py_ssize_t size,
6843 PyObject *mapping,
6844 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006845{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006846 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006847 Py_ssize_t startinpos;
6848 Py_ssize_t endinpos;
6849 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006850 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006851 PyUnicodeObject *v;
6852 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006853 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006854 PyObject *errorHandler = NULL;
6855 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006856 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006857 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006858
Guido van Rossumd57fd912000-03-10 22:53:23 +00006859 /* Default to Latin-1 */
6860 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006861 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006862
6863 v = _PyUnicode_New(size);
6864 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006865 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006866 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006867 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006868 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006869 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006870 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006871 mapstring = PyUnicode_AS_UNICODE(mapping);
6872 maplen = PyUnicode_GET_SIZE(mapping);
6873 while (s < e) {
6874 unsigned char ch = *s;
6875 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006876
Benjamin Peterson29060642009-01-31 22:14:21 +00006877 if (ch < maplen)
6878 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006879
Benjamin Peterson29060642009-01-31 22:14:21 +00006880 if (x == 0xfffe) {
6881 /* undefined mapping */
6882 outpos = p-PyUnicode_AS_UNICODE(v);
6883 startinpos = s-starts;
6884 endinpos = startinpos+1;
6885 if (unicode_decode_call_errorhandler(
6886 errors, &errorHandler,
6887 "charmap", "character maps to <undefined>",
6888 &starts, &e, &startinpos, &endinpos, &exc, &s,
6889 &v, &outpos, &p)) {
6890 goto onError;
6891 }
6892 continue;
6893 }
6894 *p++ = x;
6895 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006896 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006897 }
6898 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006899 while (s < e) {
6900 unsigned char ch = *s;
6901 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006902
Benjamin Peterson29060642009-01-31 22:14:21 +00006903 /* Get mapping (char ordinal -> integer, Unicode char or None) */
6904 w = PyLong_FromLong((long)ch);
6905 if (w == NULL)
6906 goto onError;
6907 x = PyObject_GetItem(mapping, w);
6908 Py_DECREF(w);
6909 if (x == NULL) {
6910 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
6911 /* No mapping found means: mapping is undefined. */
6912 PyErr_Clear();
6913 x = Py_None;
6914 Py_INCREF(x);
6915 } else
6916 goto onError;
6917 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006918
Benjamin Peterson29060642009-01-31 22:14:21 +00006919 /* Apply mapping */
6920 if (PyLong_Check(x)) {
6921 long value = PyLong_AS_LONG(x);
6922 if (value < 0 || value > 65535) {
6923 PyErr_SetString(PyExc_TypeError,
6924 "character mapping must be in range(65536)");
6925 Py_DECREF(x);
6926 goto onError;
6927 }
6928 *p++ = (Py_UNICODE)value;
6929 }
6930 else if (x == Py_None) {
6931 /* undefined mapping */
6932 outpos = p-PyUnicode_AS_UNICODE(v);
6933 startinpos = s-starts;
6934 endinpos = startinpos+1;
6935 if (unicode_decode_call_errorhandler(
6936 errors, &errorHandler,
6937 "charmap", "character maps to <undefined>",
6938 &starts, &e, &startinpos, &endinpos, &exc, &s,
6939 &v, &outpos, &p)) {
6940 Py_DECREF(x);
6941 goto onError;
6942 }
6943 Py_DECREF(x);
6944 continue;
6945 }
6946 else if (PyUnicode_Check(x)) {
6947 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006948
Benjamin Peterson29060642009-01-31 22:14:21 +00006949 if (targetsize == 1)
6950 /* 1-1 mapping */
6951 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006952
Benjamin Peterson29060642009-01-31 22:14:21 +00006953 else if (targetsize > 1) {
6954 /* 1-n mapping */
6955 if (targetsize > extrachars) {
6956 /* resize first */
6957 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
6958 Py_ssize_t needed = (targetsize - extrachars) + \
6959 (targetsize << 2);
6960 extrachars += needed;
6961 /* XXX overflow detection missing */
Victor Stinnerfe226c02011-10-03 03:52:20 +02006962 if (PyUnicode_Resize((PyObject**)&v,
Benjamin Peterson29060642009-01-31 22:14:21 +00006963 PyUnicode_GET_SIZE(v) + needed) < 0) {
6964 Py_DECREF(x);
6965 goto onError;
6966 }
6967 p = PyUnicode_AS_UNICODE(v) + oldpos;
6968 }
6969 Py_UNICODE_COPY(p,
6970 PyUnicode_AS_UNICODE(x),
6971 targetsize);
6972 p += targetsize;
6973 extrachars -= targetsize;
6974 }
6975 /* 1-0 mapping: skip the character */
6976 }
6977 else {
6978 /* wrong return value */
6979 PyErr_SetString(PyExc_TypeError,
6980 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006981 Py_DECREF(x);
6982 goto onError;
6983 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006984 Py_DECREF(x);
6985 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006986 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006987 }
6988 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Victor Stinnerfe226c02011-10-03 03:52:20 +02006989 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006990 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006991 Py_XDECREF(errorHandler);
6992 Py_XDECREF(exc);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006993 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006994 Py_DECREF(v);
6995 return NULL;
6996 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006997 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006998
Benjamin Peterson29060642009-01-31 22:14:21 +00006999 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007000 Py_XDECREF(errorHandler);
7001 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007002 Py_XDECREF(v);
7003 return NULL;
7004}
7005
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007006/* Charmap encoding: the lookup table */
7007
Alexander Belopolsky40018472011-02-26 01:02:56 +00007008struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007009 PyObject_HEAD
7010 unsigned char level1[32];
7011 int count2, count3;
7012 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007013};
7014
7015static PyObject*
7016encoding_map_size(PyObject *obj, PyObject* args)
7017{
7018 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007019 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007020 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007021}
7022
7023static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007024 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007025 PyDoc_STR("Return the size (in bytes) of this object") },
7026 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007027};
7028
7029static void
7030encoding_map_dealloc(PyObject* o)
7031{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007032 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007033}
7034
7035static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007036 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007037 "EncodingMap", /*tp_name*/
7038 sizeof(struct encoding_map), /*tp_basicsize*/
7039 0, /*tp_itemsize*/
7040 /* methods */
7041 encoding_map_dealloc, /*tp_dealloc*/
7042 0, /*tp_print*/
7043 0, /*tp_getattr*/
7044 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007045 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007046 0, /*tp_repr*/
7047 0, /*tp_as_number*/
7048 0, /*tp_as_sequence*/
7049 0, /*tp_as_mapping*/
7050 0, /*tp_hash*/
7051 0, /*tp_call*/
7052 0, /*tp_str*/
7053 0, /*tp_getattro*/
7054 0, /*tp_setattro*/
7055 0, /*tp_as_buffer*/
7056 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7057 0, /*tp_doc*/
7058 0, /*tp_traverse*/
7059 0, /*tp_clear*/
7060 0, /*tp_richcompare*/
7061 0, /*tp_weaklistoffset*/
7062 0, /*tp_iter*/
7063 0, /*tp_iternext*/
7064 encoding_map_methods, /*tp_methods*/
7065 0, /*tp_members*/
7066 0, /*tp_getset*/
7067 0, /*tp_base*/
7068 0, /*tp_dict*/
7069 0, /*tp_descr_get*/
7070 0, /*tp_descr_set*/
7071 0, /*tp_dictoffset*/
7072 0, /*tp_init*/
7073 0, /*tp_alloc*/
7074 0, /*tp_new*/
7075 0, /*tp_free*/
7076 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007077};
7078
7079PyObject*
7080PyUnicode_BuildEncodingMap(PyObject* string)
7081{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007082 PyObject *result;
7083 struct encoding_map *mresult;
7084 int i;
7085 int need_dict = 0;
7086 unsigned char level1[32];
7087 unsigned char level2[512];
7088 unsigned char *mlevel1, *mlevel2, *mlevel3;
7089 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007090 int kind;
7091 void *data;
7092 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007093
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007094 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007095 PyErr_BadArgument();
7096 return NULL;
7097 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007098 kind = PyUnicode_KIND(string);
7099 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007100 memset(level1, 0xFF, sizeof level1);
7101 memset(level2, 0xFF, sizeof level2);
7102
7103 /* If there isn't a one-to-one mapping of NULL to \0,
7104 or if there are non-BMP characters, we need to use
7105 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007106 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007107 need_dict = 1;
7108 for (i = 1; i < 256; i++) {
7109 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007110 ch = PyUnicode_READ(kind, data, i);
7111 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007112 need_dict = 1;
7113 break;
7114 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007115 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007116 /* unmapped character */
7117 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007118 l1 = ch >> 11;
7119 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007120 if (level1[l1] == 0xFF)
7121 level1[l1] = count2++;
7122 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007123 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007124 }
7125
7126 if (count2 >= 0xFF || count3 >= 0xFF)
7127 need_dict = 1;
7128
7129 if (need_dict) {
7130 PyObject *result = PyDict_New();
7131 PyObject *key, *value;
7132 if (!result)
7133 return NULL;
7134 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007135 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007136 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007137 if (!key || !value)
7138 goto failed1;
7139 if (PyDict_SetItem(result, key, value) == -1)
7140 goto failed1;
7141 Py_DECREF(key);
7142 Py_DECREF(value);
7143 }
7144 return result;
7145 failed1:
7146 Py_XDECREF(key);
7147 Py_XDECREF(value);
7148 Py_DECREF(result);
7149 return NULL;
7150 }
7151
7152 /* Create a three-level trie */
7153 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7154 16*count2 + 128*count3 - 1);
7155 if (!result)
7156 return PyErr_NoMemory();
7157 PyObject_Init(result, &EncodingMapType);
7158 mresult = (struct encoding_map*)result;
7159 mresult->count2 = count2;
7160 mresult->count3 = count3;
7161 mlevel1 = mresult->level1;
7162 mlevel2 = mresult->level23;
7163 mlevel3 = mresult->level23 + 16*count2;
7164 memcpy(mlevel1, level1, 32);
7165 memset(mlevel2, 0xFF, 16*count2);
7166 memset(mlevel3, 0, 128*count3);
7167 count3 = 0;
7168 for (i = 1; i < 256; i++) {
7169 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007170 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007171 /* unmapped character */
7172 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007173 o1 = PyUnicode_READ(kind, data, i)>>11;
7174 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007175 i2 = 16*mlevel1[o1] + o2;
7176 if (mlevel2[i2] == 0xFF)
7177 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007178 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007179 i3 = 128*mlevel2[i2] + o3;
7180 mlevel3[i3] = i;
7181 }
7182 return result;
7183}
7184
7185static int
7186encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
7187{
7188 struct encoding_map *map = (struct encoding_map*)mapping;
7189 int l1 = c>>11;
7190 int l2 = (c>>7) & 0xF;
7191 int l3 = c & 0x7F;
7192 int i;
7193
7194#ifdef Py_UNICODE_WIDE
7195 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007196 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007197 }
7198#endif
7199 if (c == 0)
7200 return 0;
7201 /* level 1*/
7202 i = map->level1[l1];
7203 if (i == 0xFF) {
7204 return -1;
7205 }
7206 /* level 2*/
7207 i = map->level23[16*i+l2];
7208 if (i == 0xFF) {
7209 return -1;
7210 }
7211 /* level 3 */
7212 i = map->level23[16*map->count2 + 128*i + l3];
7213 if (i == 0) {
7214 return -1;
7215 }
7216 return i;
7217}
7218
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007219/* Lookup the character ch in the mapping. If the character
7220 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007221 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007222static PyObject *
7223charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007224{
Christian Heimes217cfd12007-12-02 14:31:20 +00007225 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007226 PyObject *x;
7227
7228 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007229 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007230 x = PyObject_GetItem(mapping, w);
7231 Py_DECREF(w);
7232 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007233 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7234 /* No mapping found means: mapping is undefined. */
7235 PyErr_Clear();
7236 x = Py_None;
7237 Py_INCREF(x);
7238 return x;
7239 } else
7240 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007241 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007242 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007243 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007244 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007245 long value = PyLong_AS_LONG(x);
7246 if (value < 0 || value > 255) {
7247 PyErr_SetString(PyExc_TypeError,
7248 "character mapping must be in range(256)");
7249 Py_DECREF(x);
7250 return NULL;
7251 }
7252 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007253 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007254 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007255 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007256 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007257 /* wrong return value */
7258 PyErr_Format(PyExc_TypeError,
7259 "character mapping must return integer, bytes or None, not %.400s",
7260 x->ob_type->tp_name);
7261 Py_DECREF(x);
7262 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007263 }
7264}
7265
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007266static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00007267charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007268{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007269 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7270 /* exponentially overallocate to minimize reallocations */
7271 if (requiredsize < 2*outsize)
7272 requiredsize = 2*outsize;
7273 if (_PyBytes_Resize(outobj, requiredsize))
7274 return -1;
7275 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007276}
7277
/* Outcome of encoding one character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* an exception is set */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007281/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00007282 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007283 space is available. Return a new reference to the object that
7284 was put in the output buffer, or Py_None, if the mapping was undefined
7285 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00007286 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007287static charmapencode_result
7288charmapencode_output(Py_UNICODE c, PyObject *mapping,
7289 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007290{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007291 PyObject *rep;
7292 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00007293 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007294
Christian Heimes90aa7642007-12-19 02:45:37 +00007295 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007296 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007297 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007298 if (res == -1)
7299 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00007300 if (outsize<requiredsize)
7301 if (charmapencode_resize(outobj, outpos, requiredsize))
7302 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00007303 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007304 outstart[(*outpos)++] = (char)res;
7305 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007306 }
7307
7308 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007309 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007310 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007311 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007312 Py_DECREF(rep);
7313 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007314 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007315 if (PyLong_Check(rep)) {
7316 Py_ssize_t requiredsize = *outpos+1;
7317 if (outsize<requiredsize)
7318 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7319 Py_DECREF(rep);
7320 return enc_EXCEPTION;
7321 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007322 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007323 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007324 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007325 else {
7326 const char *repchars = PyBytes_AS_STRING(rep);
7327 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
7328 Py_ssize_t requiredsize = *outpos+repsize;
7329 if (outsize<requiredsize)
7330 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7331 Py_DECREF(rep);
7332 return enc_EXCEPTION;
7333 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007334 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007335 memcpy(outstart + *outpos, repchars, repsize);
7336 *outpos += repsize;
7337 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007338 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007339 Py_DECREF(rep);
7340 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007341}
7342
7343/* handle an error in PyUnicode_EncodeCharmap
7344 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007345static int
7346charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00007347 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007348 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00007349 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00007350 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007351{
7352 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007353 Py_ssize_t repsize;
7354 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007355 Py_UNICODE *uni2;
7356 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007357 Py_ssize_t collstartpos = *inpos;
7358 Py_ssize_t collendpos = *inpos+1;
7359 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007360 char *encoding = "charmap";
7361 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007362 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007363
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007364 /* find all unencodable characters */
7365 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007366 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00007367 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007368 int res = encoding_map_lookup(p[collendpos], mapping);
7369 if (res != -1)
7370 break;
7371 ++collendpos;
7372 continue;
7373 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007374
Benjamin Peterson29060642009-01-31 22:14:21 +00007375 rep = charmapencode_lookup(p[collendpos], mapping);
7376 if (rep==NULL)
7377 return -1;
7378 else if (rep!=Py_None) {
7379 Py_DECREF(rep);
7380 break;
7381 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007382 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00007383 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007384 }
7385 /* cache callback name lookup
7386 * (if not done yet, i.e. it's the first error) */
7387 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007388 if ((errors==NULL) || (!strcmp(errors, "strict")))
7389 *known_errorHandler = 1;
7390 else if (!strcmp(errors, "replace"))
7391 *known_errorHandler = 2;
7392 else if (!strcmp(errors, "ignore"))
7393 *known_errorHandler = 3;
7394 else if (!strcmp(errors, "xmlcharrefreplace"))
7395 *known_errorHandler = 4;
7396 else
7397 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007398 }
7399 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007400 case 1: /* strict */
7401 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7402 return -1;
7403 case 2: /* replace */
7404 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007405 x = charmapencode_output('?', mapping, res, respos);
7406 if (x==enc_EXCEPTION) {
7407 return -1;
7408 }
7409 else if (x==enc_FAILED) {
7410 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7411 return -1;
7412 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007413 }
7414 /* fall through */
7415 case 3: /* ignore */
7416 *inpos = collendpos;
7417 break;
7418 case 4: /* xmlcharrefreplace */
7419 /* generate replacement (temporarily (mis)uses p) */
7420 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007421 char buffer[2+29+1+1];
7422 char *cp;
7423 sprintf(buffer, "&#%d;", (int)p[collpos]);
7424 for (cp = buffer; *cp; ++cp) {
7425 x = charmapencode_output(*cp, mapping, res, respos);
7426 if (x==enc_EXCEPTION)
7427 return -1;
7428 else if (x==enc_FAILED) {
7429 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7430 return -1;
7431 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007432 }
7433 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007434 *inpos = collendpos;
7435 break;
7436 default:
7437 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00007438 encoding, reason, p, size, exceptionObject,
7439 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007440 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007441 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00007442 if (PyBytes_Check(repunicode)) {
7443 /* Directly copy bytes result to output. */
7444 Py_ssize_t outsize = PyBytes_Size(*res);
7445 Py_ssize_t requiredsize;
7446 repsize = PyBytes_Size(repunicode);
7447 requiredsize = *respos + repsize;
7448 if (requiredsize > outsize)
7449 /* Make room for all additional bytes. */
7450 if (charmapencode_resize(res, respos, requiredsize)) {
7451 Py_DECREF(repunicode);
7452 return -1;
7453 }
7454 memcpy(PyBytes_AsString(*res) + *respos,
7455 PyBytes_AsString(repunicode), repsize);
7456 *respos += repsize;
7457 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007458 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00007459 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007460 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007461 /* generate replacement */
7462 repsize = PyUnicode_GET_SIZE(repunicode);
7463 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007464 x = charmapencode_output(*uni2, mapping, res, respos);
7465 if (x==enc_EXCEPTION) {
7466 return -1;
7467 }
7468 else if (x==enc_FAILED) {
7469 Py_DECREF(repunicode);
7470 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7471 return -1;
7472 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007473 }
7474 *inpos = newpos;
7475 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007476 }
7477 return 0;
7478}
7479
Alexander Belopolsky40018472011-02-26 01:02:56 +00007480PyObject *
7481PyUnicode_EncodeCharmap(const Py_UNICODE *p,
7482 Py_ssize_t size,
7483 PyObject *mapping,
7484 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007485{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007486 /* output object */
7487 PyObject *res = NULL;
7488 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007489 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007490 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007491 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007492 PyObject *errorHandler = NULL;
7493 PyObject *exc = NULL;
7494 /* the following variable is used for caching string comparisons
7495 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7496 * 3=ignore, 4=xmlcharrefreplace */
7497 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007498
7499 /* Default to Latin-1 */
7500 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007501 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007502
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007503 /* allocate enough for a simple encoding without
7504 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00007505 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007506 if (res == NULL)
7507 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00007508 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007509 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007510
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007511 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007512 /* try to encode it */
7513 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
7514 if (x==enc_EXCEPTION) /* error */
7515 goto onError;
7516 if (x==enc_FAILED) { /* unencodable character */
7517 if (charmap_encoding_error(p, size, &inpos, mapping,
7518 &exc,
7519 &known_errorHandler, &errorHandler, errors,
7520 &res, &respos)) {
7521 goto onError;
7522 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007523 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007524 else
7525 /* done with this character => adjust input position */
7526 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007527 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007528
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007529 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00007530 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007531 if (_PyBytes_Resize(&res, respos) < 0)
7532 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00007533
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007534 Py_XDECREF(exc);
7535 Py_XDECREF(errorHandler);
7536 return res;
7537
Benjamin Peterson29060642009-01-31 22:14:21 +00007538 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007539 Py_XDECREF(res);
7540 Py_XDECREF(exc);
7541 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007542 return NULL;
7543}
7544
Alexander Belopolsky40018472011-02-26 01:02:56 +00007545PyObject *
7546PyUnicode_AsCharmapString(PyObject *unicode,
7547 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007548{
7549 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007550 PyErr_BadArgument();
7551 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007552 }
7553 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00007554 PyUnicode_GET_SIZE(unicode),
7555 mapping,
7556 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007557}
7558
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007559/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007560static void
7561make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007562 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007563 Py_ssize_t startpos, Py_ssize_t endpos,
7564 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007565{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007566 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007567 *exceptionObject = _PyUnicodeTranslateError_Create(
7568 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007569 }
7570 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007571 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
7572 goto onError;
7573 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
7574 goto onError;
7575 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
7576 goto onError;
7577 return;
7578 onError:
7579 Py_DECREF(*exceptionObject);
7580 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007581 }
7582}
7583
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007584/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007585static void
7586raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007587 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007588 Py_ssize_t startpos, Py_ssize_t endpos,
7589 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007590{
7591 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007592 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007593 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007594 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007595}
7596
7597/* error handling callback helper:
7598 build arguments, call the callback and check the arguments,
7599 put the result into newpos and return the replacement string, which
7600 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007601static PyObject *
7602unicode_translate_call_errorhandler(const char *errors,
7603 PyObject **errorHandler,
7604 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007605 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007606 Py_ssize_t startpos, Py_ssize_t endpos,
7607 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007608{
Benjamin Peterson142957c2008-07-04 19:55:29 +00007609 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007610
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007611 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007612 PyObject *restuple;
7613 PyObject *resunicode;
7614
7615 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007616 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007617 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007618 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007619 }
7620
7621 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007622 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007623 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007624 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007625
7626 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00007627 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007628 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007629 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007630 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00007631 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00007632 Py_DECREF(restuple);
7633 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007634 }
7635 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00007636 &resunicode, &i_newpos)) {
7637 Py_DECREF(restuple);
7638 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007639 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007640 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007641 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007642 else
7643 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007644 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007645 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7646 Py_DECREF(restuple);
7647 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00007648 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007649 Py_INCREF(resunicode);
7650 Py_DECREF(restuple);
7651 return resunicode;
7652}
7653
7654/* Lookup the character ch in the mapping and put the result in result,
7655 which must be decrefed by the caller.
7656 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007657static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007658charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007659{
Christian Heimes217cfd12007-12-02 14:31:20 +00007660 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007661 PyObject *x;
7662
7663 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007664 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007665 x = PyObject_GetItem(mapping, w);
7666 Py_DECREF(w);
7667 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007668 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7669 /* No mapping found means: use 1:1 mapping. */
7670 PyErr_Clear();
7671 *result = NULL;
7672 return 0;
7673 } else
7674 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007675 }
7676 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007677 *result = x;
7678 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007679 }
Christian Heimes217cfd12007-12-02 14:31:20 +00007680 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007681 long value = PyLong_AS_LONG(x);
7682 long max = PyUnicode_GetMax();
7683 if (value < 0 || value > max) {
7684 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00007685 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00007686 Py_DECREF(x);
7687 return -1;
7688 }
7689 *result = x;
7690 return 0;
7691 }
7692 else if (PyUnicode_Check(x)) {
7693 *result = x;
7694 return 0;
7695 }
7696 else {
7697 /* wrong return value */
7698 PyErr_SetString(PyExc_TypeError,
7699 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007700 Py_DECREF(x);
7701 return -1;
7702 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007703}
7704/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00007705 if not reallocate and adjust various state variables.
7706 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007707static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007708charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00007709 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007710{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007711 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00007712 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007713 /* exponentially overallocate to minimize reallocations */
7714 if (requiredsize < 2 * oldsize)
7715 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007716 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
7717 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007718 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007719 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007720 }
7721 return 0;
7722}
7723/* lookup the character, put the result in the output string and adjust
7724 various state variables. Return a new reference to the object that
7725 was put in the output buffer in *result, or Py_None, if the mapping was
7726 undefined (in which case no character was written).
7727 The called must decref result.
7728 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007729static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007730charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
7731 PyObject *mapping, Py_UCS4 **output,
7732 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007733 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007734{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007735 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
7736 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00007737 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007738 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007739 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007740 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007741 }
7742 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007743 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00007744 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007745 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007746 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007747 }
7748 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007749 Py_ssize_t repsize;
7750 if (PyUnicode_READY(*res) == -1)
7751 return -1;
7752 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00007753 if (repsize==1) {
7754 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007755 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00007756 }
7757 else if (repsize!=0) {
7758 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007759 Py_ssize_t requiredsize = *opos +
7760 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00007761 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007762 Py_ssize_t i;
7763 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007764 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007765 for(i = 0; i < repsize; i++)
7766 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00007767 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007768 }
7769 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007770 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007771 return 0;
7772}
7773
Alexander Belopolsky40018472011-02-26 01:02:56 +00007774PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007775_PyUnicode_TranslateCharmap(PyObject *input,
7776 PyObject *mapping,
7777 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007778{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007779 /* input object */
7780 char *idata;
7781 Py_ssize_t size, i;
7782 int kind;
7783 /* output buffer */
7784 Py_UCS4 *output = NULL;
7785 Py_ssize_t osize;
7786 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007787 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007788 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007789 char *reason = "character maps to <undefined>";
7790 PyObject *errorHandler = NULL;
7791 PyObject *exc = NULL;
7792 /* the following variable is used for caching string comparisons
7793 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7794 * 3=ignore, 4=xmlcharrefreplace */
7795 int known_errorHandler = -1;
7796
Guido van Rossumd57fd912000-03-10 22:53:23 +00007797 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007798 PyErr_BadArgument();
7799 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007800 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007801
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007802 if (PyUnicode_READY(input) == -1)
7803 return NULL;
7804 idata = (char*)PyUnicode_DATA(input);
7805 kind = PyUnicode_KIND(input);
7806 size = PyUnicode_GET_LENGTH(input);
7807 i = 0;
7808
7809 if (size == 0) {
7810 Py_INCREF(input);
7811 return input;
7812 }
7813
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007814 /* allocate enough for a simple 1:1 translation without
7815 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007816 osize = size;
7817 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
7818 opos = 0;
7819 if (output == NULL) {
7820 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00007821 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007822 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007823
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007824 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007825 /* try to encode it */
7826 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007827 if (charmaptranslate_output(input, i, mapping,
7828 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007829 Py_XDECREF(x);
7830 goto onError;
7831 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007832 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00007833 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007834 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00007835 else { /* untranslatable character */
7836 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
7837 Py_ssize_t repsize;
7838 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007839 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00007840 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007841 Py_ssize_t collstart = i;
7842 Py_ssize_t collend = i+1;
7843 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007844
Benjamin Peterson29060642009-01-31 22:14:21 +00007845 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007846 while (collend < size) {
7847 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007848 goto onError;
7849 Py_XDECREF(x);
7850 if (x!=Py_None)
7851 break;
7852 ++collend;
7853 }
7854 /* cache callback name lookup
7855 * (if not done yet, i.e. it's the first error) */
7856 if (known_errorHandler==-1) {
7857 if ((errors==NULL) || (!strcmp(errors, "strict")))
7858 known_errorHandler = 1;
7859 else if (!strcmp(errors, "replace"))
7860 known_errorHandler = 2;
7861 else if (!strcmp(errors, "ignore"))
7862 known_errorHandler = 3;
7863 else if (!strcmp(errors, "xmlcharrefreplace"))
7864 known_errorHandler = 4;
7865 else
7866 known_errorHandler = 0;
7867 }
7868 switch (known_errorHandler) {
7869 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007870 raise_translate_exception(&exc, input, collstart,
7871 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007872 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007873 case 2: /* replace */
7874 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007875 for (coll = collstart; coll<collend; coll++)
7876 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00007877 /* fall through */
7878 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007879 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007880 break;
7881 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007882 /* generate replacement (temporarily (mis)uses i) */
7883 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007884 char buffer[2+29+1+1];
7885 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007886 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
7887 if (charmaptranslate_makespace(&output, &osize,
7888 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00007889 goto onError;
7890 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007891 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00007892 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007893 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007894 break;
7895 default:
7896 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007897 reason, input, &exc,
7898 collstart, collend, &newpos);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02007899 if (repunicode == NULL || _PyUnicode_READY_REPLACE(&repunicode))
Benjamin Peterson29060642009-01-31 22:14:21 +00007900 goto onError;
7901 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007902 repsize = PyUnicode_GET_LENGTH(repunicode);
7903 if (charmaptranslate_makespace(&output, &osize,
7904 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007905 Py_DECREF(repunicode);
7906 goto onError;
7907 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007908 for (uni2 = 0; repsize-->0; ++uni2)
7909 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
7910 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00007911 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007912 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007913 }
7914 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007915 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
7916 if (!res)
7917 goto onError;
7918 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007919 Py_XDECREF(exc);
7920 Py_XDECREF(errorHandler);
7921 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007922
Benjamin Peterson29060642009-01-31 22:14:21 +00007923 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007924 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007925 Py_XDECREF(exc);
7926 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007927 return NULL;
7928}
7929
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007930/* Deprecated. Use PyUnicode_Translate instead. */
7931PyObject *
7932PyUnicode_TranslateCharmap(const Py_UNICODE *p,
7933 Py_ssize_t size,
7934 PyObject *mapping,
7935 const char *errors)
7936{
7937 PyObject *unicode = PyUnicode_FromUnicode(p, size);
7938 if (!unicode)
7939 return NULL;
7940 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
7941}
7942
Alexander Belopolsky40018472011-02-26 01:02:56 +00007943PyObject *
7944PyUnicode_Translate(PyObject *str,
7945 PyObject *mapping,
7946 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007947{
7948 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007949
Guido van Rossumd57fd912000-03-10 22:53:23 +00007950 str = PyUnicode_FromObject(str);
7951 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007952 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007953 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007954 Py_DECREF(str);
7955 return result;
Tim Petersced69f82003-09-16 20:30:58 +00007956
Benjamin Peterson29060642009-01-31 22:14:21 +00007957 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007958 Py_XDECREF(str);
7959 return NULL;
7960}
Tim Petersced69f82003-09-16 20:30:58 +00007961
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007962static Py_UCS4
7963fix_decimal_and_space_to_ascii(PyUnicodeObject *self)
7964{
7965 /* No need to call PyUnicode_READY(self) because this function is only
7966 called as a callback from fixup() which does it already. */
7967 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
7968 const int kind = PyUnicode_KIND(self);
7969 void *data = PyUnicode_DATA(self);
7970 Py_UCS4 maxchar = 0, ch, fixed;
7971 Py_ssize_t i;
7972
7973 for (i = 0; i < len; ++i) {
7974 ch = PyUnicode_READ(kind, data, i);
7975 fixed = 0;
7976 if (ch > 127) {
7977 if (Py_UNICODE_ISSPACE(ch))
7978 fixed = ' ';
7979 else {
7980 const int decimal = Py_UNICODE_TODECIMAL(ch);
7981 if (decimal >= 0)
7982 fixed = '0' + decimal;
7983 }
7984 if (fixed != 0) {
7985 if (fixed > maxchar)
7986 maxchar = fixed;
7987 PyUnicode_WRITE(kind, data, i, fixed);
7988 }
7989 else if (ch > maxchar)
7990 maxchar = ch;
7991 }
7992 else if (ch > maxchar)
7993 maxchar = ch;
7994 }
7995
7996 return maxchar;
7997}
7998
7999PyObject *
8000_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8001{
8002 if (!PyUnicode_Check(unicode)) {
8003 PyErr_BadInternalCall();
8004 return NULL;
8005 }
8006 if (PyUnicode_READY(unicode) == -1)
8007 return NULL;
8008 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8009 /* If the string is already ASCII, just return the same string */
8010 Py_INCREF(unicode);
8011 return unicode;
8012 }
8013 return fixup((PyUnicodeObject *)unicode, fix_decimal_and_space_to_ascii);
8014}
8015
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008016PyObject *
8017PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8018 Py_ssize_t length)
8019{
8020 PyObject *result;
8021 Py_UNICODE *p; /* write pointer into result */
8022 Py_ssize_t i;
8023 /* Copy to a new string */
8024 result = (PyObject *)_PyUnicode_New(length);
8025 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
8026 if (result == NULL)
8027 return result;
8028 p = PyUnicode_AS_UNICODE(result);
8029 /* Iterate over code points */
8030 for (i = 0; i < length; i++) {
8031 Py_UNICODE ch =s[i];
8032 if (ch > 127) {
8033 int decimal = Py_UNICODE_TODECIMAL(ch);
8034 if (decimal >= 0)
8035 p[i] = '0' + decimal;
8036 }
8037 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008038 if (PyUnicode_READY((PyUnicodeObject*)result) == -1) {
8039 Py_DECREF(result);
8040 return NULL;
8041 }
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008042 return result;
8043}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008044/* --- Decimal Encoder ---------------------------------------------------- */
8045
Alexander Belopolsky40018472011-02-26 01:02:56 +00008046int
8047PyUnicode_EncodeDecimal(Py_UNICODE *s,
8048 Py_ssize_t length,
8049 char *output,
8050 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008051{
8052 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008053 PyObject *errorHandler = NULL;
8054 PyObject *exc = NULL;
8055 const char *encoding = "decimal";
8056 const char *reason = "invalid decimal Unicode string";
8057 /* the following variable is used for caching string comparisons
8058 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
8059 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008060
8061 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008062 PyErr_BadArgument();
8063 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008064 }
8065
8066 p = s;
8067 end = s + length;
8068 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008069 register Py_UNICODE ch = *p;
8070 int decimal;
8071 PyObject *repunicode;
8072 Py_ssize_t repsize;
8073 Py_ssize_t newpos;
8074 Py_UNICODE *uni2;
8075 Py_UNICODE *collstart;
8076 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00008077
Benjamin Peterson29060642009-01-31 22:14:21 +00008078 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008079 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00008080 ++p;
8081 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008082 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008083 decimal = Py_UNICODE_TODECIMAL(ch);
8084 if (decimal >= 0) {
8085 *output++ = '0' + decimal;
8086 ++p;
8087 continue;
8088 }
8089 if (0 < ch && ch < 256) {
8090 *output++ = (char)ch;
8091 ++p;
8092 continue;
8093 }
8094 /* All other characters are considered unencodable */
8095 collstart = p;
8096 collend = p+1;
8097 while (collend < end) {
8098 if ((0 < *collend && *collend < 256) ||
8099 !Py_UNICODE_ISSPACE(*collend) ||
8100 Py_UNICODE_TODECIMAL(*collend))
8101 break;
8102 }
8103 /* cache callback name lookup
8104 * (if not done yet, i.e. it's the first error) */
8105 if (known_errorHandler==-1) {
8106 if ((errors==NULL) || (!strcmp(errors, "strict")))
8107 known_errorHandler = 1;
8108 else if (!strcmp(errors, "replace"))
8109 known_errorHandler = 2;
8110 else if (!strcmp(errors, "ignore"))
8111 known_errorHandler = 3;
8112 else if (!strcmp(errors, "xmlcharrefreplace"))
8113 known_errorHandler = 4;
8114 else
8115 known_errorHandler = 0;
8116 }
8117 switch (known_errorHandler) {
8118 case 1: /* strict */
8119 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
8120 goto onError;
8121 case 2: /* replace */
8122 for (p = collstart; p < collend; ++p)
8123 *output++ = '?';
8124 /* fall through */
8125 case 3: /* ignore */
8126 p = collend;
8127 break;
8128 case 4: /* xmlcharrefreplace */
8129 /* generate replacement (temporarily (mis)uses p) */
8130 for (p = collstart; p < collend; ++p)
8131 output += sprintf(output, "&#%d;", (int)*p);
8132 p = collend;
8133 break;
8134 default:
8135 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
8136 encoding, reason, s, length, &exc,
8137 collstart-s, collend-s, &newpos);
8138 if (repunicode == NULL)
8139 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008140 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00008141 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008142 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
8143 Py_DECREF(repunicode);
8144 goto onError;
8145 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008146 /* generate replacement */
8147 repsize = PyUnicode_GET_SIZE(repunicode);
8148 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
8149 Py_UNICODE ch = *uni2;
8150 if (Py_UNICODE_ISSPACE(ch))
8151 *output++ = ' ';
8152 else {
8153 decimal = Py_UNICODE_TODECIMAL(ch);
8154 if (decimal >= 0)
8155 *output++ = '0' + decimal;
8156 else if (0 < ch && ch < 256)
8157 *output++ = (char)ch;
8158 else {
8159 Py_DECREF(repunicode);
8160 raise_encode_exception(&exc, encoding,
8161 s, length, collstart-s, collend-s, reason);
8162 goto onError;
8163 }
8164 }
8165 }
8166 p = s + newpos;
8167 Py_DECREF(repunicode);
8168 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00008169 }
8170 /* 0-terminate the output string */
8171 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008172 Py_XDECREF(exc);
8173 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008174 return 0;
8175
Benjamin Peterson29060642009-01-31 22:14:21 +00008176 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008177 Py_XDECREF(exc);
8178 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008179 return -1;
8180}
8181
Guido van Rossumd57fd912000-03-10 22:53:23 +00008182/* --- Helpers ------------------------------------------------------------ */
8183
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008184#include "stringlib/ucs1lib.h"
8185#include "stringlib/fastsearch.h"
8186#include "stringlib/partition.h"
8187#include "stringlib/split.h"
8188#include "stringlib/count.h"
8189#include "stringlib/find.h"
8190#include "stringlib/localeutil.h"
8191#include "stringlib/undef.h"
8192
8193#include "stringlib/ucs2lib.h"
8194#include "stringlib/fastsearch.h"
8195#include "stringlib/partition.h"
8196#include "stringlib/split.h"
8197#include "stringlib/count.h"
8198#include "stringlib/find.h"
8199#include "stringlib/localeutil.h"
8200#include "stringlib/undef.h"
8201
8202#include "stringlib/ucs4lib.h"
8203#include "stringlib/fastsearch.h"
8204#include "stringlib/partition.h"
8205#include "stringlib/split.h"
8206#include "stringlib/count.h"
8207#include "stringlib/find.h"
8208#include "stringlib/localeutil.h"
8209#include "stringlib/undef.h"
8210
8211static Py_ssize_t
8212any_find_slice(Py_ssize_t Py_LOCAL_CALLBACK(ucs1)(const Py_UCS1*, Py_ssize_t,
8213 const Py_UCS1*, Py_ssize_t,
8214 Py_ssize_t, Py_ssize_t),
8215 Py_ssize_t Py_LOCAL_CALLBACK(ucs2)(const Py_UCS2*, Py_ssize_t,
8216 const Py_UCS2*, Py_ssize_t,
8217 Py_ssize_t, Py_ssize_t),
8218 Py_ssize_t Py_LOCAL_CALLBACK(ucs4)(const Py_UCS4*, Py_ssize_t,
8219 const Py_UCS4*, Py_ssize_t,
8220 Py_ssize_t, Py_ssize_t),
8221 PyObject* s1, PyObject* s2,
8222 Py_ssize_t start,
8223 Py_ssize_t end)
8224{
8225 int kind1, kind2, kind;
8226 void *buf1, *buf2;
8227 Py_ssize_t len1, len2, result;
8228
8229 kind1 = PyUnicode_KIND(s1);
8230 kind2 = PyUnicode_KIND(s2);
8231 kind = kind1 > kind2 ? kind1 : kind2;
8232 buf1 = PyUnicode_DATA(s1);
8233 buf2 = PyUnicode_DATA(s2);
8234 if (kind1 != kind)
8235 buf1 = _PyUnicode_AsKind(s1, kind);
8236 if (!buf1)
8237 return -2;
8238 if (kind2 != kind)
8239 buf2 = _PyUnicode_AsKind(s2, kind);
8240 if (!buf2) {
8241 if (kind1 != kind) PyMem_Free(buf1);
8242 return -2;
8243 }
8244 len1 = PyUnicode_GET_LENGTH(s1);
8245 len2 = PyUnicode_GET_LENGTH(s2);
8246
8247 switch(kind) {
8248 case PyUnicode_1BYTE_KIND:
8249 result = ucs1(buf1, len1, buf2, len2, start, end);
8250 break;
8251 case PyUnicode_2BYTE_KIND:
8252 result = ucs2(buf1, len1, buf2, len2, start, end);
8253 break;
8254 case PyUnicode_4BYTE_KIND:
8255 result = ucs4(buf1, len1, buf2, len2, start, end);
8256 break;
8257 default:
8258 assert(0); result = -2;
8259 }
8260
8261 if (kind1 != kind)
8262 PyMem_Free(buf1);
8263 if (kind2 != kind)
8264 PyMem_Free(buf2);
8265
8266 return result;
8267}
8268
8269Py_ssize_t
8270_PyUnicode_InsertThousandsGrouping(int kind, void *data,
8271 Py_ssize_t n_buffer,
8272 void *digits, Py_ssize_t n_digits,
8273 Py_ssize_t min_width,
8274 const char *grouping,
8275 const char *thousands_sep)
8276{
8277 switch(kind) {
8278 case PyUnicode_1BYTE_KIND:
8279 return _PyUnicode_ucs1_InsertThousandsGrouping(
8280 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8281 min_width, grouping, thousands_sep);
8282 case PyUnicode_2BYTE_KIND:
8283 return _PyUnicode_ucs2_InsertThousandsGrouping(
8284 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
8285 min_width, grouping, thousands_sep);
8286 case PyUnicode_4BYTE_KIND:
8287 return _PyUnicode_ucs4_InsertThousandsGrouping(
8288 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
8289 min_width, grouping, thousands_sep);
8290 }
8291 assert(0);
8292 return -1;
8293}
8294
8295
Eric Smith8c663262007-08-25 02:26:07 +00008296#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00008297#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008298
Thomas Wouters477c8d52006-05-27 19:21:47 +00008299#include "stringlib/count.h"
8300#include "stringlib/find.h"
Eric Smith5807c412008-05-11 21:00:57 +00008301
/* Helper macro to fix up start/end slice values in place.

   `end` is clipped to `len`; a negative `start` or `end` is taken relative
   to `len` and then clamped at 0.  `start` and `end` must be modifiable
   lvalues.  The arguments are evaluated more than once, so they must not
   have side effects.

   The body is wrapped in do { ... } while (0) so that the macro behaves as
   a single statement (e.g. inside an unbraced if/else); invoke it with a
   trailing semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00008316
Alexander Belopolsky40018472011-02-26 01:02:56 +00008317Py_ssize_t
8318PyUnicode_Count(PyObject *str,
8319 PyObject *substr,
8320 Py_ssize_t start,
8321 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008322{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008323 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008324 PyUnicodeObject* str_obj;
8325 PyUnicodeObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008326 int kind1, kind2, kind;
8327 void *buf1 = NULL, *buf2 = NULL;
8328 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00008329
Thomas Wouters477c8d52006-05-27 19:21:47 +00008330 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008331 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008332 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008333 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
Victor Stinnere9a29352011-10-01 02:14:59 +02008334 if (!sub_obj || PyUnicode_READY(sub_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008335 Py_DECREF(str_obj);
8336 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008337 }
Tim Petersced69f82003-09-16 20:30:58 +00008338
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008339 kind1 = PyUnicode_KIND(str_obj);
8340 kind2 = PyUnicode_KIND(sub_obj);
8341 kind = kind1 > kind2 ? kind1 : kind2;
8342 buf1 = PyUnicode_DATA(str_obj);
8343 if (kind1 != kind)
8344 buf1 = _PyUnicode_AsKind((PyObject*)str_obj, kind);
8345 if (!buf1)
8346 goto onError;
8347 buf2 = PyUnicode_DATA(sub_obj);
8348 if (kind2 != kind)
8349 buf2 = _PyUnicode_AsKind((PyObject*)sub_obj, kind);
8350 if (!buf2)
8351 goto onError;
8352 len1 = PyUnicode_GET_LENGTH(str_obj);
8353 len2 = PyUnicode_GET_LENGTH(sub_obj);
8354
8355 ADJUST_INDICES(start, end, len1);
8356 switch(kind) {
8357 case PyUnicode_1BYTE_KIND:
8358 result = ucs1lib_count(
8359 ((Py_UCS1*)buf1) + start, end - start,
8360 buf2, len2, PY_SSIZE_T_MAX
8361 );
8362 break;
8363 case PyUnicode_2BYTE_KIND:
8364 result = ucs2lib_count(
8365 ((Py_UCS2*)buf1) + start, end - start,
8366 buf2, len2, PY_SSIZE_T_MAX
8367 );
8368 break;
8369 case PyUnicode_4BYTE_KIND:
8370 result = ucs4lib_count(
8371 ((Py_UCS4*)buf1) + start, end - start,
8372 buf2, len2, PY_SSIZE_T_MAX
8373 );
8374 break;
8375 default:
8376 assert(0); result = 0;
8377 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00008378
8379 Py_DECREF(sub_obj);
8380 Py_DECREF(str_obj);
8381
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008382 if (kind1 != kind)
8383 PyMem_Free(buf1);
8384 if (kind2 != kind)
8385 PyMem_Free(buf2);
8386
Guido van Rossumd57fd912000-03-10 22:53:23 +00008387 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008388 onError:
8389 Py_DECREF(sub_obj);
8390 Py_DECREF(str_obj);
8391 if (kind1 != kind && buf1)
8392 PyMem_Free(buf1);
8393 if (kind2 != kind && buf2)
8394 PyMem_Free(buf2);
8395 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008396}
8397
Alexander Belopolsky40018472011-02-26 01:02:56 +00008398Py_ssize_t
8399PyUnicode_Find(PyObject *str,
8400 PyObject *sub,
8401 Py_ssize_t start,
8402 Py_ssize_t end,
8403 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008404{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008405 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008406
Guido van Rossumd57fd912000-03-10 22:53:23 +00008407 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008408 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008409 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008410 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008411 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008412 Py_DECREF(str);
8413 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008414 }
Tim Petersced69f82003-09-16 20:30:58 +00008415
Thomas Wouters477c8d52006-05-27 19:21:47 +00008416 if (direction > 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008417 result = any_find_slice(
8418 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
8419 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008420 );
8421 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008422 result = any_find_slice(
8423 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
8424 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008425 );
8426
Guido van Rossumd57fd912000-03-10 22:53:23 +00008427 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008428 Py_DECREF(sub);
8429
Guido van Rossumd57fd912000-03-10 22:53:23 +00008430 return result;
8431}
8432
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008433Py_ssize_t
8434PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
8435 Py_ssize_t start, Py_ssize_t end,
8436 int direction)
8437{
8438 char *result;
8439 int kind;
8440 if (PyUnicode_READY(str) == -1)
8441 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02008442 if (start < 0 || end < 0) {
8443 PyErr_SetString(PyExc_IndexError, "string index out of range");
8444 return -2;
8445 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008446 if (end > PyUnicode_GET_LENGTH(str))
8447 end = PyUnicode_GET_LENGTH(str);
8448 kind = PyUnicode_KIND(str);
8449 result = findchar(PyUnicode_1BYTE_DATA(str)
8450 + PyUnicode_KIND_SIZE(kind, start),
8451 kind,
8452 end-start, ch, direction);
8453 if (!result)
8454 return -1;
8455 return (result-(char*)PyUnicode_DATA(str)) >> (kind-1);
8456}
8457
Alexander Belopolsky40018472011-02-26 01:02:56 +00008458static int
8459tailmatch(PyUnicodeObject *self,
8460 PyUnicodeObject *substring,
8461 Py_ssize_t start,
8462 Py_ssize_t end,
8463 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008464{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008465 int kind_self;
8466 int kind_sub;
8467 void *data_self;
8468 void *data_sub;
8469 Py_ssize_t offset;
8470 Py_ssize_t i;
8471 Py_ssize_t end_sub;
8472
8473 if (PyUnicode_READY(self) == -1 ||
8474 PyUnicode_READY(substring) == -1)
8475 return 0;
8476
8477 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008478 return 1;
8479
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008480 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
8481 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008482 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00008483 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008484
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008485 kind_self = PyUnicode_KIND(self);
8486 data_self = PyUnicode_DATA(self);
8487 kind_sub = PyUnicode_KIND(substring);
8488 data_sub = PyUnicode_DATA(substring);
8489 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
8490
8491 if (direction > 0)
8492 offset = end;
8493 else
8494 offset = start;
8495
8496 if (PyUnicode_READ(kind_self, data_self, offset) ==
8497 PyUnicode_READ(kind_sub, data_sub, 0) &&
8498 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
8499 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
8500 /* If both are of the same kind, memcmp is sufficient */
8501 if (kind_self == kind_sub) {
8502 return ! memcmp((char *)data_self +
8503 (offset * PyUnicode_CHARACTER_SIZE(substring)),
8504 data_sub,
8505 PyUnicode_GET_LENGTH(substring) *
8506 PyUnicode_CHARACTER_SIZE(substring));
8507 }
8508 /* otherwise we have to compare each character by first accesing it */
8509 else {
8510 /* We do not need to compare 0 and len(substring)-1 because
8511 the if statement above ensured already that they are equal
8512 when we end up here. */
8513 // TODO: honor direction and do a forward or backwards search
8514 for (i = 1; i < end_sub; ++i) {
8515 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
8516 PyUnicode_READ(kind_sub, data_sub, i))
8517 return 0;
8518 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008519 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008520 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008521 }
8522
8523 return 0;
8524}
8525
Alexander Belopolsky40018472011-02-26 01:02:56 +00008526Py_ssize_t
8527PyUnicode_Tailmatch(PyObject *str,
8528 PyObject *substr,
8529 Py_ssize_t start,
8530 Py_ssize_t end,
8531 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008532{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008533 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008534
Guido van Rossumd57fd912000-03-10 22:53:23 +00008535 str = PyUnicode_FromObject(str);
8536 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008537 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008538 substr = PyUnicode_FromObject(substr);
8539 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008540 Py_DECREF(str);
8541 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008542 }
Tim Petersced69f82003-09-16 20:30:58 +00008543
Guido van Rossumd57fd912000-03-10 22:53:23 +00008544 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00008545 (PyUnicodeObject *)substr,
8546 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008547 Py_DECREF(str);
8548 Py_DECREF(substr);
8549 return result;
8550}
8551
Guido van Rossumd57fd912000-03-10 22:53:23 +00008552/* Apply fixfct filter to the Unicode object self and return a
8553 reference to the modified object */
8554
Alexander Belopolsky40018472011-02-26 01:02:56 +00008555static PyObject *
8556fixup(PyUnicodeObject *self,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008557 Py_UCS4 (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008558{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008559 PyObject *u;
8560 Py_UCS4 maxchar_old, maxchar_new = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008561
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008562 if (PyUnicode_READY(self) == -1)
8563 return NULL;
8564 maxchar_old = PyUnicode_MAX_CHAR_VALUE(self);
8565 u = PyUnicode_New(PyUnicode_GET_LENGTH(self),
8566 maxchar_old);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008567 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008568 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008569
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008570 Py_MEMCPY(PyUnicode_1BYTE_DATA(u), PyUnicode_1BYTE_DATA(self),
8571 PyUnicode_GET_LENGTH(u) * PyUnicode_CHARACTER_SIZE(u));
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008572
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008573 /* fix functions return the new maximum character in a string,
8574 if the kind of the resulting unicode object does not change,
8575 everything is fine. Otherwise we need to change the string kind
8576 and re-run the fix function. */
8577 maxchar_new = fixfct((PyUnicodeObject*)u);
8578 if (maxchar_new == 0)
8579 /* do nothing, keep maxchar_new at 0 which means no changes. */;
8580 else if (maxchar_new <= 127)
8581 maxchar_new = 127;
8582 else if (maxchar_new <= 255)
8583 maxchar_new = 255;
8584 else if (maxchar_new <= 65535)
8585 maxchar_new = 65535;
8586 else
8587 maxchar_new = 1114111; /* 0x10ffff */
8588
8589 if (!maxchar_new && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008590 /* fixfct should return TRUE if it modified the buffer. If
8591 FALSE, return a reference to the original buffer instead
8592 (to save space, not time) */
8593 Py_INCREF(self);
8594 Py_DECREF(u);
8595 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008596 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008597 else if (maxchar_new == maxchar_old) {
8598 return u;
8599 }
8600 else {
8601 /* In case the maximum character changed, we need to
8602 convert the string to the new category. */
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008603 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008604 if (v == NULL) {
8605 Py_DECREF(u);
8606 return NULL;
8607 }
8608 if (maxchar_new > maxchar_old) {
8609 /* If the maxchar increased so that the kind changed, not all
8610 characters are representable anymore and we need to fix the
8611 string again. This only happens in very few cases. */
Victor Stinner157f83f2011-09-28 21:41:31 +02008612 if (PyUnicode_CopyCharacters(v, 0,
8613 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008614 PyUnicode_GET_LENGTH(self)) < 0)
8615 {
8616 Py_DECREF(u);
8617 return NULL;
8618 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008619 maxchar_old = fixfct((PyUnicodeObject*)v);
8620 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
8621 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008622 else {
Victor Stinner157f83f2011-09-28 21:41:31 +02008623 if (PyUnicode_CopyCharacters(v, 0,
8624 u, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008625 PyUnicode_GET_LENGTH(self)) < 0)
8626 {
8627 Py_DECREF(u);
8628 return NULL;
8629 }
8630 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008631
8632 Py_DECREF(u);
8633 return v;
8634 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008635}
8636
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008637static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008638fixupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008639{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008640 /* No need to call PyUnicode_READY(self) because this function is only
8641 called as a callback from fixup() which does it already. */
8642 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8643 const int kind = PyUnicode_KIND(self);
8644 void *data = PyUnicode_DATA(self);
8645 int touched = 0;
8646 Py_UCS4 maxchar = 0;
8647 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008648
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008649 for (i = 0; i < len; ++i) {
8650 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8651 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
8652 if (up != ch) {
8653 if (up > maxchar)
8654 maxchar = up;
8655 PyUnicode_WRITE(kind, data, i, up);
8656 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008657 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008658 else if (ch > maxchar)
8659 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008660 }
8661
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008662 if (touched)
8663 return maxchar;
8664 else
8665 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008666}
8667
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008668static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008669fixlower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008670{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008671 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8672 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8673 const int kind = PyUnicode_KIND(self);
8674 void *data = PyUnicode_DATA(self);
8675 int touched = 0;
8676 Py_UCS4 maxchar = 0;
8677 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008678
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008679 for(i = 0; i < len; ++i) {
8680 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8681 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8682 if (lo != ch) {
8683 if (lo > maxchar)
8684 maxchar = lo;
8685 PyUnicode_WRITE(kind, data, i, lo);
8686 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008687 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008688 else if (ch > maxchar)
8689 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008690 }
8691
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008692 if (touched)
8693 return maxchar;
8694 else
8695 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008696}
8697
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008698static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008699fixswapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008700{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008701 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8702 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8703 const int kind = PyUnicode_KIND(self);
8704 void *data = PyUnicode_DATA(self);
8705 int touched = 0;
8706 Py_UCS4 maxchar = 0;
8707 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008708
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008709 for(i = 0; i < len; ++i) {
8710 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8711 Py_UCS4 nu = 0;
8712
8713 if (Py_UNICODE_ISUPPER(ch))
8714 nu = Py_UNICODE_TOLOWER(ch);
8715 else if (Py_UNICODE_ISLOWER(ch))
8716 nu = Py_UNICODE_TOUPPER(ch);
8717
8718 if (nu != 0) {
8719 if (nu > maxchar)
8720 maxchar = nu;
8721 PyUnicode_WRITE(kind, data, i, nu);
8722 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008723 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008724 else if (ch > maxchar)
8725 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008726 }
8727
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008728 if (touched)
8729 return maxchar;
8730 else
8731 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008732}
8733
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008734static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008735fixcapitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008736{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008737 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8738 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8739 const int kind = PyUnicode_KIND(self);
8740 void *data = PyUnicode_DATA(self);
8741 int touched = 0;
8742 Py_UCS4 maxchar = 0;
8743 Py_ssize_t i = 0;
8744 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00008745
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008746 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008747 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008748
8749 ch = PyUnicode_READ(kind, data, i);
8750 if (!Py_UNICODE_ISUPPER(ch)) {
8751 maxchar = Py_UNICODE_TOUPPER(ch);
8752 PyUnicode_WRITE(kind, data, i, maxchar);
8753 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008754 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008755 ++i;
8756 for(; i < len; ++i) {
8757 ch = PyUnicode_READ(kind, data, i);
8758 if (!Py_UNICODE_ISLOWER(ch)) {
8759 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8760 if (lo > maxchar)
8761 maxchar = lo;
8762 PyUnicode_WRITE(kind, data, i, lo);
8763 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008764 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008765 else if (ch > maxchar)
8766 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008767 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008768
8769 if (touched)
8770 return maxchar;
8771 else
8772 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008773}
8774
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008775static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008776fixtitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008777{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008778 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8779 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8780 const int kind = PyUnicode_KIND(self);
8781 void *data = PyUnicode_DATA(self);
8782 Py_UCS4 maxchar = 0;
8783 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008784 int previous_is_cased;
8785
8786 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008787 if (len == 1) {
8788 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8789 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
8790 if (ti != ch) {
8791 PyUnicode_WRITE(kind, data, i, ti);
8792 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00008793 }
8794 else
8795 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008796 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008797 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008798 for(; i < len; ++i) {
8799 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8800 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00008801
Benjamin Peterson29060642009-01-31 22:14:21 +00008802 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008803 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00008804 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008805 nu = Py_UNICODE_TOTITLE(ch);
8806
8807 if (nu > maxchar)
8808 maxchar = nu;
8809 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00008810
Benjamin Peterson29060642009-01-31 22:14:21 +00008811 if (Py_UNICODE_ISLOWER(ch) ||
8812 Py_UNICODE_ISUPPER(ch) ||
8813 Py_UNICODE_ISTITLE(ch))
8814 previous_is_cased = 1;
8815 else
8816 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008817 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008818 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008819}
8820
Tim Peters8ce9f162004-08-27 01:49:32 +00008821PyObject *
8822PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008823{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008824 PyObject *sep = NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008825 Py_ssize_t seplen = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008826 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00008827 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008828 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
8829 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00008830 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008831 Py_ssize_t sz, i, res_offset;
8832 Py_UCS4 maxchar = 0;
8833 Py_UCS4 item_maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008834
Tim Peters05eba1f2004-08-27 21:32:02 +00008835 fseq = PySequence_Fast(seq, "");
8836 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008837 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00008838 }
8839
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008840 /* NOTE: the following code can't call back into Python code,
8841 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00008842 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008843
Tim Peters05eba1f2004-08-27 21:32:02 +00008844 seqlen = PySequence_Fast_GET_SIZE(fseq);
8845 /* If empty sequence, return u"". */
8846 if (seqlen == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008847 res = PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008848 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00008849 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008850 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00008851 /* If singleton sequence with an exact Unicode, return that. */
8852 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008853 item = items[0];
8854 if (PyUnicode_CheckExact(item)) {
8855 Py_INCREF(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008856 res = item;
Benjamin Peterson29060642009-01-31 22:14:21 +00008857 goto Done;
8858 }
Tim Peters8ce9f162004-08-27 01:49:32 +00008859 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008860 else {
8861 /* Set up sep and seplen */
8862 if (separator == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008863 /* fall back to a blank space separator */
8864 sep = PyUnicode_FromOrdinal(' ');
Victor Stinnere9a29352011-10-01 02:14:59 +02008865 if (!sep)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008866 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00008867 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008868 else {
8869 if (!PyUnicode_Check(separator)) {
8870 PyErr_Format(PyExc_TypeError,
8871 "separator: expected str instance,"
8872 " %.80s found",
8873 Py_TYPE(separator)->tp_name);
8874 goto onError;
8875 }
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02008876 if (PyUnicode_READY(separator))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008877 goto onError;
8878 sep = separator;
8879 seplen = PyUnicode_GET_LENGTH(separator);
8880 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
8881 /* inc refcount to keep this code path symetric with the
8882 above case of a blank separator */
8883 Py_INCREF(sep);
Tim Peters05eba1f2004-08-27 21:32:02 +00008884 }
8885 }
8886
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008887 /* There are at least two things to join, or else we have a subclass
8888 * of str in the sequence.
8889 * Do a pre-pass to figure out the total amount of space we'll
8890 * need (sz), and see whether all argument are strings.
8891 */
8892 sz = 0;
8893 for (i = 0; i < seqlen; i++) {
8894 const Py_ssize_t old_sz = sz;
8895 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00008896 if (!PyUnicode_Check(item)) {
8897 PyErr_Format(PyExc_TypeError,
8898 "sequence item %zd: expected str instance,"
8899 " %.80s found",
8900 i, Py_TYPE(item)->tp_name);
8901 goto onError;
8902 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008903 if (PyUnicode_READY(item) == -1)
8904 goto onError;
8905 sz += PyUnicode_GET_LENGTH(item);
8906 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
8907 if (item_maxchar > maxchar)
8908 maxchar = item_maxchar;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008909 if (i != 0)
8910 sz += seplen;
8911 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
8912 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008913 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008914 goto onError;
8915 }
8916 }
Tim Petersced69f82003-09-16 20:30:58 +00008917
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008918 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008919 if (res == NULL)
8920 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00008921
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008922 /* Catenate everything. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008923 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinner9ce5a832011-10-03 23:36:02 +02008924 Py_ssize_t itemlen, copied;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008925 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00008926 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02008927 if (i && seplen != 0) {
8928 copied = PyUnicode_CopyCharacters(res, res_offset,
8929 sep, 0, seplen);
8930 if (copied < 0)
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008931 goto onError;
Victor Stinner9ce5a832011-10-03 23:36:02 +02008932#ifdef Py_DEBUG
8933 res_offset += copied;
8934#else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008935 res_offset += seplen;
Victor Stinner9ce5a832011-10-03 23:36:02 +02008936#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00008937 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02008938 itemlen = PyUnicode_GET_LENGTH(item);
8939 if (itemlen != 0) {
8940 copied = PyUnicode_CopyCharacters(res, res_offset,
8941 item, 0, itemlen);
8942 if (copied < 0)
8943 goto onError;
8944#ifdef Py_DEBUG
8945 res_offset += copied;
8946#else
8947 res_offset += itemlen;
8948#endif
8949 }
Tim Peters05eba1f2004-08-27 21:32:02 +00008950 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008951 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00008952
Benjamin Peterson29060642009-01-31 22:14:21 +00008953 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00008954 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008955 Py_XDECREF(sep);
8956 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008957
Benjamin Peterson29060642009-01-31 22:14:21 +00008958 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00008959 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008960 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00008961 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008962 return NULL;
8963}
8964
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008965#define FILL(kind, data, value, start, length) \
8966 do { \
8967 Py_ssize_t i_ = 0; \
8968 assert(kind != PyUnicode_WCHAR_KIND); \
8969 switch ((kind)) { \
8970 case PyUnicode_1BYTE_KIND: { \
8971 unsigned char * to_ = (unsigned char *)((data)) + (start); \
8972 memset(to_, (unsigned char)value, length); \
8973 break; \
8974 } \
8975 case PyUnicode_2BYTE_KIND: { \
8976 Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
8977 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
8978 break; \
8979 } \
8980 default: { \
8981 Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
8982 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
8983 break; \
8984 } \
8985 } \
8986 } while (0)
8987
Alexander Belopolsky40018472011-02-26 01:02:56 +00008988static PyUnicodeObject *
8989pad(PyUnicodeObject *self,
8990 Py_ssize_t left,
8991 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008992 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008993{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008994 PyObject *u;
8995 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008996 int kind;
8997 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008998
8999 if (left < 0)
9000 left = 0;
9001 if (right < 0)
9002 right = 0;
9003
Tim Peters7a29bd52001-09-12 03:03:31 +00009004 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009005 Py_INCREF(self);
9006 return self;
9007 }
9008
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009009 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9010 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009011 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9012 return NULL;
9013 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009014 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9015 if (fill > maxchar)
9016 maxchar = fill;
9017 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009018 if (!u)
9019 return NULL;
9020
9021 kind = PyUnicode_KIND(u);
9022 data = PyUnicode_DATA(u);
9023 if (left)
9024 FILL(kind, data, fill, 0, left);
9025 if (right)
9026 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinner157f83f2011-09-28 21:41:31 +02009027 if (PyUnicode_CopyCharacters(u, left,
9028 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009029 _PyUnicode_LENGTH(self)) < 0)
9030 {
9031 Py_DECREF(u);
9032 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009033 }
9034
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009035 return (PyUnicodeObject*)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009036}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009037#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00009038
Alexander Belopolsky40018472011-02-26 01:02:56 +00009039PyObject *
9040PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009041{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009042 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009043
9044 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009045 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009046 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009047
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009048 switch(PyUnicode_KIND(string)) {
9049 case PyUnicode_1BYTE_KIND:
9050 list = ucs1lib_splitlines(
9051 (PyObject*) string, PyUnicode_1BYTE_DATA(string),
9052 PyUnicode_GET_LENGTH(string), keepends);
9053 break;
9054 case PyUnicode_2BYTE_KIND:
9055 list = ucs2lib_splitlines(
9056 (PyObject*) string, PyUnicode_2BYTE_DATA(string),
9057 PyUnicode_GET_LENGTH(string), keepends);
9058 break;
9059 case PyUnicode_4BYTE_KIND:
9060 list = ucs4lib_splitlines(
9061 (PyObject*) string, PyUnicode_4BYTE_DATA(string),
9062 PyUnicode_GET_LENGTH(string), keepends);
9063 break;
9064 default:
9065 assert(0);
9066 list = 0;
9067 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009068 Py_DECREF(string);
9069 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009070}
9071
Alexander Belopolsky40018472011-02-26 01:02:56 +00009072static PyObject *
9073split(PyUnicodeObject *self,
9074 PyUnicodeObject *substring,
9075 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009076{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009077 int kind1, kind2, kind;
9078 void *buf1, *buf2;
9079 Py_ssize_t len1, len2;
9080 PyObject* out;
9081
Guido van Rossumd57fd912000-03-10 22:53:23 +00009082 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009083 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009084
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009085 if (PyUnicode_READY(self) == -1)
9086 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009087
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009088 if (substring == NULL)
9089 switch(PyUnicode_KIND(self)) {
9090 case PyUnicode_1BYTE_KIND:
9091 return ucs1lib_split_whitespace(
9092 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9093 PyUnicode_GET_LENGTH(self), maxcount
9094 );
9095 case PyUnicode_2BYTE_KIND:
9096 return ucs2lib_split_whitespace(
9097 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
9098 PyUnicode_GET_LENGTH(self), maxcount
9099 );
9100 case PyUnicode_4BYTE_KIND:
9101 return ucs4lib_split_whitespace(
9102 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
9103 PyUnicode_GET_LENGTH(self), maxcount
9104 );
9105 default:
9106 assert(0);
9107 return NULL;
9108 }
9109
9110 if (PyUnicode_READY(substring) == -1)
9111 return NULL;
9112
9113 kind1 = PyUnicode_KIND(self);
9114 kind2 = PyUnicode_KIND(substring);
9115 kind = kind1 > kind2 ? kind1 : kind2;
9116 buf1 = PyUnicode_DATA(self);
9117 buf2 = PyUnicode_DATA(substring);
9118 if (kind1 != kind)
9119 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9120 if (!buf1)
9121 return NULL;
9122 if (kind2 != kind)
9123 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9124 if (!buf2) {
9125 if (kind1 != kind) PyMem_Free(buf1);
9126 return NULL;
9127 }
9128 len1 = PyUnicode_GET_LENGTH(self);
9129 len2 = PyUnicode_GET_LENGTH(substring);
9130
9131 switch(kind) {
9132 case PyUnicode_1BYTE_KIND:
9133 out = ucs1lib_split(
9134 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9135 break;
9136 case PyUnicode_2BYTE_KIND:
9137 out = ucs2lib_split(
9138 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9139 break;
9140 case PyUnicode_4BYTE_KIND:
9141 out = ucs4lib_split(
9142 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9143 break;
9144 default:
9145 out = NULL;
9146 }
9147 if (kind1 != kind)
9148 PyMem_Free(buf1);
9149 if (kind2 != kind)
9150 PyMem_Free(buf2);
9151 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009152}
9153
Alexander Belopolsky40018472011-02-26 01:02:56 +00009154static PyObject *
9155rsplit(PyUnicodeObject *self,
9156 PyUnicodeObject *substring,
9157 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009158{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009159 int kind1, kind2, kind;
9160 void *buf1, *buf2;
9161 Py_ssize_t len1, len2;
9162 PyObject* out;
9163
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009164 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009165 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009166
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009167 if (PyUnicode_READY(self) == -1)
9168 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009169
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009170 if (substring == NULL)
9171 switch(PyUnicode_KIND(self)) {
9172 case PyUnicode_1BYTE_KIND:
9173 return ucs1lib_rsplit_whitespace(
9174 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9175 PyUnicode_GET_LENGTH(self), maxcount
9176 );
9177 case PyUnicode_2BYTE_KIND:
9178 return ucs2lib_rsplit_whitespace(
9179 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
9180 PyUnicode_GET_LENGTH(self), maxcount
9181 );
9182 case PyUnicode_4BYTE_KIND:
9183 return ucs4lib_rsplit_whitespace(
9184 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
9185 PyUnicode_GET_LENGTH(self), maxcount
9186 );
9187 default:
9188 assert(0);
9189 return NULL;
9190 }
9191
9192 if (PyUnicode_READY(substring) == -1)
9193 return NULL;
9194
9195 kind1 = PyUnicode_KIND(self);
9196 kind2 = PyUnicode_KIND(substring);
9197 kind = kind1 > kind2 ? kind1 : kind2;
9198 buf1 = PyUnicode_DATA(self);
9199 buf2 = PyUnicode_DATA(substring);
9200 if (kind1 != kind)
9201 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9202 if (!buf1)
9203 return NULL;
9204 if (kind2 != kind)
9205 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9206 if (!buf2) {
9207 if (kind1 != kind) PyMem_Free(buf1);
9208 return NULL;
9209 }
9210 len1 = PyUnicode_GET_LENGTH(self);
9211 len2 = PyUnicode_GET_LENGTH(substring);
9212
9213 switch(kind) {
9214 case PyUnicode_1BYTE_KIND:
9215 out = ucs1lib_rsplit(
9216 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9217 break;
9218 case PyUnicode_2BYTE_KIND:
9219 out = ucs2lib_rsplit(
9220 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9221 break;
9222 case PyUnicode_4BYTE_KIND:
9223 out = ucs4lib_rsplit(
9224 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9225 break;
9226 default:
9227 out = NULL;
9228 }
9229 if (kind1 != kind)
9230 PyMem_Free(buf1);
9231 if (kind2 != kind)
9232 PyMem_Free(buf2);
9233 return out;
9234}
9235
9236static Py_ssize_t
9237anylib_find(int kind, void *buf1, Py_ssize_t len1,
9238 void *buf2, Py_ssize_t len2, Py_ssize_t offset)
9239{
9240 switch(kind) {
9241 case PyUnicode_1BYTE_KIND:
9242 return ucs1lib_find(buf1, len1, buf2, len2, offset);
9243 case PyUnicode_2BYTE_KIND:
9244 return ucs2lib_find(buf1, len1, buf2, len2, offset);
9245 case PyUnicode_4BYTE_KIND:
9246 return ucs4lib_find(buf1, len1, buf2, len2, offset);
9247 }
9248 assert(0);
9249 return -1;
9250}
9251
9252static Py_ssize_t
9253anylib_count(int kind, void* sbuf, Py_ssize_t slen,
9254 void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
9255{
9256 switch(kind) {
9257 case PyUnicode_1BYTE_KIND:
9258 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
9259 case PyUnicode_2BYTE_KIND:
9260 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
9261 case PyUnicode_4BYTE_KIND:
9262 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
9263 }
9264 assert(0);
9265 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009266}
9267
Alexander Belopolsky40018472011-02-26 01:02:56 +00009268static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009269replace(PyObject *self, PyObject *str1,
9270 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009271{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009272 PyObject *u;
9273 char *sbuf = PyUnicode_DATA(self);
9274 char *buf1 = PyUnicode_DATA(str1);
9275 char *buf2 = PyUnicode_DATA(str2);
9276 int srelease = 0, release1 = 0, release2 = 0;
9277 int skind = PyUnicode_KIND(self);
9278 int kind1 = PyUnicode_KIND(str1);
9279 int kind2 = PyUnicode_KIND(str2);
9280 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
9281 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
9282 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009283
9284 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009285 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009286 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009287 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009288
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009289 if (skind < kind1)
9290 /* substring too wide to be present */
9291 goto nothing;
9292
9293 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00009294 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009295 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009296 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009297 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009298 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009299 /* replace characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009300 Py_UCS4 u1, u2, maxchar;
9301 int mayshrink, rkind;
9302 u1 = PyUnicode_READ_CHAR(str1, 0);
9303 if (!findchar(sbuf, PyUnicode_KIND(self),
9304 slen, u1, 1))
Thomas Wouters477c8d52006-05-27 19:21:47 +00009305 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009306 u2 = PyUnicode_READ_CHAR(str2, 0);
9307 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9308 /* Replacing u1 with u2 may cause a maxchar reduction in the
9309 result string. */
9310 mayshrink = maxchar > 127;
9311 if (u2 > maxchar) {
9312 maxchar = u2;
9313 mayshrink = 0;
9314 }
9315 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009316 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009317 goto error;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009318 if (PyUnicode_CopyCharacters(u, 0,
9319 (PyObject*)self, 0, slen) < 0)
9320 {
9321 Py_DECREF(u);
9322 return NULL;
9323 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009324 rkind = PyUnicode_KIND(u);
9325 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
9326 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009327 if (--maxcount < 0)
9328 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009329 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009330 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009331 if (mayshrink) {
9332 PyObject *tmp = u;
9333 u = PyUnicode_FromKindAndData(rkind, PyUnicode_DATA(tmp),
9334 PyUnicode_GET_LENGTH(tmp));
9335 Py_DECREF(tmp);
9336 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009337 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009338 int rkind = skind;
9339 char *res;
9340 if (kind1 < rkind) {
9341 /* widen substring */
9342 buf1 = _PyUnicode_AsKind(str1, rkind);
9343 if (!buf1) goto error;
9344 release1 = 1;
9345 }
9346 i = anylib_find(rkind, sbuf, slen, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009347 if (i < 0)
9348 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009349 if (rkind > kind2) {
9350 /* widen replacement */
9351 buf2 = _PyUnicode_AsKind(str2, rkind);
9352 if (!buf2) goto error;
9353 release2 = 1;
9354 }
9355 else if (rkind < kind2) {
9356 /* widen self and buf1 */
9357 rkind = kind2;
9358 if (release1) PyMem_Free(buf1);
9359 sbuf = _PyUnicode_AsKind(self, rkind);
9360 if (!sbuf) goto error;
9361 srelease = 1;
9362 buf1 = _PyUnicode_AsKind(str1, rkind);
9363 if (!buf1) goto error;
9364 release1 = 1;
9365 }
9366 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, slen));
9367 if (!res) {
9368 PyErr_NoMemory();
9369 goto error;
9370 }
9371 memcpy(res, sbuf, PyUnicode_KIND_SIZE(rkind, slen));
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009372 /* change everything in-place, starting with this one */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009373 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
9374 buf2,
9375 PyUnicode_KIND_SIZE(rkind, len2));
9376 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009377
9378 while ( --maxcount > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009379 i = anylib_find(rkind, sbuf+PyUnicode_KIND_SIZE(rkind, i),
9380 slen-i,
9381 buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009382 if (i == -1)
9383 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009384 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
9385 buf2,
9386 PyUnicode_KIND_SIZE(rkind, len2));
9387 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009388 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009389
9390 u = PyUnicode_FromKindAndData(rkind, res, slen);
9391 PyMem_Free(res);
9392 if (!u) goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009393 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009394 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009395
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009396 Py_ssize_t n, i, j, ires;
9397 Py_ssize_t product, new_size;
9398 int rkind = skind;
9399 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009400
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009401 if (kind1 < rkind) {
9402 buf1 = _PyUnicode_AsKind(str1, rkind);
9403 if (!buf1) goto error;
9404 release1 = 1;
9405 }
9406 n = anylib_count(rkind, sbuf, slen, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009407 if (n == 0)
9408 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009409 if (kind2 < rkind) {
9410 buf2 = _PyUnicode_AsKind(str2, rkind);
9411 if (!buf2) goto error;
9412 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009413 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009414 else if (kind2 > rkind) {
9415 rkind = kind2;
9416 sbuf = _PyUnicode_AsKind(self, rkind);
9417 if (!sbuf) goto error;
9418 srelease = 1;
9419 if (release1) PyMem_Free(buf1);
9420 buf1 = _PyUnicode_AsKind(str1, rkind);
9421 if (!buf1) goto error;
9422 release1 = 1;
9423 }
9424 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
9425 PyUnicode_GET_LENGTH(str1))); */
9426 product = n * (len2-len1);
9427 if ((product / (len2-len1)) != n) {
9428 PyErr_SetString(PyExc_OverflowError,
9429 "replace string is too long");
9430 goto error;
9431 }
9432 new_size = slen + product;
9433 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
9434 PyErr_SetString(PyExc_OverflowError,
9435 "replace string is too long");
9436 goto error;
9437 }
9438 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, new_size));
9439 if (!res)
9440 goto error;
9441 ires = i = 0;
9442 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009443 while (n-- > 0) {
9444 /* look for next match */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009445 j = anylib_find(rkind,
9446 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9447 slen-i, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009448 if (j == -1)
9449 break;
9450 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009451 /* copy unchanged part [i:j] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009452 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9453 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9454 PyUnicode_KIND_SIZE(rkind, j-i));
9455 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009456 }
9457 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009458 if (len2 > 0) {
9459 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9460 buf2,
9461 PyUnicode_KIND_SIZE(rkind, len2));
9462 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009463 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009464 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009465 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009466 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009467 /* copy tail [i:] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009468 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9469 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9470 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009471 } else {
9472 /* interleave */
9473 while (n > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009474 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9475 buf2,
9476 PyUnicode_KIND_SIZE(rkind, len2));
9477 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009478 if (--n <= 0)
9479 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009480 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9481 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9482 PyUnicode_KIND_SIZE(rkind, 1));
9483 ires++;
9484 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009485 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009486 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9487 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9488 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009489 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009490 u = PyUnicode_FromKindAndData(rkind, res, new_size);
Martin v. Löwis0b1d3482011-10-01 16:35:40 +02009491 PyMem_Free(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009492 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009493 if (srelease)
9494 PyMem_FREE(sbuf);
9495 if (release1)
9496 PyMem_FREE(buf1);
9497 if (release2)
9498 PyMem_FREE(buf2);
9499 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009500
Benjamin Peterson29060642009-01-31 22:14:21 +00009501 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00009502 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009503 if (srelease)
9504 PyMem_FREE(sbuf);
9505 if (release1)
9506 PyMem_FREE(buf1);
9507 if (release2)
9508 PyMem_FREE(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009509 if (PyUnicode_CheckExact(self)) {
9510 Py_INCREF(self);
9511 return (PyObject *) self;
9512 }
Victor Stinner034f6cf2011-09-30 02:26:44 +02009513 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009514 error:
9515 if (srelease && sbuf)
9516 PyMem_FREE(sbuf);
9517 if (release1 && buf1)
9518 PyMem_FREE(buf1);
9519 if (release2 && buf2)
9520 PyMem_FREE(buf2);
9521 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009522}
9523
9524/* --- Unicode Object Methods --------------------------------------------- */
9525
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009526PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009527 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009528\n\
9529Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009530characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009531
9532static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009533unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009534{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009535 return fixup(self, fixtitle);
9536}
9537
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009538PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009539 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009540\n\
9541Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00009542have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009543
9544static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009545unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009546{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009547 return fixup(self, fixcapitalize);
9548}
9549
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled implementation of str.capwords(): split self on whitespace,
   replace every word by its fixcapitalize'd form, then join the words with
   the default (blank) separator.  Returns NULL if any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *word = PyList_GET_ITEM(words, idx);
        PyObject *capitalized = fixup((PyUnicodeObject *)word,
                                      fixcapitalize);
        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        /* Release the list's old entry before the slot is overwritten. */
        Py_DECREF(word);
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
9587
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009588/* Argument converter. Coerces to a single unicode character */
9589
9590static int
9591convert_uc(PyObject *obj, void *addr)
9592{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009593 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009594 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009595
Benjamin Peterson14339b62009-01-31 16:36:08 +00009596 uniobj = PyUnicode_FromObject(obj);
9597 if (uniobj == NULL) {
9598 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009599 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009600 return 0;
9601 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009602 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009603 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009604 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009605 Py_DECREF(uniobj);
9606 return 0;
9607 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009608 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009609 Py_DECREF(uniobj);
9610 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009611}
9612
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009613PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009614 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009615\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009616Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009617done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009618
9619static PyObject *
9620unicode_center(PyUnicodeObject *self, PyObject *args)
9621{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009622 Py_ssize_t marg, left;
9623 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009624 Py_UCS4 fillchar = ' ';
9625
Victor Stinnere9a29352011-10-01 02:14:59 +02009626 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009627 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009628
Victor Stinnere9a29352011-10-01 02:14:59 +02009629 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009630 return NULL;
9631
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009632 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009633 Py_INCREF(self);
9634 return (PyObject*) self;
9635 }
9636
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009637 marg = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009638 left = marg / 2 + (marg & width & 1);
9639
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009640 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009641}
9642
Marc-André Lemburge5034372000-08-08 08:04:29 +00009643#if 0
9644
9645/* This code should go into some future Unicode collation support
9646 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00009647 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00009648
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009649/* speedy UTF-16 code point order comparison */
9650/* gleaned from: */
9651/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
9652
/* Fixup offsets for the disabled (#if 0) UTF-16 comparison below, indexed
   by (code unit >> 11), i.e. one entry per block of 0x800 code units.
   Entry 27 (units 0xD800-0xDFFF) adds 0x2000; entries 28-31 (units
   0xE000-0xFFFF) subtract 0x800; every other entry is 0. */
static short utf16Fixup[32] =
{
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
};
9660
/* Disabled (#if 0) variant of unicode_compare() that walks two Py_UNICODE
   buffers and remaps each unit through utf16Fixup before comparing.
   Returns -1, 0 or 1.
   NOTE(review): this accesses str1->str and _base._base.length directly;
   whether those fields still exist in the current object layout cannot be
   confirmed from here, and the code is never compiled. */
static int
unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
{
    Py_ssize_t len1, len2;

    Py_UNICODE *s1 = str1->str;
    Py_UNICODE *s2 = str2->str;

    len1 = str1->_base._base.length;
    len2 = str2->_base._base.length;

    while (len1 > 0 && len2 > 0) {
        Py_UNICODE c1, c2;

        c1 = *s1++;
        c2 = *s2++;

        /* (1<<11) * 26 == 0xD000: only units above that can have a
           non-zero entry in utf16Fixup. */
        if (c1 > (1<<11) * 26)
            c1 += utf16Fixup[c1>>11];
        if (c2 > (1<<11) * 26)
            c2 += utf16Fixup[c2>>11];
        /* now c1 and c2 are in UTF-32-compatible order */

        if (c1 != c2)
            return (c1 < c2) ? -1 : 1;

        len1--; len2--;
    }

    /* Common prefix is equal: the shorter string sorts first. */
    return (len1 < len2) ? -1 : (len1 != len2);
}
9692
Marc-André Lemburge5034372000-08-08 08:04:29 +00009693#else
9694
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009695/* This function assumes that str1 and str2 are readied by the caller. */
9696
Marc-André Lemburge5034372000-08-08 08:04:29 +00009697static int
9698unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9699{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009700 int kind1, kind2;
9701 void *data1, *data2;
9702 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009703
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009704 kind1 = PyUnicode_KIND(str1);
9705 kind2 = PyUnicode_KIND(str2);
9706 data1 = PyUnicode_DATA(str1);
9707 data2 = PyUnicode_DATA(str2);
9708 len1 = PyUnicode_GET_LENGTH(str1);
9709 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +00009710
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009711 for (i = 0; i < len1 && i < len2; ++i) {
9712 Py_UCS4 c1, c2;
9713 c1 = PyUnicode_READ(kind1, data1, i);
9714 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +00009715
9716 if (c1 != c2)
9717 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009718 }
9719
9720 return (len1 < len2) ? -1 : (len1 != len2);
9721}
9722
9723#endif
9724
Alexander Belopolsky40018472011-02-26 01:02:56 +00009725int
9726PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009727{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009728 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9729 if (PyUnicode_READY(left) == -1 ||
9730 PyUnicode_READY(right) == -1)
9731 return -1;
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009732 return unicode_compare((PyUnicodeObject *)left,
9733 (PyUnicodeObject *)right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009734 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009735 PyErr_Format(PyExc_TypeError,
9736 "Can't compare %.100s and %.100s",
9737 left->ob_type->tp_name,
9738 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009739 return -1;
9740}
9741
Martin v. Löwis5b222132007-06-10 09:51:05 +00009742int
9743PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
9744{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009745 Py_ssize_t i;
9746 int kind;
9747 void *data;
9748 Py_UCS4 chr;
9749
Victor Stinner910337b2011-10-03 03:20:16 +02009750 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009751 if (PyUnicode_READY(uni) == -1)
9752 return -1;
9753 kind = PyUnicode_KIND(uni);
9754 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +00009755 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009756 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
9757 if (chr != str[i])
9758 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00009759 /* This check keeps Python strings that end in '\0' from comparing equal
9760 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009761 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +00009762 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009763 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00009764 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009765 return 0;
9766}
9767
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009768
Benjamin Peterson29060642009-01-31 22:14:21 +00009769#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00009770 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009771
Alexander Belopolsky40018472011-02-26 01:02:56 +00009772PyObject *
9773PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009774{
9775 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009776
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009777 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9778 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009779 if (PyUnicode_READY(left) == -1 ||
9780 PyUnicode_READY(right) == -1)
9781 return NULL;
9782 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
9783 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009784 if (op == Py_EQ) {
9785 Py_INCREF(Py_False);
9786 return Py_False;
9787 }
9788 if (op == Py_NE) {
9789 Py_INCREF(Py_True);
9790 return Py_True;
9791 }
9792 }
9793 if (left == right)
9794 result = 0;
9795 else
9796 result = unicode_compare((PyUnicodeObject *)left,
9797 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009798
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009799 /* Convert the return value to a Boolean */
9800 switch (op) {
9801 case Py_EQ:
9802 v = TEST_COND(result == 0);
9803 break;
9804 case Py_NE:
9805 v = TEST_COND(result != 0);
9806 break;
9807 case Py_LE:
9808 v = TEST_COND(result <= 0);
9809 break;
9810 case Py_GE:
9811 v = TEST_COND(result >= 0);
9812 break;
9813 case Py_LT:
9814 v = TEST_COND(result == -1);
9815 break;
9816 case Py_GT:
9817 v = TEST_COND(result == 1);
9818 break;
9819 default:
9820 PyErr_BadArgument();
9821 return NULL;
9822 }
9823 Py_INCREF(v);
9824 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009825 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00009826
Brian Curtindfc80e32011-08-10 20:28:54 -05009827 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009828}
9829
Alexander Belopolsky40018472011-02-26 01:02:56 +00009830int
9831PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00009832{
Thomas Wouters477c8d52006-05-27 19:21:47 +00009833 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009834 int kind1, kind2, kind;
9835 void *buf1, *buf2;
9836 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009837 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009838
9839 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00009840 sub = PyUnicode_FromObject(element);
9841 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009842 PyErr_Format(PyExc_TypeError,
9843 "'in <string>' requires string as left operand, not %s",
9844 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009845 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009846 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009847 if (PyUnicode_READY(sub) == -1)
9848 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009849
Thomas Wouters477c8d52006-05-27 19:21:47 +00009850 str = PyUnicode_FromObject(container);
Victor Stinnere9a29352011-10-01 02:14:59 +02009851 if (!str || PyUnicode_READY(str) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009852 Py_DECREF(sub);
9853 return -1;
9854 }
9855
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009856 kind1 = PyUnicode_KIND(str);
9857 kind2 = PyUnicode_KIND(sub);
9858 kind = kind1 > kind2 ? kind1 : kind2;
9859 buf1 = PyUnicode_DATA(str);
9860 buf2 = PyUnicode_DATA(sub);
9861 if (kind1 != kind)
9862 buf1 = _PyUnicode_AsKind((PyObject*)str, kind);
9863 if (!buf1) {
9864 Py_DECREF(sub);
9865 return -1;
9866 }
9867 if (kind2 != kind)
9868 buf2 = _PyUnicode_AsKind((PyObject*)sub, kind);
9869 if (!buf2) {
9870 Py_DECREF(sub);
9871 if (kind1 != kind) PyMem_Free(buf1);
9872 return -1;
9873 }
9874 len1 = PyUnicode_GET_LENGTH(str);
9875 len2 = PyUnicode_GET_LENGTH(sub);
9876
9877 switch(kind) {
9878 case PyUnicode_1BYTE_KIND:
9879 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
9880 break;
9881 case PyUnicode_2BYTE_KIND:
9882 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
9883 break;
9884 case PyUnicode_4BYTE_KIND:
9885 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
9886 break;
9887 default:
9888 result = -1;
9889 assert(0);
9890 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009891
9892 Py_DECREF(str);
9893 Py_DECREF(sub);
9894
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009895 if (kind1 != kind)
9896 PyMem_Free(buf1);
9897 if (kind2 != kind)
9898 PyMem_Free(buf2);
9899
Guido van Rossum403d68b2000-03-13 15:55:09 +00009900 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009901}
9902
Guido van Rossumd57fd912000-03-10 22:53:23 +00009903/* Concat to string or Unicode object giving a new Unicode object. */
9904
Alexander Belopolsky40018472011-02-26 01:02:56 +00009905PyObject *
9906PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009907{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009908 PyObject *u = NULL, *v = NULL, *w;
9909 Py_UCS4 maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009910
9911 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009912 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009913 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009914 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009915 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009916 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009917 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009918
9919 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +02009920 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009921 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009922 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009923 }
Victor Stinnera464fc12011-10-02 20:39:30 +02009924 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009925 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009926 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009927 }
9928
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009929 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinnerff9e50f2011-09-28 22:17:19 +02009930 maxchar = Py_MAX(maxchar, PyUnicode_MAX_CHAR_VALUE(v));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009931
Guido van Rossumd57fd912000-03-10 22:53:23 +00009932 /* Concat the two Unicode strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009933 w = PyUnicode_New(
9934 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
9935 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009936 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009937 goto onError;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009938 if (PyUnicode_CopyCharacters(w, 0, u, 0, PyUnicode_GET_LENGTH(u)) < 0)
9939 goto onError;
Victor Stinner157f83f2011-09-28 21:41:31 +02009940 if (PyUnicode_CopyCharacters(w, PyUnicode_GET_LENGTH(u),
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009941 v, 0,
9942 PyUnicode_GET_LENGTH(v)) < 0)
9943 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009944 Py_DECREF(u);
9945 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009946 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009947
Benjamin Peterson29060642009-01-31 22:14:21 +00009948 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009949 Py_XDECREF(u);
9950 Py_XDECREF(v);
9951 return NULL;
9952}
9953
Walter Dörwald1ab83302007-05-18 17:15:44 +00009954void
Victor Stinner23e56682011-10-03 03:54:37 +02009955PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +00009956{
Victor Stinner23e56682011-10-03 03:54:37 +02009957 PyObject *left, *res;
9958
9959 if (p_left == NULL) {
9960 if (!PyErr_Occurred())
9961 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +00009962 return;
9963 }
Victor Stinner23e56682011-10-03 03:54:37 +02009964 left = *p_left;
9965 if (right == NULL || !PyUnicode_Check(left)) {
9966 if (!PyErr_Occurred())
9967 PyErr_BadInternalCall();
9968 goto error;
9969 }
9970
9971 if (PyUnicode_CheckExact(left) && left != unicode_empty
9972 && PyUnicode_CheckExact(right) && right != unicode_empty
9973 && unicode_resizable(left)
9974 && (_PyUnicode_KIND(right) <= _PyUnicode_KIND(left)
9975 || _PyUnicode_WSTR(left) != NULL))
9976 {
Victor Stinnerb8038952011-10-03 23:27:56 +02009977 Py_ssize_t left_len, right_len, new_len;
9978#ifdef Py_DEBUG
9979 Py_ssize_t copied;
9980#endif
Victor Stinner23e56682011-10-03 03:54:37 +02009981
Victor Stinner23e56682011-10-03 03:54:37 +02009982 if (PyUnicode_READY(left))
9983 goto error;
9984 if (PyUnicode_READY(right))
9985 goto error;
9986
9987 /* FIXME: support ascii+latin1, PyASCIIObject => PyCompactUnicodeObject */
9988 if (PyUnicode_MAX_CHAR_VALUE(right) <= PyUnicode_MAX_CHAR_VALUE(left))
9989 {
Victor Stinnerb8038952011-10-03 23:27:56 +02009990 left_len = PyUnicode_GET_LENGTH(left);
9991 right_len = PyUnicode_GET_LENGTH(right);
9992 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner23e56682011-10-03 03:54:37 +02009993 PyErr_SetString(PyExc_OverflowError,
9994 "strings are too large to concat");
9995 goto error;
9996 }
Victor Stinnerb8038952011-10-03 23:27:56 +02009997 new_len = left_len + right_len;
Victor Stinner23e56682011-10-03 03:54:37 +02009998
9999 /* Now we own the last reference to 'left', so we can resize it
10000 * in-place.
10001 */
10002 if (unicode_resize(&left, new_len) != 0) {
10003 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10004 * deallocated so it cannot be put back into
10005 * 'variable'. The MemoryError is raised when there
10006 * is no value in 'variable', which might (very
10007 * remotely) be a cause of incompatibilities.
10008 */
10009 goto error;
10010 }
10011 /* copy 'right' into the newly allocated area of 'left' */
Victor Stinnerb8038952011-10-03 23:27:56 +020010012#ifdef Py_DEBUG
10013 copied = PyUnicode_CopyCharacters(left, left_len,
Victor Stinner23e56682011-10-03 03:54:37 +020010014 right, 0,
Victor Stinnerb8038952011-10-03 23:27:56 +020010015 right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020010016 assert(0 <= copied);
Victor Stinnerb8038952011-10-03 23:27:56 +020010017#else
10018 PyUnicode_CopyCharacters(left, left_len, right, 0, right_len);
10019#endif
Victor Stinner23e56682011-10-03 03:54:37 +020010020 *p_left = left;
10021 return;
10022 }
10023 }
10024
10025 res = PyUnicode_Concat(left, right);
10026 if (res == NULL)
10027 goto error;
10028 Py_DECREF(left);
10029 *p_left = res;
10030 return;
10031
10032error:
10033 Py_DECREF(*p_left);
10034 *p_left = NULL;
Walter Dörwald1ab83302007-05-18 17:15:44 +000010035}
10036
/* Same as PyUnicode_Append(), but additionally releases the caller's
   reference to `right` afterwards.  `right` may be NULL (Py_XDECREF). */
void
PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
{
    PyUnicode_Append(pleft, right);
    /* Released only after the append has finished using it. */
    Py_XDECREF(right);
}
10043
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010044PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010045 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010046\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010047Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010048string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010049interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010050
10051static PyObject *
10052unicode_count(PyUnicodeObject *self, PyObject *args)
10053{
10054 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010055 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010056 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010057 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010058 int kind1, kind2, kind;
10059 void *buf1, *buf2;
10060 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010061
Jesus Ceaac451502011-04-20 17:09:23 +020010062 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10063 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010064 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010065
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010066 kind1 = PyUnicode_KIND(self);
10067 kind2 = PyUnicode_KIND(substring);
10068 kind = kind1 > kind2 ? kind1 : kind2;
10069 buf1 = PyUnicode_DATA(self);
10070 buf2 = PyUnicode_DATA(substring);
10071 if (kind1 != kind)
10072 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
10073 if (!buf1) {
10074 Py_DECREF(substring);
10075 return NULL;
10076 }
10077 if (kind2 != kind)
10078 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
10079 if (!buf2) {
10080 Py_DECREF(substring);
10081 if (kind1 != kind) PyMem_Free(buf1);
10082 return NULL;
10083 }
10084 len1 = PyUnicode_GET_LENGTH(self);
10085 len2 = PyUnicode_GET_LENGTH(substring);
10086
10087 ADJUST_INDICES(start, end, len1);
10088 switch(kind) {
10089 case PyUnicode_1BYTE_KIND:
10090 iresult = ucs1lib_count(
10091 ((Py_UCS1*)buf1) + start, end - start,
10092 buf2, len2, PY_SSIZE_T_MAX
10093 );
10094 break;
10095 case PyUnicode_2BYTE_KIND:
10096 iresult = ucs2lib_count(
10097 ((Py_UCS2*)buf1) + start, end - start,
10098 buf2, len2, PY_SSIZE_T_MAX
10099 );
10100 break;
10101 case PyUnicode_4BYTE_KIND:
10102 iresult = ucs4lib_count(
10103 ((Py_UCS4*)buf1) + start, end - start,
10104 buf2, len2, PY_SSIZE_T_MAX
10105 );
10106 break;
10107 default:
10108 assert(0); iresult = 0;
10109 }
10110
10111 result = PyLong_FromSsize_t(iresult);
10112
10113 if (kind1 != kind)
10114 PyMem_Free(buf1);
10115 if (kind2 != kind)
10116 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010117
10118 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010119
Guido van Rossumd57fd912000-03-10 22:53:23 +000010120 return result;
10121}
10122
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010123PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010124 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010125\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010126Encode S using the codec registered for encoding. Default encoding\n\
10127is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010128handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010129a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10130'xmlcharrefreplace' as well as any other name registered with\n\
10131codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010132
10133static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +000010134unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010135{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010136 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010137 char *encoding = NULL;
10138 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010139
Benjamin Peterson308d6372009-09-18 21:42:35 +000010140 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10141 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010142 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +000010143 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010144}
10145
PyDoc_STRVAR(expandtabs__doc__,
             "S.expandtabs([tabsize]) -> str\n\
\n\
Return a copy of S where all tab characters are expanded using spaces.\n\
If tabsize is not given, a tab size of 8 characters is assumed.");

/* str.expandtabs([tabsize]).

   Works on the Py_UNICODE (wstr) representation in two passes: the first
   computes the exact output length with overflow checks, the second
   allocates the result and fills it.  A tab advances the column to the
   next multiple of tabsize; with tabsize <= 0 tabs produce no output.
   '\n' and '\r' reset the column.  Returns a new string, or NULL with an
   exception set (OverflowError if the result would be too long). */
static PyObject*
unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
{
    Py_UNICODE *e;
    Py_UNICODE *p;
    Py_UNICODE *q;
    Py_UNICODE *qe;
    Py_ssize_t i, j, incr, wstr_length;
    PyUnicodeObject *u;
    int tabsize = 8;

    if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
        return NULL;

    /* Called for its side effects: fills in wstr_length and fails early
       if no Py_UNICODE buffer can be obtained.  The buffer itself is
       read through _PyUnicode_WSTR(self) below. */
    if (PyUnicode_AsUnicodeAndSize((PyObject *)self, &wstr_length) == NULL)
        return NULL;

    /* First pass: determine size of output string */
    i = 0; /* chars up to and including most recent \n or \r */
    j = 0; /* chars since most recent \n or \r (use in tab calculations) */
    e = _PyUnicode_WSTR(self) + wstr_length; /* end of input */
    for (p = _PyUnicode_WSTR(self); p < e; p++)
        if (*p == '\t') {
            if (tabsize > 0) {
                incr = tabsize - (j % tabsize); /* cannot overflow */
                if (j > PY_SSIZE_T_MAX - incr)
                    goto overflow1;
                j += incr;
            }
        }
        else {
            if (j > PY_SSIZE_T_MAX - 1)
                goto overflow1;
            j++;
            if (*p == '\n' || *p == '\r') {
                /* Line finished: fold its length into the total. */
                if (i > PY_SSIZE_T_MAX - j)
                    goto overflow1;
                i += j;
                j = 0;
            }
        }

    /* i + j is the total output length; make sure the sum is safe. */
    if (i > PY_SSIZE_T_MAX - j)
        goto overflow1;

    /* Second pass: create output string and fill it */
    u = _PyUnicode_New(i + j);
    if (!u)
        return NULL;

    j = 0; /* same as in first pass */
    q = _PyUnicode_WSTR(u); /* next output char */
    qe = _PyUnicode_WSTR(u) + PyUnicode_GET_SIZE(u); /* end of output */

    for (p = _PyUnicode_WSTR(self); p < e; p++)
        if (*p == '\t') {
            if (tabsize > 0) {
                /* i is reused here as the number of spaces to emit. */
                i = tabsize - (j % tabsize);
                j += i;
                while (i--) {
                    /* Bounds check against the size computed in pass 1. */
                    if (q >= qe)
                        goto overflow2;
                    *q++ = ' ';
                }
            }
        }
        else {
            if (q >= qe)
                goto overflow2;
            *q++ = *p;
            j++;
            if (*p == '\n' || *p == '\r')
                j = 0;
        }

    if (_PyUnicode_READY_REPLACE(&u)) {
        Py_DECREF(u);
        return NULL;
    }
    return (PyObject*) u;

  /* overflow2 releases the partially filled result, then falls through
     to overflow1, which raises the error. */
  overflow2:
    Py_DECREF(u);
  overflow1:
    PyErr_SetString(PyExc_OverflowError, "new string is too long");
    return NULL;
}
10239
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010240PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010241 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010242\n\
10243Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010244such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010245arguments start and end are interpreted as in slice notation.\n\
10246\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010247Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010248
10249static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010250unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010251{
Jesus Ceaac451502011-04-20 17:09:23 +020010252 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010253 Py_ssize_t start;
10254 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010255 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010256
Jesus Ceaac451502011-04-20 17:09:23 +020010257 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
10258 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010259 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010260
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010261 if (PyUnicode_READY(self) == -1)
10262 return NULL;
10263 if (PyUnicode_READY(substring) == -1)
10264 return NULL;
10265
10266 result = any_find_slice(
10267 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
10268 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010269 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010270
10271 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010272
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010273 if (result == -2)
10274 return NULL;
10275
Christian Heimes217cfd12007-12-02 14:31:20 +000010276 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010277}
10278
10279static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010280unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010281{
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010282 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
10283 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010284 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010285 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010286}
10287
Guido van Rossumc2504932007-09-18 19:42:40 +000010288/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010010289 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000010290static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +000010291unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010292{
Guido van Rossumc2504932007-09-18 19:42:40 +000010293 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010010294 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010295
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010296 if (_PyUnicode_HASH(self) != -1)
10297 return _PyUnicode_HASH(self);
10298 if (PyUnicode_READY(self) == -1)
10299 return -1;
10300 len = PyUnicode_GET_LENGTH(self);
10301
10302 /* The hash function as a macro, gets expanded three times below. */
10303#define HASH(P) \
10304 x = (Py_uhash_t)*P << 7; \
10305 while (--len >= 0) \
10306 x = (1000003*x) ^ (Py_uhash_t)*P++;
10307
10308 switch (PyUnicode_KIND(self)) {
10309 case PyUnicode_1BYTE_KIND: {
10310 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
10311 HASH(c);
10312 break;
10313 }
10314 case PyUnicode_2BYTE_KIND: {
10315 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
10316 HASH(s);
10317 break;
10318 }
10319 default: {
10320 Py_UCS4 *l;
10321 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
10322 "Impossible switch case in unicode_hash");
10323 l = PyUnicode_4BYTE_DATA(self);
10324 HASH(l);
10325 break;
10326 }
10327 }
10328 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
10329
Guido van Rossumc2504932007-09-18 19:42:40 +000010330 if (x == -1)
10331 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010332 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010333 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010334}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010335#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000010336
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010337PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010338 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010339\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010340Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010341
10342static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010343unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010344{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010345 Py_ssize_t result;
Jesus Ceaac451502011-04-20 17:09:23 +020010346 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010347 Py_ssize_t start;
10348 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010349
Jesus Ceaac451502011-04-20 17:09:23 +020010350 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
10351 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010352 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010353
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010354 if (PyUnicode_READY(self) == -1)
10355 return NULL;
10356 if (PyUnicode_READY(substring) == -1)
10357 return NULL;
10358
10359 result = any_find_slice(
10360 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
10361 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010362 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010363
10364 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010365
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010366 if (result == -2)
10367 return NULL;
10368
Guido van Rossumd57fd912000-03-10 22:53:23 +000010369 if (result < 0) {
10370 PyErr_SetString(PyExc_ValueError, "substring not found");
10371 return NULL;
10372 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010373
Christian Heimes217cfd12007-12-02 14:31:20 +000010374 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010375}
10376
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010377PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010378 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010379\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010380Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010381at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010382
10383static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010384unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010385{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010386 Py_ssize_t i, length;
10387 int kind;
10388 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010389 int cased;
10390
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010391 if (PyUnicode_READY(self) == -1)
10392 return NULL;
10393 length = PyUnicode_GET_LENGTH(self);
10394 kind = PyUnicode_KIND(self);
10395 data = PyUnicode_DATA(self);
10396
Guido van Rossumd57fd912000-03-10 22:53:23 +000010397 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010398 if (length == 1)
10399 return PyBool_FromLong(
10400 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010401
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010402 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010403 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010404 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010405
Guido van Rossumd57fd912000-03-10 22:53:23 +000010406 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010407 for (i = 0; i < length; i++) {
10408 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010409
Benjamin Peterson29060642009-01-31 22:14:21 +000010410 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
10411 return PyBool_FromLong(0);
10412 else if (!cased && Py_UNICODE_ISLOWER(ch))
10413 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010414 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010415 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010416}
10417
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010418PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010419 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010420\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010421Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010422at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010423
10424static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010425unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010426{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010427 Py_ssize_t i, length;
10428 int kind;
10429 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010430 int cased;
10431
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010432 if (PyUnicode_READY(self) == -1)
10433 return NULL;
10434 length = PyUnicode_GET_LENGTH(self);
10435 kind = PyUnicode_KIND(self);
10436 data = PyUnicode_DATA(self);
10437
Guido van Rossumd57fd912000-03-10 22:53:23 +000010438 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010439 if (length == 1)
10440 return PyBool_FromLong(
10441 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010442
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010443 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010444 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010445 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010446
Guido van Rossumd57fd912000-03-10 22:53:23 +000010447 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010448 for (i = 0; i < length; i++) {
10449 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010450
Benjamin Peterson29060642009-01-31 22:14:21 +000010451 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
10452 return PyBool_FromLong(0);
10453 else if (!cased && Py_UNICODE_ISUPPER(ch))
10454 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010455 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010456 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010457}
10458
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010459PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010460 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010461\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010462Return True if S is a titlecased string and there is at least one\n\
10463character in S, i.e. upper- and titlecase characters may only\n\
10464follow uncased characters and lowercase characters only cased ones.\n\
10465Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010466
10467static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010468unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010469{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010470 Py_ssize_t i, length;
10471 int kind;
10472 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010473 int cased, previous_is_cased;
10474
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010475 if (PyUnicode_READY(self) == -1)
10476 return NULL;
10477 length = PyUnicode_GET_LENGTH(self);
10478 kind = PyUnicode_KIND(self);
10479 data = PyUnicode_DATA(self);
10480
Guido van Rossumd57fd912000-03-10 22:53:23 +000010481 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010482 if (length == 1) {
10483 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10484 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
10485 (Py_UNICODE_ISUPPER(ch) != 0));
10486 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010487
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010488 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010489 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010490 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010491
Guido van Rossumd57fd912000-03-10 22:53:23 +000010492 cased = 0;
10493 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010494 for (i = 0; i < length; i++) {
10495 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010496
Benjamin Peterson29060642009-01-31 22:14:21 +000010497 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
10498 if (previous_is_cased)
10499 return PyBool_FromLong(0);
10500 previous_is_cased = 1;
10501 cased = 1;
10502 }
10503 else if (Py_UNICODE_ISLOWER(ch)) {
10504 if (!previous_is_cased)
10505 return PyBool_FromLong(0);
10506 previous_is_cased = 1;
10507 cased = 1;
10508 }
10509 else
10510 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010511 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010512 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010513}
10514
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010515PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010516 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010517\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010518Return True if all characters in S are whitespace\n\
10519and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010520
10521static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010522unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010523{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010524 Py_ssize_t i, length;
10525 int kind;
10526 void *data;
10527
10528 if (PyUnicode_READY(self) == -1)
10529 return NULL;
10530 length = PyUnicode_GET_LENGTH(self);
10531 kind = PyUnicode_KIND(self);
10532 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010533
Guido van Rossumd57fd912000-03-10 22:53:23 +000010534 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010535 if (length == 1)
10536 return PyBool_FromLong(
10537 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010538
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010539 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010540 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010541 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010542
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010543 for (i = 0; i < length; i++) {
10544 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010545 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010546 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010547 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010548 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010549}
10550
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010551PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010552 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010553\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010554Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010555and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010556
10557static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010558unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010559{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010560 Py_ssize_t i, length;
10561 int kind;
10562 void *data;
10563
10564 if (PyUnicode_READY(self) == -1)
10565 return NULL;
10566 length = PyUnicode_GET_LENGTH(self);
10567 kind = PyUnicode_KIND(self);
10568 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010569
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010570 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010571 if (length == 1)
10572 return PyBool_FromLong(
10573 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010574
10575 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010576 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010577 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010578
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010579 for (i = 0; i < length; i++) {
10580 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010581 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010582 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010583 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010584}
10585
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010586PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010587 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010588\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010589Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010590and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010591
10592static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010593unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010594{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010595 int kind;
10596 void *data;
10597 Py_ssize_t len, i;
10598
10599 if (PyUnicode_READY(self) == -1)
10600 return NULL;
10601
10602 kind = PyUnicode_KIND(self);
10603 data = PyUnicode_DATA(self);
10604 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010605
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010606 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010607 if (len == 1) {
10608 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10609 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
10610 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010611
10612 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010613 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010614 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010615
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010616 for (i = 0; i < len; i++) {
10617 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010618 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010619 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010620 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010621 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010622}
10623
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010624PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010625 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010626\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010627Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010628False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010629
10630static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010631unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010632{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010633 Py_ssize_t i, length;
10634 int kind;
10635 void *data;
10636
10637 if (PyUnicode_READY(self) == -1)
10638 return NULL;
10639 length = PyUnicode_GET_LENGTH(self);
10640 kind = PyUnicode_KIND(self);
10641 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010642
Guido van Rossumd57fd912000-03-10 22:53:23 +000010643 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010644 if (length == 1)
10645 return PyBool_FromLong(
10646 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010647
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010648 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010649 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010650 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010651
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010652 for (i = 0; i < length; i++) {
10653 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010654 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010655 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010656 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010657}
10658
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010659PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010660 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010661\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010662Return True if all characters in S are digits\n\
10663and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010664
10665static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010666unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010667{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010668 Py_ssize_t i, length;
10669 int kind;
10670 void *data;
10671
10672 if (PyUnicode_READY(self) == -1)
10673 return NULL;
10674 length = PyUnicode_GET_LENGTH(self);
10675 kind = PyUnicode_KIND(self);
10676 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010677
Guido van Rossumd57fd912000-03-10 22:53:23 +000010678 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010679 if (length == 1) {
10680 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10681 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
10682 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010683
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010684 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010685 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010686 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010687
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010688 for (i = 0; i < length; i++) {
10689 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010690 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010691 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010692 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010693}
10694
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010695PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010696 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010697\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010698Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010699False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010700
10701static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010702unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010703{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010704 Py_ssize_t i, length;
10705 int kind;
10706 void *data;
10707
10708 if (PyUnicode_READY(self) == -1)
10709 return NULL;
10710 length = PyUnicode_GET_LENGTH(self);
10711 kind = PyUnicode_KIND(self);
10712 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010713
Guido van Rossumd57fd912000-03-10 22:53:23 +000010714 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010715 if (length == 1)
10716 return PyBool_FromLong(
10717 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010718
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010719 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010720 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010721 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010722
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010723 for (i = 0; i < length; i++) {
10724 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010725 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010726 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010727 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010728}
10729
Martin v. Löwis47383402007-08-15 07:32:56 +000010730int
10731PyUnicode_IsIdentifier(PyObject *self)
10732{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010733 int kind;
10734 void *data;
10735 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010736 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000010737
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010738 if (PyUnicode_READY(self) == -1) {
10739 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000010740 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010741 }
10742
10743 /* Special case for empty strings */
10744 if (PyUnicode_GET_LENGTH(self) == 0)
10745 return 0;
10746 kind = PyUnicode_KIND(self);
10747 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000010748
10749 /* PEP 3131 says that the first character must be in
10750 XID_Start and subsequent characters in XID_Continue,
10751 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000010752 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000010753 letters, digits, underscore). However, given the current
10754 definition of XID_Start and XID_Continue, it is sufficient
10755 to check just for these, except that _ must be allowed
10756 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010757 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050010758 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000010759 return 0;
10760
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040010761 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010762 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010763 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000010764 return 1;
10765}
10766
10767PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010768 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000010769\n\
10770Return True if S is a valid identifier according\n\
10771to the language definition.");
10772
10773static PyObject*
10774unicode_isidentifier(PyObject *self)
10775{
10776 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
10777}
10778
Georg Brandl559e5d72008-06-11 18:37:52 +000010779PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010780 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000010781\n\
10782Return True if all characters in S are considered\n\
10783printable in repr() or S is empty, False otherwise.");
10784
10785static PyObject*
10786unicode_isprintable(PyObject *self)
10787{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010788 Py_ssize_t i, length;
10789 int kind;
10790 void *data;
10791
10792 if (PyUnicode_READY(self) == -1)
10793 return NULL;
10794 length = PyUnicode_GET_LENGTH(self);
10795 kind = PyUnicode_KIND(self);
10796 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000010797
10798 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010799 if (length == 1)
10800 return PyBool_FromLong(
10801 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000010802
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010803 for (i = 0; i < length; i++) {
10804 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000010805 Py_RETURN_FALSE;
10806 }
10807 }
10808 Py_RETURN_TRUE;
10809}
10810
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010811PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000010812 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010813\n\
10814Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000010815iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010816
10817static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010818unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010819{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010820 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010821}
10822
Martin v. Löwis18e16552006-02-15 17:27:45 +000010823static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +000010824unicode_length(PyUnicodeObject *self)
10825{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010826 if (PyUnicode_READY(self) == -1)
10827 return -1;
10828 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010829}
10830
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010831PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010832 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010833\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000010834Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010835done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010836
10837static PyObject *
10838unicode_ljust(PyUnicodeObject *self, PyObject *args)
10839{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010840 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010841 Py_UCS4 fillchar = ' ';
10842
10843 if (PyUnicode_READY(self) == -1)
10844 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010845
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010846 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010847 return NULL;
10848
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010849 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010850 Py_INCREF(self);
10851 return (PyObject*) self;
10852 }
10853
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010854 return (PyObject*) pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010855}
10856
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010857PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010858 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010859\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010860Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010861
10862static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010863unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010864{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010865 return fixup(self, fixlower);
10866}
10867
/* Selector values telling the strip helpers which end(s) of the string
   to trim. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for selector i: the format string with its 3-character
   "|O:" prefix skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
10876
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010877/* externally visible for str.strip(unicode) */
10878PyObject *
10879_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
10880{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010881 void *data;
10882 int kind;
10883 Py_ssize_t i, j, len;
10884 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010885
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010886 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
10887 return NULL;
10888
10889 kind = PyUnicode_KIND(self);
10890 data = PyUnicode_DATA(self);
10891 len = PyUnicode_GET_LENGTH(self);
10892 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
10893 PyUnicode_DATA(sepobj),
10894 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010895
Benjamin Peterson14339b62009-01-31 16:36:08 +000010896 i = 0;
10897 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010898 while (i < len &&
10899 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010900 i++;
10901 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010902 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010903
Benjamin Peterson14339b62009-01-31 16:36:08 +000010904 j = len;
10905 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010906 do {
10907 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010908 } while (j >= i &&
10909 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000010910 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010911 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010912
Victor Stinner12bab6d2011-10-01 01:53:49 +020010913 return PyUnicode_Substring((PyObject*)self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010914}
10915
10916PyObject*
10917PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
10918{
10919 unsigned char *data;
10920 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020010921 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010922
Victor Stinnerde636f32011-10-01 03:55:54 +020010923 if (PyUnicode_READY(self) == -1)
10924 return NULL;
10925
10926 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
10927
Victor Stinner12bab6d2011-10-01 01:53:49 +020010928 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010929 {
Victor Stinner12bab6d2011-10-01 01:53:49 +020010930 if (PyUnicode_CheckExact(self)) {
10931 Py_INCREF(self);
10932 return self;
10933 }
10934 else
10935 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010936 }
10937
Victor Stinner12bab6d2011-10-01 01:53:49 +020010938 length = end - start;
10939 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010940 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010941
Victor Stinnerde636f32011-10-01 03:55:54 +020010942 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020010943 PyErr_SetString(PyExc_IndexError, "string index out of range");
10944 return NULL;
10945 }
10946
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010947 kind = PyUnicode_KIND(self);
10948 data = PyUnicode_1BYTE_DATA(self);
Victor Stinner034f6cf2011-09-30 02:26:44 +020010949 return PyUnicode_FromKindAndData(kind,
10950 data + PyUnicode_KIND_SIZE(kind, start),
Victor Stinner12bab6d2011-10-01 01:53:49 +020010951 length);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010952}
Guido van Rossumd57fd912000-03-10 22:53:23 +000010953
10954static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010955do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010956{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010957 int kind;
10958 void *data;
10959 Py_ssize_t len, i, j;
10960
10961 if (PyUnicode_READY(self) == -1)
10962 return NULL;
10963
10964 kind = PyUnicode_KIND(self);
10965 data = PyUnicode_DATA(self);
10966 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010967
Benjamin Peterson14339b62009-01-31 16:36:08 +000010968 i = 0;
10969 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010970 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010971 i++;
10972 }
10973 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010974
Benjamin Peterson14339b62009-01-31 16:36:08 +000010975 j = len;
10976 if (striptype != LEFTSTRIP) {
10977 do {
10978 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010979 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000010980 j++;
10981 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010982
Victor Stinner12bab6d2011-10-01 01:53:49 +020010983 return PyUnicode_Substring((PyObject*)self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010984}
10985
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010986
10987static PyObject *
10988do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
10989{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010990 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010991
Benjamin Peterson14339b62009-01-31 16:36:08 +000010992 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
10993 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010994
Benjamin Peterson14339b62009-01-31 16:36:08 +000010995 if (sep != NULL && sep != Py_None) {
10996 if (PyUnicode_Check(sep))
10997 return _PyUnicode_XStrip(self, striptype, sep);
10998 else {
10999 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011000 "%s arg must be None or str",
11001 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011002 return NULL;
11003 }
11004 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011005
Benjamin Peterson14339b62009-01-31 16:36:08 +000011006 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011007}
11008
11009
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011010PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011011 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011012\n\
11013Return a copy of the string S with leading and trailing\n\
11014whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011015If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011016
11017static PyObject *
11018unicode_strip(PyUnicodeObject *self, PyObject *args)
11019{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011020 if (PyTuple_GET_SIZE(args) == 0)
11021 return do_strip(self, BOTHSTRIP); /* Common case */
11022 else
11023 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011024}
11025
11026
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011027PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011028 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011029\n\
11030Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011031If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011032
11033static PyObject *
11034unicode_lstrip(PyUnicodeObject *self, PyObject *args)
11035{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011036 if (PyTuple_GET_SIZE(args) == 0)
11037 return do_strip(self, LEFTSTRIP); /* Common case */
11038 else
11039 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011040}
11041
11042
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011043PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011044 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011045\n\
11046Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011047If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011048
11049static PyObject *
11050unicode_rstrip(PyUnicodeObject *self, PyObject *args)
11051{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011052 if (PyTuple_GET_SIZE(args) == 0)
11053 return do_strip(self, RIGHTSTRIP); /* Common case */
11054 else
11055 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011056}
11057
11058
Guido van Rossumd57fd912000-03-10 22:53:23 +000011059static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +000011060unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011061{
11062 PyUnicodeObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011063 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011064
Georg Brandl222de0f2009-04-12 12:01:50 +000011065 if (len < 1) {
11066 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020011067 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000011068 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011069
Tim Peters7a29bd52001-09-12 03:03:31 +000011070 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011071 /* no repeat, return original string */
11072 Py_INCREF(str);
11073 return (PyObject*) str;
11074 }
Tim Peters8f422462000-09-09 06:13:41 +000011075
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011076 if (PyUnicode_READY(str) == -1)
11077 return NULL;
11078
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011079 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011080 PyErr_SetString(PyExc_OverflowError,
11081 "repeated string is too long");
11082 return NULL;
11083 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011084 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011085
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011086 u = (PyUnicodeObject *)PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011087 if (!u)
11088 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011089 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011090
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011091 if (PyUnicode_GET_LENGTH(str) == 1) {
11092 const int kind = PyUnicode_KIND(str);
11093 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
11094 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011095 if (kind == PyUnicode_1BYTE_KIND)
11096 memset(to, (unsigned char)fill_char, len);
11097 else {
11098 for (n = 0; n < len; ++n)
11099 PyUnicode_WRITE(kind, to, n, fill_char);
11100 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011101 }
11102 else {
11103 /* number of characters copied this far */
11104 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
11105 const Py_ssize_t char_size = PyUnicode_CHARACTER_SIZE(str);
11106 char *to = (char *) PyUnicode_DATA(u);
11107 Py_MEMCPY(to, PyUnicode_DATA(str),
11108 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011109 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011110 n = (done <= nchars-done) ? done : nchars-done;
11111 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011112 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011113 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011114 }
11115
11116 return (PyObject*) u;
11117}
11118
Alexander Belopolsky40018472011-02-26 01:02:56 +000011119PyObject *
11120PyUnicode_Replace(PyObject *obj,
11121 PyObject *subobj,
11122 PyObject *replobj,
11123 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011124{
11125 PyObject *self;
11126 PyObject *str1;
11127 PyObject *str2;
11128 PyObject *result;
11129
11130 self = PyUnicode_FromObject(obj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011131 if (self == NULL || PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011132 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011133 str1 = PyUnicode_FromObject(subobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011134 if (str1 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011135 Py_DECREF(self);
11136 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011137 }
11138 str2 = PyUnicode_FromObject(replobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011139 if (str2 == NULL || PyUnicode_READY(str2)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011140 Py_DECREF(self);
11141 Py_DECREF(str1);
11142 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011143 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011144 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011145 Py_DECREF(self);
11146 Py_DECREF(str1);
11147 Py_DECREF(str2);
11148 return result;
11149}
11150
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011151PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011152 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011153\n\
11154Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011155old replaced by new. If the optional argument count is\n\
11156given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011157
11158static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011159unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011160{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011161 PyObject *str1;
11162 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011163 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011164 PyObject *result;
11165
Martin v. Löwis18e16552006-02-15 17:27:45 +000011166 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011167 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011168 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011169 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011170 str1 = PyUnicode_FromObject(str1);
11171 if (str1 == NULL || PyUnicode_READY(str1) == -1)
11172 return NULL;
11173 str2 = PyUnicode_FromObject(str2);
Victor Stinnere9a29352011-10-01 02:14:59 +020011174 if (str2 == NULL || PyUnicode_READY(str2) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011175 Py_DECREF(str1);
11176 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011177 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011178
11179 result = replace(self, str1, str2, maxcount);
11180
11181 Py_DECREF(str1);
11182 Py_DECREF(str2);
11183 return result;
11184}
11185
Alexander Belopolsky40018472011-02-26 01:02:56 +000011186static PyObject *
11187unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011188{
Walter Dörwald79e913e2007-05-12 11:08:06 +000011189 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011190 Py_ssize_t isize;
11191 Py_ssize_t osize, squote, dquote, i, o;
11192 Py_UCS4 max, quote;
11193 int ikind, okind;
11194 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000011195
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011196 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000011197 return NULL;
11198
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011199 isize = PyUnicode_GET_LENGTH(unicode);
11200 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011201
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011202 /* Compute length of output, quote characters, and
11203 maximum character */
11204 osize = 2; /* quotes */
11205 max = 127;
11206 squote = dquote = 0;
11207 ikind = PyUnicode_KIND(unicode);
11208 for (i = 0; i < isize; i++) {
11209 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11210 switch (ch) {
11211 case '\'': squote++; osize++; break;
11212 case '"': dquote++; osize++; break;
11213 case '\\': case '\t': case '\r': case '\n':
11214 osize += 2; break;
11215 default:
11216 /* Fast-path ASCII */
11217 if (ch < ' ' || ch == 0x7f)
11218 osize += 4; /* \xHH */
11219 else if (ch < 0x7f)
11220 osize++;
11221 else if (Py_UNICODE_ISPRINTABLE(ch)) {
11222 osize++;
11223 max = ch > max ? ch : max;
11224 }
11225 else if (ch < 0x100)
11226 osize += 4; /* \xHH */
11227 else if (ch < 0x10000)
11228 osize += 6; /* \uHHHH */
11229 else
11230 osize += 10; /* \uHHHHHHHH */
11231 }
11232 }
11233
11234 quote = '\'';
11235 if (squote) {
11236 if (dquote)
11237 /* Both squote and dquote present. Use squote,
11238 and escape them */
11239 osize += squote;
11240 else
11241 quote = '"';
11242 }
11243
11244 repr = PyUnicode_New(osize, max);
11245 if (repr == NULL)
11246 return NULL;
11247 okind = PyUnicode_KIND(repr);
11248 odata = PyUnicode_DATA(repr);
11249
11250 PyUnicode_WRITE(okind, odata, 0, quote);
11251 PyUnicode_WRITE(okind, odata, osize-1, quote);
11252
11253 for (i = 0, o = 1; i < isize; i++) {
11254 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011255
11256 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011257 if ((ch == quote) || (ch == '\\')) {
11258 PyUnicode_WRITE(okind, odata, o++, '\\');
11259 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011260 continue;
11261 }
11262
Benjamin Peterson29060642009-01-31 22:14:21 +000011263 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011264 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011265 PyUnicode_WRITE(okind, odata, o++, '\\');
11266 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011267 }
11268 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011269 PyUnicode_WRITE(okind, odata, o++, '\\');
11270 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011271 }
11272 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011273 PyUnicode_WRITE(okind, odata, o++, '\\');
11274 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011275 }
11276
11277 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011278 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011279 PyUnicode_WRITE(okind, odata, o++, '\\');
11280 PyUnicode_WRITE(okind, odata, o++, 'x');
11281 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
11282 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011283 }
11284
Georg Brandl559e5d72008-06-11 18:37:52 +000011285 /* Copy ASCII characters as-is */
11286 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011287 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011288 }
11289
Benjamin Peterson29060642009-01-31 22:14:21 +000011290 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000011291 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011292 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000011293 (categories Z* and C* except ASCII space)
11294 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011295 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011296 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011297 if (ch <= 0xff) {
11298 PyUnicode_WRITE(okind, odata, o++, '\\');
11299 PyUnicode_WRITE(okind, odata, o++, 'x');
11300 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
11301 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011302 }
11303 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011304 else if (ch >= 0x10000) {
11305 PyUnicode_WRITE(okind, odata, o++, '\\');
11306 PyUnicode_WRITE(okind, odata, o++, 'U');
11307 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 28) & 0xF]);
11308 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 24) & 0xF]);
11309 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 20) & 0xF]);
11310 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 16) & 0xF]);
11311 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
11312 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
11313 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
11314 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011315 }
11316 /* Map 16-bit characters to '\uxxxx' */
11317 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011318 PyUnicode_WRITE(okind, odata, o++, '\\');
11319 PyUnicode_WRITE(okind, odata, o++, 'u');
11320 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
11321 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
11322 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
11323 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011324 }
11325 }
11326 /* Copy characters as-is */
11327 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011328 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011329 }
11330 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000011331 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011332 /* Closing quote already added at the beginning */
Walter Dörwald79e913e2007-05-12 11:08:06 +000011333 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011334}
11335
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011336PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011337 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011338\n\
11339Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011340such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011341arguments start and end are interpreted as in slice notation.\n\
11342\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011343Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011344
11345static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011346unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011347{
Jesus Ceaac451502011-04-20 17:09:23 +020011348 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011349 Py_ssize_t start;
11350 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011351 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011352
Jesus Ceaac451502011-04-20 17:09:23 +020011353 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
11354 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011355 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011356
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011357 if (PyUnicode_READY(self) == -1)
11358 return NULL;
11359 if (PyUnicode_READY(substring) == -1)
11360 return NULL;
11361
11362 result = any_find_slice(
11363 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
11364 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011365 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011366
11367 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011368
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011369 if (result == -2)
11370 return NULL;
11371
Christian Heimes217cfd12007-12-02 14:31:20 +000011372 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011373}
11374
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011375PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011376 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011377\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011378Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011379
11380static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011381unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011382{
Jesus Ceaac451502011-04-20 17:09:23 +020011383 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011384 Py_ssize_t start;
11385 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011386 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011387
Jesus Ceaac451502011-04-20 17:09:23 +020011388 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
11389 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011390 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011391
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011392 if (PyUnicode_READY(self) == -1)
11393 return NULL;
11394 if (PyUnicode_READY(substring) == -1)
11395 return NULL;
11396
11397 result = any_find_slice(
11398 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
11399 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011400 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011401
11402 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011403
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011404 if (result == -2)
11405 return NULL;
11406
Guido van Rossumd57fd912000-03-10 22:53:23 +000011407 if (result < 0) {
11408 PyErr_SetString(PyExc_ValueError, "substring not found");
11409 return NULL;
11410 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011411
Christian Heimes217cfd12007-12-02 14:31:20 +000011412 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011413}
11414
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011415PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011416 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011417\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011418Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011419done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011420
11421static PyObject *
11422unicode_rjust(PyUnicodeObject *self, PyObject *args)
11423{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011424 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011425 Py_UCS4 fillchar = ' ';
11426
Victor Stinnere9a29352011-10-01 02:14:59 +020011427 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011428 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011429
Victor Stinnere9a29352011-10-01 02:14:59 +020011430 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011431 return NULL;
11432
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011433 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011434 Py_INCREF(self);
11435 return (PyObject*) self;
11436 }
11437
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011438 return (PyObject*) pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011439}
11440
Alexander Belopolsky40018472011-02-26 01:02:56 +000011441PyObject *
11442PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011443{
11444 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000011445
Guido van Rossumd57fd912000-03-10 22:53:23 +000011446 s = PyUnicode_FromObject(s);
11447 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011448 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011449 if (sep != NULL) {
11450 sep = PyUnicode_FromObject(sep);
11451 if (sep == NULL) {
11452 Py_DECREF(s);
11453 return NULL;
11454 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011455 }
11456
11457 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
11458
11459 Py_DECREF(s);
11460 Py_XDECREF(sep);
11461 return result;
11462}
11463
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011464PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011465 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011466\n\
11467Return a list of the words in S, using sep as the\n\
11468delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000011469splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000011470whitespace string is a separator and empty strings are\n\
11471removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011472
11473static PyObject*
11474unicode_split(PyUnicodeObject *self, PyObject *args)
11475{
11476 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011477 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011478
Martin v. Löwis18e16552006-02-15 17:27:45 +000011479 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011480 return NULL;
11481
11482 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011483 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011484 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000011485 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011486 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011487 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011488}
11489
Thomas Wouters477c8d52006-05-27 19:21:47 +000011490PyObject *
11491PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
11492{
11493 PyObject* str_obj;
11494 PyObject* sep_obj;
11495 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011496 int kind1, kind2, kind;
11497 void *buf1 = NULL, *buf2 = NULL;
11498 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011499
11500 str_obj = PyUnicode_FromObject(str_in);
Victor Stinnere9a29352011-10-01 02:14:59 +020011501 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011502 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011503 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011504 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000011505 Py_DECREF(str_obj);
11506 return NULL;
11507 }
11508
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011509 kind1 = PyUnicode_KIND(str_in);
11510 kind2 = PyUnicode_KIND(sep_obj);
11511 kind = kind1 > kind2 ? kind1 : kind2;
11512 buf1 = PyUnicode_DATA(str_in);
11513 if (kind1 != kind)
11514 buf1 = _PyUnicode_AsKind(str_in, kind);
11515 if (!buf1)
11516 goto onError;
11517 buf2 = PyUnicode_DATA(sep_obj);
11518 if (kind2 != kind)
11519 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11520 if (!buf2)
11521 goto onError;
11522 len1 = PyUnicode_GET_LENGTH(str_obj);
11523 len2 = PyUnicode_GET_LENGTH(sep_obj);
11524
11525 switch(PyUnicode_KIND(str_in)) {
11526 case PyUnicode_1BYTE_KIND:
11527 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11528 break;
11529 case PyUnicode_2BYTE_KIND:
11530 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11531 break;
11532 case PyUnicode_4BYTE_KIND:
11533 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11534 break;
11535 default:
11536 assert(0);
11537 out = 0;
11538 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011539
11540 Py_DECREF(sep_obj);
11541 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011542 if (kind1 != kind)
11543 PyMem_Free(buf1);
11544 if (kind2 != kind)
11545 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011546
11547 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011548 onError:
11549 Py_DECREF(sep_obj);
11550 Py_DECREF(str_obj);
11551 if (kind1 != kind && buf1)
11552 PyMem_Free(buf1);
11553 if (kind2 != kind && buf2)
11554 PyMem_Free(buf2);
11555 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011556}
11557
11558
11559PyObject *
11560PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
11561{
11562 PyObject* str_obj;
11563 PyObject* sep_obj;
11564 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011565 int kind1, kind2, kind;
11566 void *buf1 = NULL, *buf2 = NULL;
11567 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011568
11569 str_obj = PyUnicode_FromObject(str_in);
11570 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000011571 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011572 sep_obj = PyUnicode_FromObject(sep_in);
11573 if (!sep_obj) {
11574 Py_DECREF(str_obj);
11575 return NULL;
11576 }
11577
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011578 kind1 = PyUnicode_KIND(str_in);
11579 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020011580 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011581 buf1 = PyUnicode_DATA(str_in);
11582 if (kind1 != kind)
11583 buf1 = _PyUnicode_AsKind(str_in, kind);
11584 if (!buf1)
11585 goto onError;
11586 buf2 = PyUnicode_DATA(sep_obj);
11587 if (kind2 != kind)
11588 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11589 if (!buf2)
11590 goto onError;
11591 len1 = PyUnicode_GET_LENGTH(str_obj);
11592 len2 = PyUnicode_GET_LENGTH(sep_obj);
11593
11594 switch(PyUnicode_KIND(str_in)) {
11595 case PyUnicode_1BYTE_KIND:
11596 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11597 break;
11598 case PyUnicode_2BYTE_KIND:
11599 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11600 break;
11601 case PyUnicode_4BYTE_KIND:
11602 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11603 break;
11604 default:
11605 assert(0);
11606 out = 0;
11607 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011608
11609 Py_DECREF(sep_obj);
11610 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011611 if (kind1 != kind)
11612 PyMem_Free(buf1);
11613 if (kind2 != kind)
11614 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011615
11616 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011617 onError:
11618 Py_DECREF(sep_obj);
11619 Py_DECREF(str_obj);
11620 if (kind1 != kind && buf1)
11621 PyMem_Free(buf1);
11622 if (kind2 != kind && buf2)
11623 PyMem_Free(buf2);
11624 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011625}
11626
11627PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011628 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011629\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011630Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011631the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011632found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011633
11634static PyObject*
11635unicode_partition(PyUnicodeObject *self, PyObject *separator)
11636{
11637 return PyUnicode_Partition((PyObject *)self, separator);
11638}
11639
11640PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000011641 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011642\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011643Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011644the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011645separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011646
11647static PyObject*
11648unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
11649{
11650 return PyUnicode_RPartition((PyObject *)self, separator);
11651}
11652
Alexander Belopolsky40018472011-02-26 01:02:56 +000011653PyObject *
11654PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011655{
11656 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011657
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011658 s = PyUnicode_FromObject(s);
11659 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011660 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011661 if (sep != NULL) {
11662 sep = PyUnicode_FromObject(sep);
11663 if (sep == NULL) {
11664 Py_DECREF(s);
11665 return NULL;
11666 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011667 }
11668
11669 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
11670
11671 Py_DECREF(s);
11672 Py_XDECREF(sep);
11673 return result;
11674}
11675
11676PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011677 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011678\n\
11679Return a list of the words in S, using sep as the\n\
11680delimiter string, starting at the end of the string and\n\
11681working to the front. If maxsplit is given, at most maxsplit\n\
11682splits are done. If sep is not specified, any whitespace string\n\
11683is a separator.");
11684
11685static PyObject*
11686unicode_rsplit(PyUnicodeObject *self, PyObject *args)
11687{
11688 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011689 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011690
Martin v. Löwis18e16552006-02-15 17:27:45 +000011691 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011692 return NULL;
11693
11694 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011695 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011696 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000011697 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011698 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011699 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011700}
11701
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011702PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011703 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011704\n\
11705Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000011706Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011707is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011708
11709static PyObject*
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011710unicode_splitlines(PyUnicodeObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011711{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011712 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000011713 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011714
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011715 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
11716 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011717 return NULL;
11718
Guido van Rossum86662912000-04-11 15:38:46 +000011719 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011720}
11721
11722static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000011723PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011724{
Walter Dörwald346737f2007-05-31 10:44:43 +000011725 if (PyUnicode_CheckExact(self)) {
11726 Py_INCREF(self);
11727 return self;
11728 } else
11729 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinner034f6cf2011-09-30 02:26:44 +020011730 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011731}
11732
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011733PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011734 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011735\n\
11736Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011737and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011738
11739static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011740unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011741{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011742 return fixup(self, fixswapcase);
11743}
11744
Georg Brandlceee0772007-11-27 23:48:05 +000011745PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011746 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011747\n\
11748Return a translation table usable for str.translate().\n\
11749If there is only one argument, it must be a dictionary mapping Unicode\n\
11750ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011751Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011752If there are two arguments, they must be strings of equal length, and\n\
11753in the resulting dictionary, each character in x will be mapped to the\n\
11754character at the same position in y. If there is a third argument, it\n\
11755must be a string, whose characters will be mapped to None in the result.");
11756
11757static PyObject*
11758unicode_maketrans(PyUnicodeObject *null, PyObject *args)
11759{
11760 PyObject *x, *y = NULL, *z = NULL;
11761 PyObject *new = NULL, *key, *value;
11762 Py_ssize_t i = 0;
11763 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011764
Georg Brandlceee0772007-11-27 23:48:05 +000011765 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
11766 return NULL;
11767 new = PyDict_New();
11768 if (!new)
11769 return NULL;
11770 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011771 int x_kind, y_kind, z_kind;
11772 void *x_data, *y_data, *z_data;
11773
Georg Brandlceee0772007-11-27 23:48:05 +000011774 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000011775 if (!PyUnicode_Check(x)) {
11776 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
11777 "be a string if there is a second argument");
11778 goto err;
11779 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011780 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011781 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
11782 "arguments must have equal length");
11783 goto err;
11784 }
11785 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011786 x_kind = PyUnicode_KIND(x);
11787 y_kind = PyUnicode_KIND(y);
11788 x_data = PyUnicode_DATA(x);
11789 y_data = PyUnicode_DATA(y);
11790 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
11791 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
11792 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011793 if (!key || !value)
11794 goto err;
11795 res = PyDict_SetItem(new, key, value);
11796 Py_DECREF(key);
11797 Py_DECREF(value);
11798 if (res < 0)
11799 goto err;
11800 }
11801 /* create entries for deleting chars in z */
11802 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011803 z_kind = PyUnicode_KIND(z);
11804 z_data = PyUnicode_DATA(z);
Georg Brandlceee0772007-11-27 23:48:05 +000011805 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011806 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011807 if (!key)
11808 goto err;
11809 res = PyDict_SetItem(new, key, Py_None);
11810 Py_DECREF(key);
11811 if (res < 0)
11812 goto err;
11813 }
11814 }
11815 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011816 int kind;
11817 void *data;
11818
Georg Brandlceee0772007-11-27 23:48:05 +000011819 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000011820 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011821 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
11822 "to maketrans it must be a dict");
11823 goto err;
11824 }
11825 /* copy entries into the new dict, converting string keys to int keys */
11826 while (PyDict_Next(x, &i, &key, &value)) {
11827 if (PyUnicode_Check(key)) {
11828 /* convert string keys to integer keys */
11829 PyObject *newkey;
11830 if (PyUnicode_GET_SIZE(key) != 1) {
11831 PyErr_SetString(PyExc_ValueError, "string keys in translate "
11832 "table must be of length 1");
11833 goto err;
11834 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011835 kind = PyUnicode_KIND(key);
11836 data = PyUnicode_DATA(key);
11837 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000011838 if (!newkey)
11839 goto err;
11840 res = PyDict_SetItem(new, newkey, value);
11841 Py_DECREF(newkey);
11842 if (res < 0)
11843 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000011844 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011845 /* just keep integer keys */
11846 if (PyDict_SetItem(new, key, value) < 0)
11847 goto err;
11848 } else {
11849 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
11850 "be strings or integers");
11851 goto err;
11852 }
11853 }
11854 }
11855 return new;
11856 err:
11857 Py_DECREF(new);
11858 return NULL;
11859}
11860
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011861PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011862 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011863\n\
11864Return a copy of the string S, where all characters have been mapped\n\
11865through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011866Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000011867Unmapped characters are left untouched. Characters mapped to None\n\
11868are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011869
11870static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011871unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011872{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011873 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011874}
11875
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011876PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011877 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011878\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011879Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011880
11881static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011882unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011883{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011884 return fixup(self, fixupper);
11885}
11886
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011887PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011888 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011889\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000011890Pad a numeric string S with zeros on the left, to fill a field\n\
11891of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011892
11893static PyObject *
11894unicode_zfill(PyUnicodeObject *self, PyObject *args)
11895{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011896 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011897 PyUnicodeObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011898 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011899 int kind;
11900 void *data;
11901 Py_UCS4 chr;
11902
11903 if (PyUnicode_READY(self) == -1)
11904 return NULL;
11905
Martin v. Löwis18e16552006-02-15 17:27:45 +000011906 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011907 return NULL;
11908
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011909 if (PyUnicode_GET_LENGTH(self) >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +000011910 if (PyUnicode_CheckExact(self)) {
11911 Py_INCREF(self);
11912 return (PyObject*) self;
11913 }
11914 else
Victor Stinner2219e0a2011-10-01 01:16:59 +020011915 return PyUnicode_Copy((PyObject*)self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011916 }
11917
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011918 fill = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011919
11920 u = pad(self, fill, 0, '0');
11921
Walter Dörwald068325e2002-04-15 13:36:47 +000011922 if (u == NULL)
11923 return NULL;
11924
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011925 kind = PyUnicode_KIND(u);
11926 data = PyUnicode_DATA(u);
11927 chr = PyUnicode_READ(kind, data, fill);
11928
11929 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011930 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011931 PyUnicode_WRITE(kind, data, 0, chr);
11932 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000011933 }
11934
11935 return (PyObject*) u;
11936}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011937
11938#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000011939static PyObject *
11940unicode__decimal2ascii(PyObject *self)
11941{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011942 return PyUnicode_TransformDecimalAndSpaceToASCII(self);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000011943}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011944#endif
11945
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011946PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011947 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011948\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000011949Return True if S starts with the specified prefix, False otherwise.\n\
11950With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011951With optional end, stop comparing S at that position.\n\
11952prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011953
11954static PyObject *
11955unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000011956 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011957{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011958 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011959 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011960 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011961 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011962 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011963
Jesus Ceaac451502011-04-20 17:09:23 +020011964 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011965 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011966 if (PyTuple_Check(subobj)) {
11967 Py_ssize_t i;
11968 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
11969 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000011970 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011971 if (substring == NULL)
11972 return NULL;
11973 result = tailmatch(self, substring, start, end, -1);
11974 Py_DECREF(substring);
11975 if (result) {
11976 Py_RETURN_TRUE;
11977 }
11978 }
11979 /* nothing matched */
11980 Py_RETURN_FALSE;
11981 }
11982 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030011983 if (substring == NULL) {
11984 if (PyErr_ExceptionMatches(PyExc_TypeError))
11985 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
11986 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000011987 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030011988 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011989 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011990 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011991 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011992}
11993
11994
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011995PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011996 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011997\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000011998Return True if S ends with the specified suffix, False otherwise.\n\
11999With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012000With optional end, stop comparing S at that position.\n\
12001suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012002
12003static PyObject *
12004unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012005 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012006{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012007 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012008 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012009 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012010 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012011 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012012
Jesus Ceaac451502011-04-20 17:09:23 +020012013 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012014 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012015 if (PyTuple_Check(subobj)) {
12016 Py_ssize_t i;
12017 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12018 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012019 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012020 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012021 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012022 result = tailmatch(self, substring, start, end, +1);
12023 Py_DECREF(substring);
12024 if (result) {
12025 Py_RETURN_TRUE;
12026 }
12027 }
12028 Py_RETURN_FALSE;
12029 }
12030 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012031 if (substring == NULL) {
12032 if (PyErr_ExceptionMatches(PyExc_TypeError))
12033 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12034 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012035 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012036 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012037 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012038 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012039 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012040}
12041
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012042#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000012043
12044PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012045 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012046\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012047Return a formatted version of S, using substitutions from args and kwargs.\n\
12048The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000012049
Eric Smith27bbca62010-11-04 17:06:58 +000012050PyDoc_STRVAR(format_map__doc__,
12051 "S.format_map(mapping) -> str\n\
12052\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012053Return a formatted version of S, using substitutions from mapping.\n\
12054The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000012055
Eric Smith4a7d76d2008-05-30 18:10:19 +000012056static PyObject *
12057unicode__format__(PyObject* self, PyObject* args)
12058{
12059 PyObject *format_spec;
12060
12061 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12062 return NULL;
12063
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012064 return _PyUnicode_FormatAdvanced(self, format_spec, 0,
12065 PyUnicode_GET_LENGTH(format_spec));
Eric Smith4a7d76d2008-05-30 18:10:19 +000012066}
12067
Eric Smith8c663262007-08-25 02:26:07 +000012068PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012069 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012070\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012071Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000012072
12073static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012074unicode__sizeof__(PyUnicodeObject *v)
12075{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012076 Py_ssize_t size;
12077
12078 /* If it's a compact object, account for base structure +
12079 character data. */
12080 if (PyUnicode_IS_COMPACT_ASCII(v))
12081 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12082 else if (PyUnicode_IS_COMPACT(v))
12083 size = sizeof(PyCompactUnicodeObject) +
12084 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_CHARACTER_SIZE(v);
12085 else {
12086 /* If it is a two-block object, account for base object, and
12087 for character block if present. */
12088 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020012089 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012090 size += (PyUnicode_GET_LENGTH(v) + 1) *
12091 PyUnicode_CHARACTER_SIZE(v);
12092 }
12093 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020012094 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020012095 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012096 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020012097 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020012098 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012099
12100 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012101}
12102
12103PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012104 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012105
12106static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020012107unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012108{
Victor Stinner034f6cf2011-09-30 02:26:44 +020012109 PyObject *copy = PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012110 if (!copy)
12111 return NULL;
12112 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012113}
12114
Guido van Rossumd57fd912000-03-10 22:53:23 +000012115static PyMethodDef unicode_methods[] = {
12116
12117 /* Order is according to common usage: often used methods should
12118 appear first, since lookup is done sequentially. */
12119
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000012120 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012121 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
12122 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012123 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012124 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
12125 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
12126 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
12127 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
12128 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
12129 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
12130 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012131 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012132 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
12133 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
12134 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012135 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012136 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
12137 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
12138 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012139 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012140 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012141 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012142 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012143 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
12144 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
12145 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
12146 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
12147 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
12148 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
12149 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
12150 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
12151 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
12152 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
12153 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
12154 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
12155 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
12156 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000012157 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000012158 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012159 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000012160 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000012161 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000012162 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000012163 {"maketrans", (PyCFunction) unicode_maketrans,
12164 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012165 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000012166#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012167 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012168#endif
12169
12170#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012171 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012172 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012173#endif
12174
Benjamin Peterson14339b62009-01-31 16:36:08 +000012175 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012176 {NULL, NULL}
12177};
12178
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012179static PyObject *
12180unicode_mod(PyObject *v, PyObject *w)
12181{
Brian Curtindfc80e32011-08-10 20:28:54 -050012182 if (!PyUnicode_Check(v))
12183 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000012184 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012185}
12186
12187static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012188 0, /*nb_add*/
12189 0, /*nb_subtract*/
12190 0, /*nb_multiply*/
12191 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012192};
12193
Guido van Rossumd57fd912000-03-10 22:53:23 +000012194static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012195 (lenfunc) unicode_length, /* sq_length */
12196 PyUnicode_Concat, /* sq_concat */
12197 (ssizeargfunc) unicode_repeat, /* sq_repeat */
12198 (ssizeargfunc) unicode_getitem, /* sq_item */
12199 0, /* sq_slice */
12200 0, /* sq_ass_item */
12201 0, /* sq_ass_slice */
12202 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012203};
12204
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012205static PyObject*
12206unicode_subscript(PyUnicodeObject* self, PyObject* item)
12207{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012208 if (PyUnicode_READY(self) == -1)
12209 return NULL;
12210
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000012211 if (PyIndex_Check(item)) {
12212 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012213 if (i == -1 && PyErr_Occurred())
12214 return NULL;
12215 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012216 i += PyUnicode_GET_LENGTH(self);
Victor Stinner2fe5ced2011-10-02 00:25:40 +020012217 return unicode_getitem((PyObject*)self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012218 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000012219 Py_ssize_t start, stop, step, slicelength, cur, i;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012220 const Py_UNICODE* source_buf;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012221 Py_UNICODE* result_buf;
12222 PyObject* result;
12223
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012224 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000012225 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012226 return NULL;
12227 }
12228
12229 if (slicelength <= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012230 return PyUnicode_New(0, 0);
12231 } else if (start == 0 && step == 1 &&
12232 slicelength == PyUnicode_GET_LENGTH(self) &&
Thomas Woutersed03b412007-08-28 21:37:11 +000012233 PyUnicode_CheckExact(self)) {
12234 Py_INCREF(self);
12235 return (PyObject *)self;
12236 } else if (step == 1) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012237 return PyUnicode_Substring((PyObject*)self,
12238 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012239 } else {
12240 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +000012241 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
12242 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012243
Benjamin Peterson29060642009-01-31 22:14:21 +000012244 if (result_buf == NULL)
12245 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012246
12247 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
12248 result_buf[i] = source_buf[cur];
12249 }
Tim Petersced69f82003-09-16 20:30:58 +000012250
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012251 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +000012252 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012253 return result;
12254 }
12255 } else {
12256 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
12257 return NULL;
12258 }
12259}
12260
12261static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012262 (lenfunc)unicode_length, /* mp_length */
12263 (binaryfunc)unicode_subscript, /* mp_subscript */
12264 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012265};
12266
Guido van Rossumd57fd912000-03-10 22:53:23 +000012267
/* Helpers for PyUnicode_Format() */
12269
12270static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000012271getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012272{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012273 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012274 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012275 (*p_argidx)++;
12276 if (arglen < 0)
12277 return args;
12278 else
12279 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012280 }
12281 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012282 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012283 return NULL;
12284}
12285
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012286/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012287
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012288static PyObject *
12289formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012290{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012291 char *p;
12292 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012293 double x;
Tim Petersced69f82003-09-16 20:30:58 +000012294
Guido van Rossumd57fd912000-03-10 22:53:23 +000012295 x = PyFloat_AsDouble(v);
12296 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012297 return NULL;
12298
Guido van Rossumd57fd912000-03-10 22:53:23 +000012299 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012300 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000012301
Eric Smith0923d1d2009-04-16 20:16:10 +000012302 p = PyOS_double_to_string(x, type, prec,
12303 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012304 if (p == NULL)
12305 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012306 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000012307 PyMem_Free(p);
12308 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012309}
12310
Tim Peters38fd5b62000-09-21 05:43:11 +000012311static PyObject*
12312formatlong(PyObject *val, int flags, int prec, int type)
12313{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012314 char *buf;
12315 int len;
12316 PyObject *str; /* temporary string object. */
12317 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000012318
Benjamin Peterson14339b62009-01-31 16:36:08 +000012319 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
12320 if (!str)
12321 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012322 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012323 Py_DECREF(str);
12324 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000012325}
12326
Guido van Rossumd57fd912000-03-10 22:53:23 +000012327static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012328formatchar(Py_UCS4 *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012329 size_t buflen,
12330 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012331{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000012332 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012333 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012334 if (PyUnicode_GET_LENGTH(v) == 1) {
12335 buf[0] = PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000012336 buf[1] = '\0';
12337 return 1;
12338 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012339 goto onError;
12340 }
12341 else {
12342 /* Integer input truncated to a character */
12343 long x;
12344 x = PyLong_AsLong(v);
12345 if (x == -1 && PyErr_Occurred())
12346 goto onError;
12347
12348 if (x < 0 || x > 0x10ffff) {
12349 PyErr_SetString(PyExc_OverflowError,
12350 "%c arg not in range(0x110000)");
12351 return -1;
12352 }
12353
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012354 buf[0] = (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012355 buf[1] = '\0';
12356 return 1;
12357 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000012358
Benjamin Peterson29060642009-01-31 22:14:21 +000012359 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012360 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012361 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012362 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012363}
12364
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012365/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012366 FORMATBUFLEN is the length of the buffer in which chars are formatted.
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012367*/
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012368#define FORMATBUFLEN (size_t)10
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012369
Alexander Belopolsky40018472011-02-26 01:02:56 +000012370PyObject *
12371PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012372{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012373 void *fmt;
12374 int fmtkind;
12375 PyObject *result;
12376 Py_UCS4 *res, *res0;
12377 Py_UCS4 max;
12378 int kind;
12379 Py_ssize_t fmtcnt, fmtpos, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012380 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012381 PyObject *dict = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012382 PyUnicodeObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +000012383
Guido van Rossumd57fd912000-03-10 22:53:23 +000012384 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012385 PyErr_BadInternalCall();
12386 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012387 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012388 uformat = (PyUnicodeObject*)PyUnicode_FromObject(format);
12389 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012390 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012391 fmt = PyUnicode_DATA(uformat);
12392 fmtkind = PyUnicode_KIND(uformat);
12393 fmtcnt = PyUnicode_GET_LENGTH(uformat);
12394 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012395
12396 reslen = rescnt = fmtcnt + 100;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012397 res = res0 = PyMem_Malloc(reslen * sizeof(Py_UCS4));
12398 if (res0 == NULL) {
12399 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012400 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012401 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012402
12403 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012404 arglen = PyTuple_Size(args);
12405 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012406 }
12407 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012408 arglen = -1;
12409 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012410 }
Christian Heimes90aa7642007-12-19 02:45:37 +000012411 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000012412 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000012413 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012414
12415 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012416 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012417 if (--rescnt < 0) {
12418 rescnt = fmtcnt + 100;
12419 reslen += rescnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012420 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12421 if (res0 == NULL){
12422 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012423 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012424 }
12425 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000012426 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012427 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012428 *res++ = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012429 }
12430 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012431 /* Got a format specifier */
12432 int flags = 0;
12433 Py_ssize_t width = -1;
12434 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012435 Py_UCS4 c = '\0';
12436 Py_UCS4 fill;
Benjamin Peterson29060642009-01-31 22:14:21 +000012437 int isnumok;
12438 PyObject *v = NULL;
12439 PyObject *temp = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012440 void *pbuf;
12441 Py_ssize_t pindex;
Benjamin Peterson29060642009-01-31 22:14:21 +000012442 Py_UNICODE sign;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012443 Py_ssize_t len, len1;
12444 Py_UCS4 formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012445
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012446 fmtpos++;
12447 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
12448 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000012449 Py_ssize_t keylen;
12450 PyObject *key;
12451 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000012452
Benjamin Peterson29060642009-01-31 22:14:21 +000012453 if (dict == NULL) {
12454 PyErr_SetString(PyExc_TypeError,
12455 "format requires a mapping");
12456 goto onError;
12457 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012458 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012459 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012460 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012461 /* Skip over balanced parentheses */
12462 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012463 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000012464 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012465 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000012466 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012467 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000012468 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012469 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000012470 if (fmtcnt < 0 || pcount > 0) {
12471 PyErr_SetString(PyExc_ValueError,
12472 "incomplete format key");
12473 goto onError;
12474 }
Victor Stinner12bab6d2011-10-01 01:53:49 +020012475 key = PyUnicode_Substring((PyObject*)uformat,
12476 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000012477 if (key == NULL)
12478 goto onError;
12479 if (args_owned) {
12480 Py_DECREF(args);
12481 args_owned = 0;
12482 }
12483 args = PyObject_GetItem(dict, key);
12484 Py_DECREF(key);
12485 if (args == NULL) {
12486 goto onError;
12487 }
12488 args_owned = 1;
12489 arglen = -1;
12490 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012491 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012492 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012493 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012494 case '-': flags |= F_LJUST; continue;
12495 case '+': flags |= F_SIGN; continue;
12496 case ' ': flags |= F_BLANK; continue;
12497 case '#': flags |= F_ALT; continue;
12498 case '0': flags |= F_ZERO; continue;
12499 }
12500 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012501 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012502 if (c == '*') {
12503 v = getnextarg(args, arglen, &argidx);
12504 if (v == NULL)
12505 goto onError;
12506 if (!PyLong_Check(v)) {
12507 PyErr_SetString(PyExc_TypeError,
12508 "* wants int");
12509 goto onError;
12510 }
12511 width = PyLong_AsLong(v);
12512 if (width == -1 && PyErr_Occurred())
12513 goto onError;
12514 if (width < 0) {
12515 flags |= F_LJUST;
12516 width = -width;
12517 }
12518 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012519 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012520 }
12521 else if (c >= '0' && c <= '9') {
12522 width = c - '0';
12523 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012524 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012525 if (c < '0' || c > '9')
12526 break;
12527 if ((width*10) / 10 != width) {
12528 PyErr_SetString(PyExc_ValueError,
12529 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000012530 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000012531 }
12532 width = width*10 + (c - '0');
12533 }
12534 }
12535 if (c == '.') {
12536 prec = 0;
12537 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012538 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012539 if (c == '*') {
12540 v = getnextarg(args, arglen, &argidx);
12541 if (v == NULL)
12542 goto onError;
12543 if (!PyLong_Check(v)) {
12544 PyErr_SetString(PyExc_TypeError,
12545 "* wants int");
12546 goto onError;
12547 }
12548 prec = PyLong_AsLong(v);
12549 if (prec == -1 && PyErr_Occurred())
12550 goto onError;
12551 if (prec < 0)
12552 prec = 0;
12553 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012554 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012555 }
12556 else if (c >= '0' && c <= '9') {
12557 prec = c - '0';
12558 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012559 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012560 if (c < '0' || c > '9')
12561 break;
12562 if ((prec*10) / 10 != prec) {
12563 PyErr_SetString(PyExc_ValueError,
12564 "prec too big");
12565 goto onError;
12566 }
12567 prec = prec*10 + (c - '0');
12568 }
12569 }
12570 } /* prec */
12571 if (fmtcnt >= 0) {
12572 if (c == 'h' || c == 'l' || c == 'L') {
12573 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012574 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012575 }
12576 }
12577 if (fmtcnt < 0) {
12578 PyErr_SetString(PyExc_ValueError,
12579 "incomplete format");
12580 goto onError;
12581 }
12582 if (c != '%') {
12583 v = getnextarg(args, arglen, &argidx);
12584 if (v == NULL)
12585 goto onError;
12586 }
12587 sign = 0;
12588 fill = ' ';
12589 switch (c) {
12590
12591 case '%':
12592 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012593 kind = PyUnicode_4BYTE_KIND;
Benjamin Peterson29060642009-01-31 22:14:21 +000012594 /* presume that buffer length is at least 1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012595 PyUnicode_WRITE(kind, pbuf, 0, '%');
Benjamin Peterson29060642009-01-31 22:14:21 +000012596 len = 1;
12597 break;
12598
12599 case 's':
12600 case 'r':
12601 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000012602 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012603 temp = v;
12604 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012605 }
12606 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012607 if (c == 's')
12608 temp = PyObject_Str(v);
12609 else if (c == 'r')
12610 temp = PyObject_Repr(v);
12611 else
12612 temp = PyObject_ASCII(v);
12613 if (temp == NULL)
12614 goto onError;
12615 if (PyUnicode_Check(temp))
12616 /* nothing to do */;
12617 else {
12618 Py_DECREF(temp);
12619 PyErr_SetString(PyExc_TypeError,
12620 "%s argument has non-string str()");
12621 goto onError;
12622 }
12623 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012624 if (PyUnicode_READY(temp) == -1) {
12625 Py_CLEAR(temp);
12626 goto onError;
12627 }
12628 pbuf = PyUnicode_DATA(temp);
12629 kind = PyUnicode_KIND(temp);
12630 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012631 if (prec >= 0 && len > prec)
12632 len = prec;
12633 break;
12634
12635 case 'i':
12636 case 'd':
12637 case 'u':
12638 case 'o':
12639 case 'x':
12640 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000012641 isnumok = 0;
12642 if (PyNumber_Check(v)) {
12643 PyObject *iobj=NULL;
12644
12645 if (PyLong_Check(v)) {
12646 iobj = v;
12647 Py_INCREF(iobj);
12648 }
12649 else {
12650 iobj = PyNumber_Long(v);
12651 }
12652 if (iobj!=NULL) {
12653 if (PyLong_Check(iobj)) {
12654 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070012655 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000012656 Py_DECREF(iobj);
12657 if (!temp)
12658 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012659 if (PyUnicode_READY(temp) == -1) {
12660 Py_CLEAR(temp);
12661 goto onError;
12662 }
12663 pbuf = PyUnicode_DATA(temp);
12664 kind = PyUnicode_KIND(temp);
12665 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012666 sign = 1;
12667 }
12668 else {
12669 Py_DECREF(iobj);
12670 }
12671 }
12672 }
12673 if (!isnumok) {
12674 PyErr_Format(PyExc_TypeError,
12675 "%%%c format: a number is required, "
12676 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
12677 goto onError;
12678 }
12679 if (flags & F_ZERO)
12680 fill = '0';
12681 break;
12682
12683 case 'e':
12684 case 'E':
12685 case 'f':
12686 case 'F':
12687 case 'g':
12688 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012689 temp = formatfloat(v, flags, prec, c);
12690 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000012691 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012692 if (PyUnicode_READY(temp) == -1) {
12693 Py_CLEAR(temp);
12694 goto onError;
12695 }
12696 pbuf = PyUnicode_DATA(temp);
12697 kind = PyUnicode_KIND(temp);
12698 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012699 sign = 1;
12700 if (flags & F_ZERO)
12701 fill = '0';
12702 break;
12703
12704 case 'c':
12705 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012706 kind = PyUnicode_4BYTE_KIND;
Victor Stinnerb9dcffb2011-09-29 00:39:24 +020012707 len = formatchar(pbuf, Py_ARRAY_LENGTH(formatbuf), v);
Benjamin Peterson29060642009-01-31 22:14:21 +000012708 if (len < 0)
12709 goto onError;
12710 break;
12711
12712 default:
12713 PyErr_Format(PyExc_ValueError,
12714 "unsupported format character '%c' (0x%x) "
12715 "at index %zd",
12716 (31<=c && c<=126) ? (char)c : '?',
12717 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012718 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000012719 goto onError;
12720 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012721 /* pbuf is initialized here. */
12722 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000012723 if (sign) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012724 if (PyUnicode_READ(kind, pbuf, pindex) == '-' ||
12725 PyUnicode_READ(kind, pbuf, pindex) == '+') {
12726 sign = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012727 len--;
12728 }
12729 else if (flags & F_SIGN)
12730 sign = '+';
12731 else if (flags & F_BLANK)
12732 sign = ' ';
12733 else
12734 sign = 0;
12735 }
12736 if (width < len)
12737 width = len;
12738 if (rescnt - (sign != 0) < width) {
12739 reslen -= rescnt;
12740 rescnt = width + fmtcnt + 100;
12741 reslen += rescnt;
12742 if (reslen < 0) {
12743 Py_XDECREF(temp);
12744 PyErr_NoMemory();
12745 goto onError;
12746 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012747 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12748 if (res0 == 0) {
12749 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012750 Py_XDECREF(temp);
12751 goto onError;
12752 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012753 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000012754 }
12755 if (sign) {
12756 if (fill != ' ')
12757 *res++ = sign;
12758 rescnt--;
12759 if (width > len)
12760 width--;
12761 }
12762 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012763 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12764 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000012765 if (fill != ' ') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012766 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12767 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012768 }
12769 rescnt -= 2;
12770 width -= 2;
12771 if (width < 0)
12772 width = 0;
12773 len -= 2;
12774 }
12775 if (width > len && !(flags & F_LJUST)) {
12776 do {
12777 --rescnt;
12778 *res++ = fill;
12779 } while (--width > len);
12780 }
12781 if (fill == ' ') {
12782 if (sign)
12783 *res++ = sign;
12784 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012785 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12786 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
12787 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12788 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012789 }
12790 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012791 /* Copy all characters, preserving len */
12792 len1 = len;
12793 while (len1--) {
12794 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12795 rescnt--;
12796 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012797 while (--width >= len) {
12798 --rescnt;
12799 *res++ = ' ';
12800 }
12801 if (dict && (argidx < arglen) && c != '%') {
12802 PyErr_SetString(PyExc_TypeError,
12803 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +000012804 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012805 goto onError;
12806 }
12807 Py_XDECREF(temp);
12808 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012809 } /* until end */
12810 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012811 PyErr_SetString(PyExc_TypeError,
12812 "not all arguments converted during string formatting");
12813 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012814 }
12815
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012816
12817 for (max=0, res = res0; res < res0+reslen-rescnt; res++)
12818 if (*res > max)
12819 max = *res;
12820 result = PyUnicode_New(reslen - rescnt, max);
12821 if (!result)
Benjamin Peterson29060642009-01-31 22:14:21 +000012822 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012823 kind = PyUnicode_KIND(result);
12824 for (res = res0; res < res0+reslen-rescnt; res++)
12825 PyUnicode_WRITE(kind, PyUnicode_DATA(result), res-res0, *res);
12826 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012827 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012828 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012829 }
12830 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012831 return (PyObject *)result;
12832
Benjamin Peterson29060642009-01-31 22:14:21 +000012833 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012834 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012835 Py_DECREF(uformat);
12836 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012837 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012838 }
12839 return NULL;
12840}
12841
Jeremy Hylton938ace62002-07-17 16:30:39 +000012842static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000012843unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
12844
Tim Peters6d6c1a32001-08-02 04:15:00 +000012845static PyObject *
12846unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12847{
Benjamin Peterson29060642009-01-31 22:14:21 +000012848 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012849 static char *kwlist[] = {"object", "encoding", "errors", 0};
12850 char *encoding = NULL;
12851 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000012852
Benjamin Peterson14339b62009-01-31 16:36:08 +000012853 if (type != &PyUnicode_Type)
12854 return unicode_subtype_new(type, args, kwds);
12855 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000012856 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012857 return NULL;
12858 if (x == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012859 return (PyObject *)PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012860 if (encoding == NULL && errors == NULL)
12861 return PyObject_Str(x);
12862 else
Benjamin Peterson29060642009-01-31 22:14:21 +000012863 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000012864}
12865
Guido van Rossume023fe02001-08-30 03:12:59 +000012866static PyObject *
12867unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12868{
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012869 PyUnicodeObject *unicode, *self;
12870 Py_ssize_t length, char_size;
12871 int share_wstr, share_utf8;
12872 unsigned int kind;
12873 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000012874
Benjamin Peterson14339b62009-01-31 16:36:08 +000012875 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012876
12877 unicode = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
12878 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012879 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020012880 assert(_PyUnicode_CHECK(unicode));
Victor Stinner1b4f9ce2011-10-03 13:28:14 +020012881 if (_PyUnicode_READY_REPLACE(&unicode))
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012882 return NULL;
12883
12884 self = (PyUnicodeObject *) type->tp_alloc(type, 0);
12885 if (self == NULL) {
12886 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012887 return NULL;
12888 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012889 kind = PyUnicode_KIND(unicode);
12890 length = PyUnicode_GET_LENGTH(unicode);
12891
12892 _PyUnicode_LENGTH(self) = length;
12893 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
12894 _PyUnicode_STATE(self).interned = 0;
12895 _PyUnicode_STATE(self).kind = kind;
12896 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020012897 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012898 _PyUnicode_STATE(self).ready = 1;
12899 _PyUnicode_WSTR(self) = NULL;
12900 _PyUnicode_UTF8_LENGTH(self) = 0;
12901 _PyUnicode_UTF8(self) = NULL;
12902 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020012903 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012904
12905 share_utf8 = 0;
12906 share_wstr = 0;
12907 if (kind == PyUnicode_1BYTE_KIND) {
12908 char_size = 1;
12909 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
12910 share_utf8 = 1;
12911 }
12912 else if (kind == PyUnicode_2BYTE_KIND) {
12913 char_size = 2;
12914 if (sizeof(wchar_t) == 2)
12915 share_wstr = 1;
12916 }
12917 else {
12918 assert(kind == PyUnicode_4BYTE_KIND);
12919 char_size = 4;
12920 if (sizeof(wchar_t) == 4)
12921 share_wstr = 1;
12922 }
12923
12924 /* Ensure we won't overflow the length. */
12925 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
12926 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012927 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012928 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012929 data = PyObject_MALLOC((length + 1) * char_size);
12930 if (data == NULL) {
12931 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012932 goto onError;
12933 }
12934
Victor Stinnerc3c74152011-10-02 20:39:55 +020012935 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012936 if (share_utf8) {
12937 _PyUnicode_UTF8_LENGTH(self) = length;
12938 _PyUnicode_UTF8(self) = data;
12939 }
12940 if (share_wstr) {
12941 _PyUnicode_WSTR_LENGTH(self) = length;
12942 _PyUnicode_WSTR(self) = (wchar_t *)data;
12943 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012944
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012945 Py_MEMCPY(data, PyUnicode_DATA(unicode),
12946 PyUnicode_KIND_SIZE(kind, length + 1));
12947 Py_DECREF(unicode);
12948 return (PyObject *)self;
12949
12950onError:
12951 Py_DECREF(unicode);
12952 Py_DECREF(self);
12953 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000012954}
12955
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012956PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000012957 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000012958\n\
Collin Winterd474ce82007-08-07 19:42:11 +000012959Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000012960encoding defaults to the current default string encoding.\n\
12961errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000012962
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012963static PyObject *unicode_iter(PyObject *seq);
12964
Guido van Rossumd57fd912000-03-10 22:53:23 +000012965PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000012966 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012967 "str", /* tp_name */
12968 sizeof(PyUnicodeObject), /* tp_size */
12969 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012970 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012971 (destructor)unicode_dealloc, /* tp_dealloc */
12972 0, /* tp_print */
12973 0, /* tp_getattr */
12974 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000012975 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012976 unicode_repr, /* tp_repr */
12977 &unicode_as_number, /* tp_as_number */
12978 &unicode_as_sequence, /* tp_as_sequence */
12979 &unicode_as_mapping, /* tp_as_mapping */
12980 (hashfunc) unicode_hash, /* tp_hash*/
12981 0, /* tp_call*/
12982 (reprfunc) unicode_str, /* tp_str */
12983 PyObject_GenericGetAttr, /* tp_getattro */
12984 0, /* tp_setattro */
12985 0, /* tp_as_buffer */
12986 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000012987 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012988 unicode_doc, /* tp_doc */
12989 0, /* tp_traverse */
12990 0, /* tp_clear */
12991 PyUnicode_RichCompare, /* tp_richcompare */
12992 0, /* tp_weaklistoffset */
12993 unicode_iter, /* tp_iter */
12994 0, /* tp_iternext */
12995 unicode_methods, /* tp_methods */
12996 0, /* tp_members */
12997 0, /* tp_getset */
12998 &PyBaseObject_Type, /* tp_base */
12999 0, /* tp_dict */
13000 0, /* tp_descr_get */
13001 0, /* tp_descr_set */
13002 0, /* tp_dictoffset */
13003 0, /* tp_init */
13004 0, /* tp_alloc */
13005 unicode_new, /* tp_new */
13006 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013007};
13008
13009/* Initialize the Unicode implementation */
13010
Thomas Wouters78890102000-07-22 19:25:51 +000013011void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013012{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013013 int i;
13014
Thomas Wouters477c8d52006-05-27 19:21:47 +000013015 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013016 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000013017 0x000A, /* LINE FEED */
13018 0x000D, /* CARRIAGE RETURN */
13019 0x001C, /* FILE SEPARATOR */
13020 0x001D, /* GROUP SEPARATOR */
13021 0x001E, /* RECORD SEPARATOR */
13022 0x0085, /* NEXT LINE */
13023 0x2028, /* LINE SEPARATOR */
13024 0x2029, /* PARAGRAPH SEPARATOR */
13025 };
13026
Fred Drakee4315f52000-05-09 19:53:39 +000013027 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020013028 unicode_empty = PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013029 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013030 Py_FatalError("Can't create empty string");
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013031
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013032 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000013033 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000013034 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013035 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000013036
13037 /* initialize the linebreak bloom filter */
13038 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013039 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020013040 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013041
13042 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013043}
13044
13045/* Finalize the Unicode implementation */
13046
/* Clear the unicode free list.

   This implementation releases nothing and always reports 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int released = 0;

    return released;
}
13052
Guido van Rossumd57fd912000-03-10 22:53:23 +000013053void
Thomas Wouters78890102000-07-22 19:25:51 +000013054_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013055{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013056 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013057
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000013058 Py_XDECREF(unicode_empty);
13059 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000013060
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013061 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013062 if (unicode_latin1[i]) {
13063 Py_DECREF(unicode_latin1[i]);
13064 unicode_latin1[i] = NULL;
13065 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013066 }
Christian Heimesa156e092008-02-16 07:38:31 +000013067 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000013068}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000013069
Walter Dörwald16807132007-05-25 13:52:07 +000013070void
13071PyUnicode_InternInPlace(PyObject **p)
13072{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013073 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
13074 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020013075#ifdef Py_DEBUG
13076 assert(s != NULL);
13077 assert(_PyUnicode_CHECK(s));
13078#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000013079 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020013080 return;
13081#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000013082 /* If it's a subclass, we don't really know what putting
13083 it in the interned dict might do. */
13084 if (!PyUnicode_CheckExact(s))
13085 return;
13086 if (PyUnicode_CHECK_INTERNED(s))
13087 return;
Victor Stinner1b4f9ce2011-10-03 13:28:14 +020013088 if (_PyUnicode_READY_REPLACE(p)) {
Victor Stinner4fae54c2011-10-03 02:01:52 +020013089 assert(0 && "PyUnicode_READY fail in PyUnicode_InternInPlace");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013090 return;
13091 }
Victor Stinner1b4f9ce2011-10-03 13:28:14 +020013092 s = (PyUnicodeObject *)(*p);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013093 if (interned == NULL) {
13094 interned = PyDict_New();
13095 if (interned == NULL) {
13096 PyErr_Clear(); /* Don't leave an exception */
13097 return;
13098 }
13099 }
13100 /* It might be that the GetItem call fails even
13101 though the key is present in the dictionary,
13102 namely when this happens during a stack overflow. */
13103 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +000013104 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013105 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000013106
Benjamin Peterson29060642009-01-31 22:14:21 +000013107 if (t) {
13108 Py_INCREF(t);
13109 Py_DECREF(*p);
13110 *p = t;
13111 return;
13112 }
Walter Dörwald16807132007-05-25 13:52:07 +000013113
Benjamin Peterson14339b62009-01-31 16:36:08 +000013114 PyThreadState_GET()->recursion_critical = 1;
13115 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
13116 PyErr_Clear();
13117 PyThreadState_GET()->recursion_critical = 0;
13118 return;
13119 }
13120 PyThreadState_GET()->recursion_critical = 0;
13121 /* The two references in interned are not counted by refcnt.
13122 The deallocator will take care of this */
13123 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013124 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000013125}
13126
13127void
13128PyUnicode_InternImmortal(PyObject **p)
13129{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013130 PyUnicodeObject *u = (PyUnicodeObject *)*p;
13131
Benjamin Peterson14339b62009-01-31 16:36:08 +000013132 PyUnicode_InternInPlace(p);
13133 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013134 _PyUnicode_STATE(u).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013135 Py_INCREF(*p);
13136 }
Walter Dörwald16807132007-05-25 13:52:07 +000013137}
13138
13139PyObject *
13140PyUnicode_InternFromString(const char *cp)
13141{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013142 PyObject *s = PyUnicode_FromString(cp);
13143 if (s == NULL)
13144 return NULL;
13145 PyUnicode_InternInPlace(&s);
13146 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000013147}
13148
Alexander Belopolsky40018472011-02-26 01:02:56 +000013149void
13150_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000013151{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013152 PyObject *keys;
13153 PyUnicodeObject *s;
13154 Py_ssize_t i, n;
13155 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000013156
Benjamin Peterson14339b62009-01-31 16:36:08 +000013157 if (interned == NULL || !PyDict_Check(interned))
13158 return;
13159 keys = PyDict_Keys(interned);
13160 if (keys == NULL || !PyList_Check(keys)) {
13161 PyErr_Clear();
13162 return;
13163 }
Walter Dörwald16807132007-05-25 13:52:07 +000013164
Benjamin Peterson14339b62009-01-31 16:36:08 +000013165 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
13166 detector, interned unicode strings are not forcibly deallocated;
13167 rather, we give them their stolen references back, and then clear
13168 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000013169
Benjamin Peterson14339b62009-01-31 16:36:08 +000013170 n = PyList_GET_SIZE(keys);
13171 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000013172 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013173 for (i = 0; i < n; i++) {
13174 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013175 if (PyUnicode_READY(s) == -1)
13176 fprintf(stderr, "could not ready string\n");
13177 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013178 case SSTATE_NOT_INTERNED:
13179 /* XXX Shouldn't happen */
13180 break;
13181 case SSTATE_INTERNED_IMMORTAL:
13182 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013183 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013184 break;
13185 case SSTATE_INTERNED_MORTAL:
13186 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013187 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013188 break;
13189 default:
13190 Py_FatalError("Inconsistent interned string state.");
13191 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013192 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013193 }
13194 fprintf(stderr, "total size of all interned strings: "
13195 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
13196 "mortal/immortal\n", mortal_size, immortal_size);
13197 Py_DECREF(keys);
13198 PyDict_Clear(interned);
13199 Py_DECREF(interned);
13200 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000013201}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013202
13203
13204/********************* Unicode Iterator **************************/
13205
13206typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013207 PyObject_HEAD
13208 Py_ssize_t it_index;
13209 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013210} unicodeiterobject;
13211
13212static void
13213unicodeiter_dealloc(unicodeiterobject *it)
13214{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013215 _PyObject_GC_UNTRACK(it);
13216 Py_XDECREF(it->it_seq);
13217 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013218}
13219
13220static int
13221unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
13222{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013223 Py_VISIT(it->it_seq);
13224 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013225}
13226
13227static PyObject *
13228unicodeiter_next(unicodeiterobject *it)
13229{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013230 PyUnicodeObject *seq;
13231 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013232
Benjamin Peterson14339b62009-01-31 16:36:08 +000013233 assert(it != NULL);
13234 seq = it->it_seq;
13235 if (seq == NULL)
13236 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013237 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013238
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013239 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
13240 int kind = PyUnicode_KIND(seq);
13241 void *data = PyUnicode_DATA(seq);
13242 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
13243 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013244 if (item != NULL)
13245 ++it->it_index;
13246 return item;
13247 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013248
Benjamin Peterson14339b62009-01-31 16:36:08 +000013249 Py_DECREF(seq);
13250 it->it_seq = NULL;
13251 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013252}
13253
13254static PyObject *
13255unicodeiter_len(unicodeiterobject *it)
13256{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013257 Py_ssize_t len = 0;
13258 if (it->it_seq)
13259 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
13260 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013261}
13262
13263PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
13264
13265static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013266 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000013267 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000013268 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013269};
13270
13271PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013272 PyVarObject_HEAD_INIT(&PyType_Type, 0)
13273 "str_iterator", /* tp_name */
13274 sizeof(unicodeiterobject), /* tp_basicsize */
13275 0, /* tp_itemsize */
13276 /* methods */
13277 (destructor)unicodeiter_dealloc, /* tp_dealloc */
13278 0, /* tp_print */
13279 0, /* tp_getattr */
13280 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000013281 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013282 0, /* tp_repr */
13283 0, /* tp_as_number */
13284 0, /* tp_as_sequence */
13285 0, /* tp_as_mapping */
13286 0, /* tp_hash */
13287 0, /* tp_call */
13288 0, /* tp_str */
13289 PyObject_GenericGetAttr, /* tp_getattro */
13290 0, /* tp_setattro */
13291 0, /* tp_as_buffer */
13292 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
13293 0, /* tp_doc */
13294 (traverseproc)unicodeiter_traverse, /* tp_traverse */
13295 0, /* tp_clear */
13296 0, /* tp_richcompare */
13297 0, /* tp_weaklistoffset */
13298 PyObject_SelfIter, /* tp_iter */
13299 (iternextfunc)unicodeiter_next, /* tp_iternext */
13300 unicodeiter_methods, /* tp_methods */
13301 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013302};
13303
13304static PyObject *
13305unicode_iter(PyObject *seq)
13306{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013307 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013308
Benjamin Peterson14339b62009-01-31 16:36:08 +000013309 if (!PyUnicode_Check(seq)) {
13310 PyErr_BadInternalCall();
13311 return NULL;
13312 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013313 if (PyUnicode_READY(seq) == -1)
13314 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013315 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
13316 if (it == NULL)
13317 return NULL;
13318 it->it_index = 0;
13319 Py_INCREF(seq);
13320 it->it_seq = (PyUnicodeObject *)seq;
13321 _PyObject_GC_TRACK(it);
13322 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013323}
13324
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013325#define UNIOP(x) Py_UNICODE_##x
13326#define UNIOP_t Py_UNICODE
13327#include "uniops.h"
13328#undef UNIOP
13329#undef UNIOP_t
13330#define UNIOP(x) Py_UCS4_##x
13331#define UNIOP_t Py_UCS4
13332#include "uniops.h"
13333#undef UNIOP
13334#undef UNIOP_t
Victor Stinner331ea922010-08-10 16:37:20 +000013335
Victor Stinner71133ff2010-09-01 23:43:53 +000013336Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000013337PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000013338{
13339 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
13340 Py_UNICODE *copy;
13341 Py_ssize_t size;
13342
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013343 if (!PyUnicode_Check(unicode)) {
13344 PyErr_BadArgument();
13345 return NULL;
13346 }
Victor Stinner71133ff2010-09-01 23:43:53 +000013347 /* Ensure we won't overflow the size. */
13348 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
13349 PyErr_NoMemory();
13350 return NULL;
13351 }
13352 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
13353 size *= sizeof(Py_UNICODE);
13354 copy = PyMem_Malloc(size);
13355 if (copy == NULL) {
13356 PyErr_NoMemory();
13357 return NULL;
13358 }
13359 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
13360 return copy;
13361}
Martin v. Löwis5b222132007-06-10 09:51:05 +000013362
Georg Brandl66c221e2010-10-14 07:04:07 +000013363/* A _string module, to export formatter_parser and formatter_field_name_split
13364 to the string.Formatter class implemented in Python. */
13365
13366static PyMethodDef _string_methods[] = {
13367 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
13368 METH_O, PyDoc_STR("split the argument as a field name")},
13369 {"formatter_parser", (PyCFunction) formatter_parser,
13370 METH_O, PyDoc_STR("parse the argument as a format string")},
13371 {NULL, NULL}
13372};
13373
13374static struct PyModuleDef _string_module = {
13375 PyModuleDef_HEAD_INIT,
13376 "_string",
13377 PyDoc_STR("string helper module"),
13378 0,
13379 _string_methods,
13380 NULL,
13381 NULL,
13382 NULL,
13383 NULL
13384};
13385
13386PyMODINIT_FUNC
13387PyInit__string(void)
13388{
13389 return PyModule_Create(&_string_module);
13390}
13391
13392
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013393#ifdef __cplusplus
13394}
13395#endif