blob: 84c8dcadee273ba6e9511cd45737e11971ef5970 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000044
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000045#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000046#include <windows.h>
47#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000048
Guido van Rossumd57fd912000-03-10 22:53:23 +000049/* Limit for the Unicode object free list */
50
Christian Heimes2202f872008-02-06 14:31:34 +000051#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000052
53/* Limit for the Unicode object free list stay alive optimization.
54
55 The implementation will keep allocated Unicode memory intact for
56 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000057 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000058
Christian Heimes2202f872008-02-06 14:31:34 +000059 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000060 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000061 malloc()-overhead) bytes of unused garbage.
62
63 Setting the limit to 0 effectively turns the feature off.
64
Guido van Rossumfd4b9572000-04-10 13:51:10 +000065 Note: This is an experimental feature ! If you get core dumps when
66 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000067
68*/
69
Guido van Rossumfd4b9572000-04-10 13:51:10 +000070#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000071
72/* Endianness switches; defaults to little endian */
73
74#ifdef WORDS_BIGENDIAN
75# define BYTEORDER_IS_BIG_ENDIAN
76#else
77# define BYTEORDER_IS_LITTLE_ENDIAN
78#endif
79
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000080/* --- Globals ------------------------------------------------------------
81
82 The globals are initialized by the _PyUnicode_Init() API and should
83 not be used before calling that API.
84
85*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000086
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000087
88#ifdef __cplusplus
89extern "C" {
90#endif
91
Victor Stinner910337b2011-10-03 03:20:16 +020092#ifdef Py_DEBUG
93# define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op)
94#else
95# define _PyUnicode_CHECK(op) PyUnicode_Check(op)
96#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020097
Victor Stinnere90fe6a2011-10-01 16:48:13 +020098#define _PyUnicode_UTF8(op) \
99 (((PyCompactUnicodeObject*)(op))->utf8)
100#define PyUnicode_UTF8(op) \
Victor Stinner910337b2011-10-03 03:20:16 +0200101 (assert(_PyUnicode_CHECK(op)), \
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200102 assert(PyUnicode_IS_READY(op)), \
103 PyUnicode_IS_COMPACT_ASCII(op) ? \
104 ((char*)((PyASCIIObject*)(op) + 1)) : \
105 _PyUnicode_UTF8(op))
Victor Stinnerbc8b81b2011-09-29 19:31:34 +0200106#define _PyUnicode_UTF8_LENGTH(op) \
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200107 (((PyCompactUnicodeObject*)(op))->utf8_length)
108#define PyUnicode_UTF8_LENGTH(op) \
Victor Stinner910337b2011-10-03 03:20:16 +0200109 (assert(_PyUnicode_CHECK(op)), \
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200110 assert(PyUnicode_IS_READY(op)), \
111 PyUnicode_IS_COMPACT_ASCII(op) ? \
112 ((PyASCIIObject*)(op))->length : \
113 _PyUnicode_UTF8_LENGTH(op))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200114#define _PyUnicode_WSTR(op) (((PyASCIIObject*)(op))->wstr)
115#define _PyUnicode_WSTR_LENGTH(op) (((PyCompactUnicodeObject*)(op))->wstr_length)
116#define _PyUnicode_LENGTH(op) (((PyASCIIObject *)(op))->length)
117#define _PyUnicode_STATE(op) (((PyASCIIObject *)(op))->state)
118#define _PyUnicode_HASH(op) (((PyASCIIObject *)(op))->hash)
Victor Stinner910337b2011-10-03 03:20:16 +0200119#define _PyUnicode_KIND(op) \
120 (assert(_PyUnicode_CHECK(op)), \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200121 ((PyASCIIObject *)(op))->state.kind)
Victor Stinner910337b2011-10-03 03:20:16 +0200122#define _PyUnicode_GET_LENGTH(op) \
123 (assert(_PyUnicode_CHECK(op)), \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200124 ((PyASCIIObject *)(op))->length)
Victor Stinnerc3c74152011-10-02 20:39:55 +0200125#define _PyUnicode_DATA_ANY(op) (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200126
Victor Stinner910337b2011-10-03 03:20:16 +0200127#undef PyUnicode_READY
128#define PyUnicode_READY(op) \
129 (assert(_PyUnicode_CHECK(op)), \
130 (PyUnicode_IS_READY(op) ? \
131 0 : _PyUnicode_Ready((PyObject *)(op))))
132
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200133#define _PyUnicode_READY_REPLACE(p_obj) \
134 (assert(_PyUnicode_CHECK(*p_obj)), \
135 (PyUnicode_IS_READY(*p_obj) ? \
136 0 : _PyUnicode_ReadyReplace((PyObject **)(p_obj))))
137
Victor Stinnerc379ead2011-10-03 12:52:27 +0200138#define _PyUnicode_SHARE_UTF8(op) \
139 (assert(_PyUnicode_CHECK(op)), \
140 assert(!PyUnicode_IS_COMPACT_ASCII(op)), \
141 (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* true if the wstr pointer of the Unicode object is the very same pointer
   as its canonical data buffer (i.e. wstr is shared, not separately
   allocated).  The argument is referenced only through the macro
   parameter, so the macro works whatever the caller's variable is named. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
145
Victor Stinner829c0ad2011-10-03 01:08:02 +0200146/* true if the Unicode object has an allocated UTF-8 memory block
147 (not shared with other data) */
Victor Stinner910337b2011-10-03 03:20:16 +0200148#define _PyUnicode_HAS_UTF8_MEMORY(op) \
149 (assert(_PyUnicode_CHECK(op)), \
150 (!PyUnicode_IS_COMPACT_ASCII(op) \
151 && _PyUnicode_UTF8(op) \
Victor Stinner829c0ad2011-10-03 01:08:02 +0200152 && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))
153
Victor Stinner03490912011-10-03 23:45:12 +0200154/* true if the Unicode object has an allocated wstr memory block
155 (not shared with other data) */
156#define _PyUnicode_HAS_WSTR_MEMORY(op) \
157 (assert(_PyUnicode_CHECK(op)), \
158 (_PyUnicode_WSTR(op) && \
159 (!PyUnicode_IS_READY(op) || \
160 _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
161
Victor Stinner910337b2011-10-03 03:20:16 +0200162/* Generic helper macro to convert characters of different types.
163 from_type and to_type have to be valid type names, begin and end
164 are pointers to the source characters which should be of type
165 "from_type *". to is a pointer of type "to_type *" and points to the
166 buffer where the result characters are written to. */
167#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
168 do { \
169 const from_type *iter_; to_type *to_; \
170 for (iter_ = (begin), to_ = (to_type *)(to); \
171 iter_ < (end); \
172 ++iter_, ++to_) { \
173 *to_ = (to_type)*iter_; \
174 } \
175 } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200176
Victor Stinnerb15d4d82011-09-28 23:59:20 +0200177/* The Unicode string has been modified: reset the hash */
178#define _PyUnicode_DIRTY(op) do { _PyUnicode_HASH(op) = -1; } while (0)
179
Walter Dörwald16807132007-05-25 13:52:07 +0000180/* This dictionary holds all interned unicode strings. Note that references
181 to strings in this dictionary are *not* counted in the string's ob_refcnt.
182 When the interned string reaches a refcnt of 0 the string deallocation
183 function will delete the reference from this dictionary.
184
   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000187*/
188static PyObject *interned;
189
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000190/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200191static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000192
193/* Single character Unicode strings in the Latin-1 range are being
194 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200195static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000196
Christian Heimes190d79e2008-01-30 11:58:22 +0000197/* Fast detection of the most frequent whitespace characters */
198const unsigned char _Py_ascii_whitespace[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +0000199 0, 0, 0, 0, 0, 0, 0, 0,
Florent Xicluna806d8cf2010-03-30 19:34:18 +0000200/* case 0x0009: * CHARACTER TABULATION */
Christian Heimes1a8501c2008-10-02 19:56:01 +0000201/* case 0x000A: * LINE FEED */
Florent Xicluna806d8cf2010-03-30 19:34:18 +0000202/* case 0x000B: * LINE TABULATION */
Christian Heimes1a8501c2008-10-02 19:56:01 +0000203/* case 0x000C: * FORM FEED */
204/* case 0x000D: * CARRIAGE RETURN */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000205 0, 1, 1, 1, 1, 1, 0, 0,
206 0, 0, 0, 0, 0, 0, 0, 0,
Christian Heimes1a8501c2008-10-02 19:56:01 +0000207/* case 0x001C: * FILE SEPARATOR */
208/* case 0x001D: * GROUP SEPARATOR */
209/* case 0x001E: * RECORD SEPARATOR */
210/* case 0x001F: * UNIT SEPARATOR */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000211 0, 0, 0, 0, 1, 1, 1, 1,
Christian Heimes1a8501c2008-10-02 19:56:01 +0000212/* case 0x0020: * SPACE */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000213 1, 0, 0, 0, 0, 0, 0, 0,
214 0, 0, 0, 0, 0, 0, 0, 0,
215 0, 0, 0, 0, 0, 0, 0, 0,
216 0, 0, 0, 0, 0, 0, 0, 0,
Christian Heimes190d79e2008-01-30 11:58:22 +0000217
Benjamin Peterson14339b62009-01-31 16:36:08 +0000218 0, 0, 0, 0, 0, 0, 0, 0,
219 0, 0, 0, 0, 0, 0, 0, 0,
220 0, 0, 0, 0, 0, 0, 0, 0,
221 0, 0, 0, 0, 0, 0, 0, 0,
222 0, 0, 0, 0, 0, 0, 0, 0,
223 0, 0, 0, 0, 0, 0, 0, 0,
224 0, 0, 0, 0, 0, 0, 0, 0,
225 0, 0, 0, 0, 0, 0, 0, 0
Christian Heimes190d79e2008-01-30 11:58:22 +0000226};
227
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200228/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200229static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200230static PyObject* get_latin1_char(unsigned char ch);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200231
Alexander Belopolsky40018472011-02-26 01:02:56 +0000232static PyObject *
233unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000234 PyObject **errorHandler,const char *encoding, const char *reason,
235 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
236 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
237
Alexander Belopolsky40018472011-02-26 01:02:56 +0000238static void
239raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300240 const char *encoding,
241 const Py_UNICODE *unicode, Py_ssize_t size,
242 Py_ssize_t startpos, Py_ssize_t endpos,
243 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000244
Christian Heimes190d79e2008-01-30 11:58:22 +0000245/* Same for linebreaks */
246static unsigned char ascii_linebreak[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +0000247 0, 0, 0, 0, 0, 0, 0, 0,
Christian Heimes1a8501c2008-10-02 19:56:01 +0000248/* 0x000A, * LINE FEED */
Florent Xicluna806d8cf2010-03-30 19:34:18 +0000249/* 0x000B, * LINE TABULATION */
250/* 0x000C, * FORM FEED */
Christian Heimes1a8501c2008-10-02 19:56:01 +0000251/* 0x000D, * CARRIAGE RETURN */
Florent Xicluna806d8cf2010-03-30 19:34:18 +0000252 0, 0, 1, 1, 1, 1, 0, 0,
Benjamin Peterson14339b62009-01-31 16:36:08 +0000253 0, 0, 0, 0, 0, 0, 0, 0,
Christian Heimes1a8501c2008-10-02 19:56:01 +0000254/* 0x001C, * FILE SEPARATOR */
255/* 0x001D, * GROUP SEPARATOR */
256/* 0x001E, * RECORD SEPARATOR */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000257 0, 0, 0, 0, 1, 1, 1, 0,
258 0, 0, 0, 0, 0, 0, 0, 0,
259 0, 0, 0, 0, 0, 0, 0, 0,
260 0, 0, 0, 0, 0, 0, 0, 0,
261 0, 0, 0, 0, 0, 0, 0, 0,
Christian Heimes190d79e2008-01-30 11:58:22 +0000262
Benjamin Peterson14339b62009-01-31 16:36:08 +0000263 0, 0, 0, 0, 0, 0, 0, 0,
264 0, 0, 0, 0, 0, 0, 0, 0,
265 0, 0, 0, 0, 0, 0, 0, 0,
266 0, 0, 0, 0, 0, 0, 0, 0,
267 0, 0, 0, 0, 0, 0, 0, 0,
268 0, 0, 0, 0, 0, 0, 0, 0,
269 0, 0, 0, 0, 0, 0, 0, 0,
270 0, 0, 0, 0, 0, 0, 0, 0
Christian Heimes190d79e2008-01-30 11:58:22 +0000271};
272
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300273/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
274 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000275Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000276PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000277{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000278#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000279 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000280#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000281 /* This is actually an illegal character, so it should
282 not be passed to unichr. */
283 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000284#endif
285}
286
#ifdef Py_DEBUG
/* Debug-build sanity check of the internal state of a Unicode object.

   Every violated invariant trips an assert().  When all checks pass the
   function returns 1, which lets it be used as the expression inside
   assert() itself (see _PyUnicode_CHECK). */
static int
_PyUnicode_CheckConsistency(void *op)
{
    PyASCIIObject *ascii;
    PyCompactUnicodeObject *compact;
    void *data;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    /* Compact ASCII objects only need their kind and ready flag checked. */
    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
        return 1;
    }

    compact = (PyCompactUnicodeObject *)op;

    if (ascii->state.compact == 1) {
        /* compact, non-ASCII: characters follow the compact header */
        data = compact + 1;
        assert(kind == PyUnicode_1BYTE_KIND
               || kind == PyUnicode_2BYTE_KIND
               || kind == PyUnicode_4BYTE_KIND);
        assert(ascii->state.ascii == 0);
        assert(ascii->state.ready == 1);
        assert (compact->utf8 != data);
    }
    else {
        PyUnicodeObject *unicode = (PyUnicodeObject *)op;

        data = unicode->data.any;
        if (kind == PyUnicode_WCHAR_KIND) {
            /* not ready yet: only the wstr buffer exists */
            assert(ascii->state.compact == 0);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 0);
            assert(ascii->wstr != NULL);
            assert(data == NULL);
            assert(compact->utf8 == NULL);
            assert(ascii->state.interned == SSTATE_NOT_INTERNED);
        }
        else {
            /* ready, with a separately held data buffer */
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.compact == 0);
            assert(ascii->state.ready == 1);
            assert(data != NULL);
            if (ascii->state.ascii) {
                assert (compact->utf8 == data);
                assert (compact->utf8_length == ascii->length);
            }
            else
                assert (compact->utf8 != data);
        }
    }

    if (kind != PyUnicode_WCHAR_KIND) {
        /* wstr must alias the data exactly when the kind has the same
           width as wchar_t */
#if SIZEOF_WCHAR_T == 2
        if (kind == PyUnicode_2BYTE_KIND)
#else
        if (kind == PyUnicode_4BYTE_KIND)
#endif
        {
            assert(ascii->wstr == data);
            assert(compact->wstr_length == ascii->length);
        } else
            assert(ascii->wstr != data);
    }

    /* absent caches must report a zero length */
    if (compact->utf8 == NULL)
        assert(compact->utf8_length == 0);
    if (ascii->wstr == NULL)
        assert(compact->wstr_length == 0);

    return 1;
}
#endif
366
Thomas Wouters477c8d52006-05-27 19:21:47 +0000367/* --- Bloom Filters ----------------------------------------------------- */
368
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the low
   bits of each Unicode character (ch & (BLOOM_WIDTH - 1)) as the
   bit index. */
372
373/* the linebreak mask is set up by Unicode_Init below */
374
Antoine Pitrouf068f942010-01-13 14:19:12 +0000375#if LONG_BIT >= 128
376#define BLOOM_WIDTH 128
377#elif LONG_BIT >= 64
378#define BLOOM_WIDTH 64
379#elif LONG_BIT >= 32
380#define BLOOM_WIDTH 32
381#else
382#error "LONG_BIT is smaller than 32"
383#endif
384
Thomas Wouters477c8d52006-05-27 19:21:47 +0000385#define BLOOM_MASK unsigned long
386
387static BLOOM_MASK bloom_linebreak;
388
Antoine Pitrouf068f942010-01-13 14:19:12 +0000389#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
390#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000391
Benjamin Peterson29060642009-01-31 22:14:21 +0000392#define BLOOM_LINEBREAK(ch) \
393 ((ch) < 128U ? ascii_linebreak[(ch)] : \
394 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000395
Alexander Belopolsky40018472011-02-26 01:02:56 +0000396Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200397make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000398{
399 /* calculate simple bloom-style bitmask for a given unicode string */
400
Antoine Pitrouf068f942010-01-13 14:19:12 +0000401 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000402 Py_ssize_t i;
403
404 mask = 0;
405 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200406 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000407
408 return mask;
409}
410
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200411#define BLOOM_MEMBER(mask, chr, str) \
412 (BLOOM(mask, chr) \
413 && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000414
Guido van Rossumd57fd912000-03-10 22:53:23 +0000415/* --- Unicode Object ----------------------------------------------------- */
416
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200417static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200418fixup(PyUnicodeObject *self, Py_UCS4 (*fixfct)(PyUnicodeObject *s));
419
420Py_LOCAL_INLINE(char *) findchar(void *s, int kind,
421 Py_ssize_t size, Py_UCS4 ch,
422 int direction)
423{
424 /* like wcschr, but doesn't stop at NULL characters */
425 Py_ssize_t i;
426 if (direction == 1) {
427 for(i = 0; i < size; i++)
428 if (PyUnicode_READ(kind, s, i) == ch)
429 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
430 }
431 else {
432 for(i = size-1; i >= 0; i--)
433 if (PyUnicode_READ(kind, s, i) == ch)
434 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
435 }
436 return NULL;
437}
438
Victor Stinnerfe226c02011-10-03 03:52:20 +0200439static PyObject*
440resize_compact(PyObject *unicode, Py_ssize_t length)
441{
442 Py_ssize_t char_size;
443 Py_ssize_t struct_size;
444 Py_ssize_t new_size;
445 int share_wstr;
446
447 assert(PyUnicode_IS_READY(unicode));
448 char_size = PyUnicode_CHARACTER_SIZE(unicode);
449 if (PyUnicode_IS_COMPACT_ASCII(unicode))
450 struct_size = sizeof(PyASCIIObject);
451 else
452 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200453 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200454
455 _Py_DEC_REFTOTAL;
456 _Py_ForgetReference(unicode);
457
458 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
459 PyErr_NoMemory();
460 return NULL;
461 }
462 new_size = (struct_size + (length + 1) * char_size);
463
464 unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
465 if (unicode == NULL) {
466 PyObject_Del(unicode);
467 PyErr_NoMemory();
468 return NULL;
469 }
470 _Py_NewReference(unicode);
471 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200472 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200473 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200474 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
475 _PyUnicode_WSTR_LENGTH(unicode) = length;
476 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200477 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
478 length, 0);
479 return unicode;
480}
481
Alexander Belopolsky40018472011-02-26 01:02:56 +0000482static int
Victor Stinner95663112011-10-04 01:03:50 +0200483resize_inplace(PyUnicodeObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000484{
Victor Stinner95663112011-10-04 01:03:50 +0200485 wchar_t *wstr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200486 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200487 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000488
Victor Stinner95663112011-10-04 01:03:50 +0200489 _PyUnicode_DIRTY(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200490
491 if (PyUnicode_IS_READY(unicode)) {
492 Py_ssize_t char_size;
493 Py_ssize_t new_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200494 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200495 void *data;
496
497 data = _PyUnicode_DATA_ANY(unicode);
498 assert(data != NULL);
499 char_size = PyUnicode_CHARACTER_SIZE(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200500 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
501 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinner95663112011-10-04 01:03:50 +0200502 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
503 {
504 PyObject_DEL(_PyUnicode_UTF8(unicode));
505 _PyUnicode_UTF8(unicode) = NULL;
506 _PyUnicode_UTF8_LENGTH(unicode) = 0;
507 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200508
509 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
510 PyErr_NoMemory();
511 return -1;
512 }
513 new_size = (length + 1) * char_size;
514
515 data = (PyObject *)PyObject_REALLOC(data, new_size);
516 if (data == NULL) {
517 PyErr_NoMemory();
518 return -1;
519 }
520 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200521 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200522 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200523 _PyUnicode_WSTR_LENGTH(unicode) = length;
524 }
525 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200526 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200527 _PyUnicode_UTF8_LENGTH(unicode) = length;
528 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200529 _PyUnicode_LENGTH(unicode) = length;
530 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinner95663112011-10-04 01:03:50 +0200531 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
532 _PyUnicode_CHECK(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200533 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200534 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200535 }
Victor Stinner95663112011-10-04 01:03:50 +0200536 assert(_PyUnicode_WSTR(unicode) != NULL);
537
538 /* check for integer overflow */
539 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
540 PyErr_NoMemory();
541 return -1;
542 }
543 wstr = _PyUnicode_WSTR(unicode);
544 wstr = PyObject_REALLOC(wstr, sizeof(wchar_t) * (length + 1));
545 if (!wstr) {
546 PyErr_NoMemory();
547 return -1;
548 }
549 _PyUnicode_WSTR(unicode) = wstr;
550 _PyUnicode_WSTR(unicode)[length] = 0;
551 _PyUnicode_WSTR_LENGTH(unicode) = length;
552 _PyUnicode_CHECK(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000553 return 0;
554}
555
Victor Stinnerfe226c02011-10-03 03:52:20 +0200556static PyObject*
557resize_copy(PyObject *unicode, Py_ssize_t length)
558{
559 Py_ssize_t copy_length;
560 if (PyUnicode_IS_COMPACT(unicode)) {
561 PyObject *copy;
562 assert(PyUnicode_IS_READY(unicode));
563
564 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
565 if (copy == NULL)
566 return NULL;
567
568 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
569 if (PyUnicode_CopyCharacters(copy, 0,
570 unicode, 0,
571 copy_length) < 0)
572 {
573 Py_DECREF(copy);
574 return NULL;
575 }
576 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200577 }
578 else {
Victor Stinner2fd82272011-10-03 04:06:05 +0200579 PyUnicodeObject *w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200580 assert(_PyUnicode_WSTR(unicode) != NULL);
581 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinner2fd82272011-10-03 04:06:05 +0200582 w = _PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200583 if (w == NULL)
584 return NULL;
585 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
586 copy_length = Py_MIN(copy_length, length);
587 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
588 copy_length);
589 return (PyObject*)w;
590 }
591}
592
Guido van Rossumd57fd912000-03-10 22:53:23 +0000593/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000594 Ux0000 terminated; some code (e.g. new_identifier)
595 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000596
597 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000598 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000599
600*/
601
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200602#ifdef Py_DEBUG
603int unicode_old_new_calls = 0;
604#endif
605
Alexander Belopolsky40018472011-02-26 01:02:56 +0000606static PyUnicodeObject *
607_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000608{
609 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200610 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000611
Thomas Wouters477c8d52006-05-27 19:21:47 +0000612 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000613 if (length == 0 && unicode_empty != NULL) {
614 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200615 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000616 }
617
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000618 /* Ensure we won't overflow the size. */
619 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
620 return (PyUnicodeObject *)PyErr_NoMemory();
621 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200622 if (length < 0) {
623 PyErr_SetString(PyExc_SystemError,
624 "Negative size passed to _PyUnicode_New");
625 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000626 }
627
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200628#ifdef Py_DEBUG
629 ++unicode_old_new_calls;
630#endif
631
632 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
633 if (unicode == NULL)
634 return NULL;
635 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
636 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
637 if (!_PyUnicode_WSTR(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000638 PyErr_NoMemory();
639 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000640 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200641
Jeremy Hyltond8082792003-09-16 19:41:39 +0000642 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000643 * the caller fails before initializing str -- unicode_resize()
644 * reads str[0], and the Keep-Alive optimization can keep memory
645 * allocated for str alive across a call to unicode_dealloc(unicode).
646 * We don't want unicode_resize to read uninitialized memory in
647 * that case.
648 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200649 _PyUnicode_WSTR(unicode)[0] = 0;
650 _PyUnicode_WSTR(unicode)[length] = 0;
651 _PyUnicode_WSTR_LENGTH(unicode) = length;
652 _PyUnicode_HASH(unicode) = -1;
653 _PyUnicode_STATE(unicode).interned = 0;
654 _PyUnicode_STATE(unicode).kind = 0;
655 _PyUnicode_STATE(unicode).compact = 0;
656 _PyUnicode_STATE(unicode).ready = 0;
657 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200658 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200659 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200660 _PyUnicode_UTF8(unicode) = NULL;
661 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000662 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000663
Benjamin Peterson29060642009-01-31 22:14:21 +0000664 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000665 /* XXX UNREF/NEWREF interface should be more symmetrical */
666 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000667 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000668 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000669 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000670}
671
Victor Stinnerf42dc442011-10-02 23:33:16 +0200672static const char*
673unicode_kind_name(PyObject *unicode)
674{
Victor Stinner42dfd712011-10-03 14:41:45 +0200675 /* don't check consistency: unicode_kind_name() is called from
676 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200677 if (!PyUnicode_IS_COMPACT(unicode))
678 {
679 if (!PyUnicode_IS_READY(unicode))
680 return "wstr";
681 switch(PyUnicode_KIND(unicode))
682 {
683 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200684 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200685 return "legacy ascii";
686 else
687 return "legacy latin1";
688 case PyUnicode_2BYTE_KIND:
689 return "legacy UCS2";
690 case PyUnicode_4BYTE_KIND:
691 return "legacy UCS4";
692 default:
693 return "<legacy invalid kind>";
694 }
695 }
696 assert(PyUnicode_IS_READY(unicode));
697 switch(PyUnicode_KIND(unicode))
698 {
699 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200700 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200701 return "ascii";
702 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200703 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200704 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200705 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200706 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200707 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200708 default:
709 return "<invalid compact kind>";
710 }
711}
712
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200713#ifdef Py_DEBUG
714int unicode_new_new_calls = 0;
715
716/* Functions wrapping macros for use in debugger */
717char *_PyUnicode_utf8(void *unicode){
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200718 return PyUnicode_UTF8(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200719}
720
721void *_PyUnicode_compact_data(void *unicode) {
722 return _PyUnicode_COMPACT_DATA(unicode);
723}
724void *_PyUnicode_data(void *unicode){
725 printf("obj %p\n", unicode);
726 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
727 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
728 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
729 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
730 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
731 return PyUnicode_DATA(unicode);
732}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200733
734void
735_PyUnicode_Dump(PyObject *op)
736{
737 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200738 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
739 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
740 void *data;
741 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
742 if (ascii->state.compact)
743 data = (compact + 1);
744 else
745 data = unicode->data.any;
746 if (ascii->wstr == data)
747 printf("shared ");
748 printf("wstr=%p", ascii->wstr);
Victor Stinnera3b334d2011-10-03 13:53:37 +0200749 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +0200750 printf(" (%zu), ", compact->wstr_length);
751 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
752 printf("shared ");
753 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200754 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200755 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200756}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200757#endif
758
759PyObject *
760PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
761{
762 PyObject *obj;
763 PyCompactUnicodeObject *unicode;
764 void *data;
765 int kind_state;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200766 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200767 Py_ssize_t char_size;
768 Py_ssize_t struct_size;
769
770 /* Optimization for empty strings */
771 if (size == 0 && unicode_empty != NULL) {
772 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200773 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200774 }
775
776#ifdef Py_DEBUG
777 ++unicode_new_new_calls;
778#endif
779
Victor Stinner9e9d6892011-10-04 01:02:02 +0200780 is_ascii = 0;
781 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200782 struct_size = sizeof(PyCompactUnicodeObject);
783 if (maxchar < 128) {
784 kind_state = PyUnicode_1BYTE_KIND;
785 char_size = 1;
786 is_ascii = 1;
787 struct_size = sizeof(PyASCIIObject);
788 }
789 else if (maxchar < 256) {
790 kind_state = PyUnicode_1BYTE_KIND;
791 char_size = 1;
792 }
793 else if (maxchar < 65536) {
794 kind_state = PyUnicode_2BYTE_KIND;
795 char_size = 2;
796 if (sizeof(wchar_t) == 2)
797 is_sharing = 1;
798 }
799 else {
800 kind_state = PyUnicode_4BYTE_KIND;
801 char_size = 4;
802 if (sizeof(wchar_t) == 4)
803 is_sharing = 1;
804 }
805
806 /* Ensure we won't overflow the size. */
807 if (size < 0) {
808 PyErr_SetString(PyExc_SystemError,
809 "Negative size passed to PyUnicode_New");
810 return NULL;
811 }
812 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
813 return PyErr_NoMemory();
814
815 /* Duplicated allocation code from _PyObject_New() instead of a call to
816 * PyObject_New() so we are able to allocate space for the object and
817 * it's data buffer.
818 */
819 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
820 if (obj == NULL)
821 return PyErr_NoMemory();
822 obj = PyObject_INIT(obj, &PyUnicode_Type);
823 if (obj == NULL)
824 return NULL;
825
826 unicode = (PyCompactUnicodeObject *)obj;
827 if (is_ascii)
828 data = ((PyASCIIObject*)obj) + 1;
829 else
830 data = unicode + 1;
831 _PyUnicode_LENGTH(unicode) = size;
832 _PyUnicode_HASH(unicode) = -1;
833 _PyUnicode_STATE(unicode).interned = 0;
834 _PyUnicode_STATE(unicode).kind = kind_state;
835 _PyUnicode_STATE(unicode).compact = 1;
836 _PyUnicode_STATE(unicode).ready = 1;
837 _PyUnicode_STATE(unicode).ascii = is_ascii;
838 if (is_ascii) {
839 ((char*)data)[size] = 0;
840 _PyUnicode_WSTR(unicode) = NULL;
841 }
842 else if (kind_state == PyUnicode_1BYTE_KIND) {
843 ((char*)data)[size] = 0;
844 _PyUnicode_WSTR(unicode) = NULL;
845 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200846 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200847 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200848 }
849 else {
850 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200851 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200852 if (kind_state == PyUnicode_2BYTE_KIND)
853 ((Py_UCS2*)data)[size] = 0;
854 else /* kind_state == PyUnicode_4BYTE_KIND */
855 ((Py_UCS4*)data)[size] = 0;
856 if (is_sharing) {
857 _PyUnicode_WSTR_LENGTH(unicode) = size;
858 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
859 }
860 else {
861 _PyUnicode_WSTR_LENGTH(unicode) = 0;
862 _PyUnicode_WSTR(unicode) = NULL;
863 }
864 }
865 return obj;
866}
867
868#if SIZEOF_WCHAR_T == 2
869/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
870 will decode surrogate pairs, the other conversions are implemented as macros
871 for efficency.
872
873 This function assumes that unicode can hold one more code point than wstr
874 characters for a terminating null character. */
Victor Stinnerc53be962011-10-02 21:33:54 +0200875static void
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200876unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
877 PyUnicodeObject *unicode)
878{
879 const wchar_t *iter;
880 Py_UCS4 *ucs4_out;
881
Victor Stinner910337b2011-10-03 03:20:16 +0200882 assert(unicode != NULL);
883 assert(_PyUnicode_CHECK(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200884 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
885 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
886
887 for (iter = begin; iter < end; ) {
888 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
889 _PyUnicode_GET_LENGTH(unicode)));
890 if (*iter >= 0xD800 && *iter <= 0xDBFF
891 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
892 {
893 *ucs4_out++ = (((iter[0] & 0x3FF)<<10) | (iter[1] & 0x3FF)) + 0x10000;
894 iter += 2;
895 }
896 else {
897 *ucs4_out++ = *iter;
898 iter++;
899 }
900 }
901 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
902 _PyUnicode_GET_LENGTH(unicode)));
903
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200904}
905#endif
906
Victor Stinnercd9950f2011-10-02 00:34:53 +0200907static int
908_PyUnicode_Dirty(PyObject *unicode)
909{
Victor Stinner910337b2011-10-03 03:20:16 +0200910 assert(_PyUnicode_CHECK(unicode));
Victor Stinnercd9950f2011-10-02 00:34:53 +0200911 if (Py_REFCNT(unicode) != 1) {
Victor Stinner01698042011-10-04 00:04:26 +0200912 PyErr_SetString(PyExc_SystemError,
Victor Stinnercd9950f2011-10-02 00:34:53 +0200913 "Cannot modify a string having more than 1 reference");
914 return -1;
915 }
916 _PyUnicode_DIRTY(unicode);
917 return 0;
918}
919
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200920Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200921PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
922 PyObject *from, Py_ssize_t from_start,
923 Py_ssize_t how_many)
924{
Victor Stinnera0702ab2011-09-29 14:14:38 +0200925 unsigned int from_kind, to_kind;
926 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200927
Victor Stinnerb1536152011-09-30 02:26:10 +0200928 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
929 PyErr_BadInternalCall();
930 return -1;
931 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200932
933 if (PyUnicode_READY(from))
934 return -1;
935 if (PyUnicode_READY(to))
936 return -1;
937
Victor Stinnerff9e50f2011-09-28 22:17:19 +0200938 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200939 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
Victor Stinner01698042011-10-04 00:04:26 +0200940 PyErr_Format(PyExc_SystemError,
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200941 "Cannot write %zi characters at %zi "
942 "in a string of %zi characters",
943 how_many, to_start, PyUnicode_GET_LENGTH(to));
944 return -1;
945 }
Victor Stinnerf5ca1a22011-09-28 23:54:59 +0200946 if (how_many == 0)
947 return 0;
948
Victor Stinnercd9950f2011-10-02 00:34:53 +0200949 if (_PyUnicode_Dirty(to))
Victor Stinnerf5ca1a22011-09-28 23:54:59 +0200950 return -1;
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200951
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200952 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200953 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200954 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200955 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200956
Victor Stinnerf42dc442011-10-02 23:33:16 +0200957 if (from_kind == to_kind
958 /* deny latin1 => ascii */
959 && PyUnicode_MAX_CHAR_VALUE(to) >= PyUnicode_MAX_CHAR_VALUE(from))
960 {
Victor Stinnera0702ab2011-09-29 14:14:38 +0200961 Py_MEMCPY((char*)to_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200962 + PyUnicode_KIND_SIZE(to_kind, to_start),
Victor Stinnera0702ab2011-09-29 14:14:38 +0200963 (char*)from_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200964 + PyUnicode_KIND_SIZE(from_kind, from_start),
965 PyUnicode_KIND_SIZE(to_kind, how_many));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200966 }
Victor Stinnera0702ab2011-09-29 14:14:38 +0200967 else if (from_kind == PyUnicode_1BYTE_KIND
968 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200969 {
970 _PyUnicode_CONVERT_BYTES(
971 Py_UCS1, Py_UCS2,
972 PyUnicode_1BYTE_DATA(from) + from_start,
973 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
974 PyUnicode_2BYTE_DATA(to) + to_start
975 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200976 }
Victor Stinner157f83f2011-09-28 21:41:31 +0200977 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200978 && to_kind == PyUnicode_4BYTE_KIND)
979 {
980 _PyUnicode_CONVERT_BYTES(
981 Py_UCS1, Py_UCS4,
982 PyUnicode_1BYTE_DATA(from) + from_start,
983 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
984 PyUnicode_4BYTE_DATA(to) + to_start
985 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200986 }
987 else if (from_kind == PyUnicode_2BYTE_KIND
988 && to_kind == PyUnicode_4BYTE_KIND)
989 {
990 _PyUnicode_CONVERT_BYTES(
991 Py_UCS2, Py_UCS4,
992 PyUnicode_2BYTE_DATA(from) + from_start,
993 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
994 PyUnicode_4BYTE_DATA(to) + to_start
995 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200996 }
Victor Stinnera0702ab2011-09-29 14:14:38 +0200997 else {
998 int invalid_kinds;
Victor Stinnerf42dc442011-10-02 23:33:16 +0200999
1000 /* check if max_char(from substring) <= max_char(to) */
1001 if (from_kind > to_kind
1002 /* latin1 => ascii */
Victor Stinnera3b334d2011-10-03 13:53:37 +02001003 || (PyUnicode_IS_ASCII(to)
Victor Stinnerf42dc442011-10-02 23:33:16 +02001004 && to_kind == PyUnicode_1BYTE_KIND
Victor Stinnera3b334d2011-10-03 13:53:37 +02001005 && !PyUnicode_IS_ASCII(from)))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001006 {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001007 /* slow path to check for character overflow */
1008 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1009 Py_UCS4 ch, maxchar;
1010 Py_ssize_t i;
1011
1012 maxchar = 0;
1013 invalid_kinds = 0;
1014 for (i=0; i < how_many; i++) {
1015 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1016 if (ch > maxchar) {
1017 maxchar = ch;
1018 if (maxchar > to_maxchar) {
1019 invalid_kinds = 1;
1020 break;
1021 }
1022 }
1023 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1024 }
1025 }
1026 else
1027 invalid_kinds = 1;
1028 if (invalid_kinds) {
Victor Stinner01698042011-10-04 00:04:26 +02001029 PyErr_Format(PyExc_SystemError,
Victor Stinnerf42dc442011-10-02 23:33:16 +02001030 "Cannot copy %s characters "
1031 "into a string of %s characters",
1032 unicode_kind_name(from),
1033 unicode_kind_name(to));
Victor Stinnera0702ab2011-09-29 14:14:38 +02001034 return -1;
1035 }
1036 }
1037 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001038}
1039
Victor Stinner17222162011-09-28 22:15:37 +02001040/* Find the maximum code point and count the number of surrogate pairs so a
1041 correct string length can be computed before converting a string to UCS4.
1042 This function counts single surrogates as a character and not as a pair.
1043
1044 Return 0 on success, or -1 on error. */
1045static int
1046find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1047 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001048{
1049 const wchar_t *iter;
1050
Victor Stinnerc53be962011-10-02 21:33:54 +02001051 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001052 if (num_surrogates == NULL || maxchar == NULL) {
1053 PyErr_SetString(PyExc_SystemError,
1054 "unexpected NULL arguments to "
1055 "PyUnicode_FindMaxCharAndNumSurrogatePairs");
1056 return -1;
1057 }
1058
1059 *num_surrogates = 0;
1060 *maxchar = 0;
1061
1062 for (iter = begin; iter < end; ) {
1063 if (*iter > *maxchar)
1064 *maxchar = *iter;
1065#if SIZEOF_WCHAR_T == 2
1066 if (*iter >= 0xD800 && *iter <= 0xDBFF
1067 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
1068 {
1069 Py_UCS4 surrogate_val;
1070 surrogate_val = (((iter[0] & 0x3FF)<<10)
1071 | (iter[1] & 0x3FF)) + 0x10000;
1072 ++(*num_surrogates);
1073 if (surrogate_val > *maxchar)
1074 *maxchar = surrogate_val;
1075 iter += 2;
1076 }
1077 else
1078 iter++;
1079#else
1080 iter++;
1081#endif
1082 }
1083 return 0;
1084}
1085
1086#ifdef Py_DEBUG
1087int unicode_ready_calls = 0;
1088#endif
1089
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001090static int
1091unicode_ready(PyObject **p_obj, int replace)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001092{
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001093 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001094 wchar_t *end;
1095 Py_UCS4 maxchar = 0;
1096 Py_ssize_t num_surrogates;
1097#if SIZEOF_WCHAR_T == 2
1098 Py_ssize_t length_wo_surrogates;
1099#endif
1100
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001101 assert(p_obj != NULL);
1102 unicode = (PyUnicodeObject *)*p_obj;
1103
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001104 /* _PyUnicode_Ready() is only intented for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001105 strings were created using _PyObject_New() and where no canonical
1106 representation (the str field) has been set yet aka strings
1107 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001108 assert(_PyUnicode_CHECK(unicode));
1109 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001110 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001111 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001112 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001113 /* Actually, it should neither be interned nor be anything else: */
1114 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001115
1116#ifdef Py_DEBUG
1117 ++unicode_ready_calls;
1118#endif
1119
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001120#ifdef Py_DEBUG
1121 assert(!replace || Py_REFCNT(unicode) == 1);
1122#else
1123 if (replace && Py_REFCNT(unicode) != 1)
1124 replace = 0;
1125#endif
1126 if (replace) {
1127 Py_ssize_t len = _PyUnicode_WSTR_LENGTH(unicode);
1128 wchar_t *wstr = _PyUnicode_WSTR(unicode);
1129 /* Optimization for empty strings */
1130 if (len == 0) {
1131 Py_INCREF(unicode_empty);
1132 Py_DECREF(*p_obj);
1133 *p_obj = unicode_empty;
1134 return 0;
1135 }
1136 if (len == 1 && wstr[0] < 256) {
1137 PyObject *latin1_char = get_latin1_char((unsigned char)wstr[0]);
1138 if (latin1_char == NULL)
1139 return -1;
1140 Py_DECREF(*p_obj);
1141 *p_obj = latin1_char;
1142 return 0;
1143 }
1144 }
1145
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001146 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001147 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001148 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001149 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001150
1151 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001152 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1153 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001154 PyErr_NoMemory();
1155 return -1;
1156 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001157 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001158 _PyUnicode_WSTR(unicode), end,
1159 PyUnicode_1BYTE_DATA(unicode));
1160 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1161 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1162 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1163 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001164 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001165 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001166 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001167 }
1168 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001169 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001170 _PyUnicode_UTF8(unicode) = NULL;
1171 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001172 }
1173 PyObject_FREE(_PyUnicode_WSTR(unicode));
1174 _PyUnicode_WSTR(unicode) = NULL;
1175 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1176 }
1177 /* In this case we might have to convert down from 4-byte native
1178 wchar_t to 2-byte unicode. */
1179 else if (maxchar < 65536) {
1180 assert(num_surrogates == 0 &&
1181 "FindMaxCharAndNumSurrogatePairs() messed up");
1182
Victor Stinner506f5922011-09-28 22:34:18 +02001183#if SIZEOF_WCHAR_T == 2
1184 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001185 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001186 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1187 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1188 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001189 _PyUnicode_UTF8(unicode) = NULL;
1190 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001191#else
1192 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001193 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001194 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001195 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001196 PyErr_NoMemory();
1197 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001198 }
Victor Stinner506f5922011-09-28 22:34:18 +02001199 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1200 _PyUnicode_WSTR(unicode), end,
1201 PyUnicode_2BYTE_DATA(unicode));
1202 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1203 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1204 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001205 _PyUnicode_UTF8(unicode) = NULL;
1206 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001207 PyObject_FREE(_PyUnicode_WSTR(unicode));
1208 _PyUnicode_WSTR(unicode) = NULL;
1209 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1210#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001211 }
1212 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1213 else {
1214#if SIZEOF_WCHAR_T == 2
1215 /* in case the native representation is 2-bytes, we need to allocate a
1216 new normalized 4-byte version. */
1217 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001218 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1219 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001220 PyErr_NoMemory();
1221 return -1;
1222 }
1223 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1224 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001225 _PyUnicode_UTF8(unicode) = NULL;
1226 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001227 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1228 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001229 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001230 PyObject_FREE(_PyUnicode_WSTR(unicode));
1231 _PyUnicode_WSTR(unicode) = NULL;
1232 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1233#else
1234 assert(num_surrogates == 0);
1235
Victor Stinnerc3c74152011-10-02 20:39:55 +02001236 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001237 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001238 _PyUnicode_UTF8(unicode) = NULL;
1239 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001240 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1241#endif
1242 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1243 }
1244 _PyUnicode_STATE(unicode).ready = 1;
1245 return 0;
1246}
1247
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001248int
1249_PyUnicode_ReadyReplace(PyObject **op)
1250{
1251 return unicode_ready(op, 1);
1252}
1253
1254int
1255_PyUnicode_Ready(PyObject *op)
1256{
1257 return unicode_ready(&op, 0);
1258}
1259
Alexander Belopolsky40018472011-02-26 01:02:56 +00001260static void
1261unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001262{
Walter Dörwald16807132007-05-25 13:52:07 +00001263 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001264 case SSTATE_NOT_INTERNED:
1265 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001266
Benjamin Peterson29060642009-01-31 22:14:21 +00001267 case SSTATE_INTERNED_MORTAL:
1268 /* revive dead object temporarily for DelItem */
1269 Py_REFCNT(unicode) = 3;
1270 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
1271 Py_FatalError(
1272 "deletion of interned string failed");
1273 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001274
Benjamin Peterson29060642009-01-31 22:14:21 +00001275 case SSTATE_INTERNED_IMMORTAL:
1276 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001277
Benjamin Peterson29060642009-01-31 22:14:21 +00001278 default:
1279 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001280 }
1281
Victor Stinner03490912011-10-03 23:45:12 +02001282 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001283 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001284 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001285 PyObject_DEL(_PyUnicode_UTF8(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001286
1287 if (PyUnicode_IS_COMPACT(unicode)) {
1288 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001289 }
1290 else {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001291 if (_PyUnicode_DATA_ANY(unicode))
1292 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Benjamin Peterson29060642009-01-31 22:14:21 +00001293 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001294 }
1295}
1296
Alexander Belopolsky40018472011-02-26 01:02:56 +00001297static int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001298unicode_resizable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001299{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001300 if (Py_REFCNT(unicode) != 1)
1301 return 0;
1302 if (PyUnicode_CHECK_INTERNED(unicode))
1303 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001304 assert (unicode != unicode_empty);
1305#ifdef Py_DEBUG
1306 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND
1307 && PyUnicode_GET_LENGTH(unicode) == 1)
1308 {
1309 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001310 if (ch < 256 && unicode_latin1[ch] == unicode)
1311 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001312 }
Victor Stinner77bb47b2011-10-03 20:06:05 +02001313#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001314 return 1;
1315}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001316
Victor Stinnerfe226c02011-10-03 03:52:20 +02001317static int
1318unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1319{
1320 PyObject *unicode;
1321 Py_ssize_t old_length;
1322
1323 assert(p_unicode != NULL);
1324 unicode = *p_unicode;
1325
1326 assert(unicode != NULL);
1327 assert(PyUnicode_Check(unicode));
1328 assert(0 <= length);
1329
Victor Stinner910337b2011-10-03 03:20:16 +02001330 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001331 old_length = PyUnicode_WSTR_LENGTH(unicode);
1332 else
1333 old_length = PyUnicode_GET_LENGTH(unicode);
1334 if (old_length == length)
1335 return 0;
1336
Victor Stinnerfe226c02011-10-03 03:52:20 +02001337 if (!unicode_resizable(unicode)) {
1338 PyObject *copy = resize_copy(unicode, length);
1339 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001340 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001341 Py_DECREF(*p_unicode);
1342 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001343 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001344 }
1345
Victor Stinnerfe226c02011-10-03 03:52:20 +02001346 if (PyUnicode_IS_COMPACT(unicode)) {
1347 *p_unicode = resize_compact(unicode, length);
1348 if (*p_unicode == NULL)
1349 return -1;
Victor Stinner95663112011-10-04 01:03:50 +02001350 _PyUnicode_CHECK(*p_unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001351 return 0;
1352 } else
1353 return resize_inplace((PyUnicodeObject*)unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001354}
1355
Alexander Belopolsky40018472011-02-26 01:02:56 +00001356int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001357PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001358{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001359 PyObject *unicode;
1360 if (p_unicode == NULL) {
1361 PyErr_BadInternalCall();
1362 return -1;
1363 }
1364 unicode = *p_unicode;
1365 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0
1366 || _PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND)
1367 {
1368 PyErr_BadInternalCall();
1369 return -1;
1370 }
1371 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001372}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001373
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001374static PyObject*
1375get_latin1_char(unsigned char ch)
1376{
Victor Stinnera464fc12011-10-02 20:39:30 +02001377 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001378 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001379 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001380 if (!unicode)
1381 return NULL;
1382 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
1383 unicode_latin1[ch] = unicode;
1384 }
1385 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001386 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001387}
1388
Alexander Belopolsky40018472011-02-26 01:02:56 +00001389PyObject *
1390PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001391{
1392 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001393 Py_UCS4 maxchar = 0;
1394 Py_ssize_t num_surrogates;
1395
1396 if (u == NULL)
1397 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001398
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001399 /* If the Unicode data is known at construction time, we can apply
1400 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001401
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001402 /* Optimization for empty strings */
1403 if (size == 0 && unicode_empty != NULL) {
1404 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001405 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001406 }
Tim Petersced69f82003-09-16 20:30:58 +00001407
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001408 /* Single character Unicode objects in the Latin-1 range are
1409 shared when using this constructor */
1410 if (size == 1 && *u < 256)
1411 return get_latin1_char((unsigned char)*u);
1412
1413 /* If not empty and not single character, copy the Unicode data
1414 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001415 if (find_maxchar_surrogates(u, u + size,
1416 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001417 return NULL;
1418
1419 unicode = (PyUnicodeObject *) PyUnicode_New(size - num_surrogates,
1420 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001421 if (!unicode)
1422 return NULL;
1423
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001424 switch (PyUnicode_KIND(unicode)) {
1425 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001426 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001427 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1428 break;
1429 case PyUnicode_2BYTE_KIND:
1430#if Py_UNICODE_SIZE == 2
1431 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1432#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001433 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001434 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1435#endif
1436 break;
1437 case PyUnicode_4BYTE_KIND:
1438#if SIZEOF_WCHAR_T == 2
1439 /* This is the only case which has to process surrogates, thus
1440 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001441 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001442#else
1443 assert(num_surrogates == 0);
1444 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1445#endif
1446 break;
1447 default:
1448 assert(0 && "Impossible state");
1449 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001450
1451 return (PyObject *)unicode;
1452}
1453
Alexander Belopolsky40018472011-02-26 01:02:56 +00001454PyObject *
1455PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001456{
1457 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +00001458
Benjamin Peterson14339b62009-01-31 16:36:08 +00001459 if (size < 0) {
1460 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001461 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001462 return NULL;
1463 }
Christian Heimes33fe8092008-04-13 13:53:33 +00001464
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001465 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +00001466 some optimizations which share commonly used objects.
1467 Also, this means the input must be UTF-8, so fall back to the
1468 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001469 if (u != NULL) {
1470
Benjamin Peterson29060642009-01-31 22:14:21 +00001471 /* Optimization for empty strings */
1472 if (size == 0 && unicode_empty != NULL) {
1473 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001474 return unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001475 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001476
1477 /* Single characters are shared when using this constructor.
1478 Restrict to ASCII, since the input must be UTF-8. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001479 if (size == 1 && Py_CHARMASK(*u) < 128)
1480 return get_latin1_char(Py_CHARMASK(*u));
Martin v. Löwis9c121062007-08-05 20:26:11 +00001481
1482 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001483 }
1484
Walter Dörwald55507312007-05-18 13:12:10 +00001485 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001486 if (!unicode)
1487 return NULL;
1488
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001489 return (PyObject *)unicode;
1490}
1491
Alexander Belopolsky40018472011-02-26 01:02:56 +00001492PyObject *
1493PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001494{
1495 size_t size = strlen(u);
1496 if (size > PY_SSIZE_T_MAX) {
1497 PyErr_SetString(PyExc_OverflowError, "input too long");
1498 return NULL;
1499 }
1500
1501 return PyUnicode_FromStringAndSize(u, size);
1502}
1503
Victor Stinnere57b1c02011-09-28 22:20:48 +02001504static PyObject*
1505_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001506{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001507 PyObject *res;
1508 unsigned char max = 127;
1509 Py_ssize_t i;
1510 for (i = 0; i < size; i++) {
1511 if (u[i] & 0x80) {
1512 max = 255;
1513 break;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001514 }
1515 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001516 res = PyUnicode_New(size, max);
1517 if (!res)
1518 return NULL;
1519 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
1520 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001521}
1522
Victor Stinnere57b1c02011-09-28 22:20:48 +02001523static PyObject*
1524_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001525{
1526 PyObject *res;
1527 Py_UCS2 max = 0;
1528 Py_ssize_t i;
1529 for (i = 0; i < size; i++)
1530 if (u[i] > max)
1531 max = u[i];
1532 res = PyUnicode_New(size, max);
1533 if (!res)
1534 return NULL;
1535 if (max >= 256)
1536 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
1537 else
1538 for (i = 0; i < size; i++)
1539 PyUnicode_1BYTE_DATA(res)[i] = (Py_UCS1)u[i];
1540 return res;
1541}
1542
Victor Stinnere57b1c02011-09-28 22:20:48 +02001543static PyObject*
1544_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001545{
1546 PyObject *res;
1547 Py_UCS4 max = 0;
1548 Py_ssize_t i;
1549 for (i = 0; i < size; i++)
1550 if (u[i] > max)
1551 max = u[i];
1552 res = PyUnicode_New(size, max);
1553 if (!res)
1554 return NULL;
1555 if (max >= 0x10000)
1556 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
1557 else {
1558 int kind = PyUnicode_KIND(res);
1559 void *data = PyUnicode_DATA(res);
1560 for (i = 0; i < size; i++)
1561 PyUnicode_WRITE(kind, data, i, u[i]);
1562 }
1563 return res;
1564}
1565
1566PyObject*
1567PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1568{
1569 switch(kind) {
1570 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001571 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001572 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001573 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001574 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001575 return _PyUnicode_FromUCS4(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001576 }
Victor Stinner01698042011-10-04 00:04:26 +02001577 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001578 return NULL;
1579}
1580
Victor Stinner034f6cf2011-09-30 02:26:44 +02001581PyObject*
1582PyUnicode_Copy(PyObject *unicode)
1583{
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001584 Py_ssize_t size;
1585 PyObject *copy;
1586 void *data;
1587
Victor Stinner034f6cf2011-09-30 02:26:44 +02001588 if (!PyUnicode_Check(unicode)) {
1589 PyErr_BadInternalCall();
1590 return NULL;
1591 }
1592 if (PyUnicode_READY(unicode))
1593 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001594
1595 size = PyUnicode_GET_LENGTH(unicode);
1596 copy = PyUnicode_New(size, PyUnicode_MAX_CHAR_VALUE(unicode));
1597 if (!copy)
1598 return NULL;
1599 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1600
1601 data = PyUnicode_DATA(unicode);
1602 switch (PyUnicode_KIND(unicode))
1603 {
1604 case PyUnicode_1BYTE_KIND:
1605 memcpy(PyUnicode_1BYTE_DATA(copy), data, size);
1606 break;
1607 case PyUnicode_2BYTE_KIND:
1608 memcpy(PyUnicode_2BYTE_DATA(copy), data, sizeof(Py_UCS2) * size);
1609 break;
1610 case PyUnicode_4BYTE_KIND:
1611 memcpy(PyUnicode_4BYTE_DATA(copy), data, sizeof(Py_UCS4) * size);
1612 break;
1613 default:
1614 assert(0);
1615 break;
1616 }
1617 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02001618}
1619
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001620
Victor Stinnerbc603d12011-10-02 01:00:40 +02001621/* Widen Unicode objects to larger buffers. Don't write terminating null
1622 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001623
1624void*
1625_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1626{
Victor Stinnerbc603d12011-10-02 01:00:40 +02001627 Py_ssize_t len;
1628 void *result;
1629 unsigned int skind;
1630
1631 if (PyUnicode_READY(s))
1632 return NULL;
1633
1634 len = PyUnicode_GET_LENGTH(s);
1635 skind = PyUnicode_KIND(s);
1636 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02001637 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001638 return NULL;
1639 }
1640 switch(kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02001641 case PyUnicode_2BYTE_KIND:
1642 result = PyMem_Malloc(len * sizeof(Py_UCS2));
1643 if (!result)
1644 return PyErr_NoMemory();
1645 assert(skind == PyUnicode_1BYTE_KIND);
1646 _PyUnicode_CONVERT_BYTES(
1647 Py_UCS1, Py_UCS2,
1648 PyUnicode_1BYTE_DATA(s),
1649 PyUnicode_1BYTE_DATA(s) + len,
1650 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001651 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001652 case PyUnicode_4BYTE_KIND:
1653 result = PyMem_Malloc(len * sizeof(Py_UCS4));
1654 if (!result)
1655 return PyErr_NoMemory();
1656 if (skind == PyUnicode_2BYTE_KIND) {
1657 _PyUnicode_CONVERT_BYTES(
1658 Py_UCS2, Py_UCS4,
1659 PyUnicode_2BYTE_DATA(s),
1660 PyUnicode_2BYTE_DATA(s) + len,
1661 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001662 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02001663 else {
1664 assert(skind == PyUnicode_1BYTE_KIND);
1665 _PyUnicode_CONVERT_BYTES(
1666 Py_UCS1, Py_UCS4,
1667 PyUnicode_1BYTE_DATA(s),
1668 PyUnicode_1BYTE_DATA(s) + len,
1669 result);
1670 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001671 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001672 default:
1673 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001674 }
Victor Stinner01698042011-10-04 00:04:26 +02001675 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001676 return NULL;
1677}
1678
1679static Py_UCS4*
1680as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1681 int copy_null)
1682{
1683 int kind;
1684 void *data;
1685 Py_ssize_t len, targetlen;
1686 if (PyUnicode_READY(string) == -1)
1687 return NULL;
1688 kind = PyUnicode_KIND(string);
1689 data = PyUnicode_DATA(string);
1690 len = PyUnicode_GET_LENGTH(string);
1691 targetlen = len;
1692 if (copy_null)
1693 targetlen++;
1694 if (!target) {
1695 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
1696 PyErr_NoMemory();
1697 return NULL;
1698 }
1699 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
1700 if (!target) {
1701 PyErr_NoMemory();
1702 return NULL;
1703 }
1704 }
1705 else {
1706 if (targetsize < targetlen) {
1707 PyErr_Format(PyExc_SystemError,
1708 "string is longer than the buffer");
1709 if (copy_null && 0 < targetsize)
1710 target[0] = 0;
1711 return NULL;
1712 }
1713 }
1714 if (kind != PyUnicode_4BYTE_KIND) {
1715 Py_ssize_t i;
1716 for (i = 0; i < len; i++)
1717 target[i] = PyUnicode_READ(kind, data, i);
1718 }
1719 else
1720 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
1721 if (copy_null)
1722 target[len] = 0;
1723 return target;
1724}
1725
1726Py_UCS4*
1727PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1728 int copy_null)
1729{
1730 if (target == NULL || targetsize < 1) {
1731 PyErr_BadInternalCall();
1732 return NULL;
1733 }
1734 return as_ucs4(string, target, targetsize, copy_null);
1735}
1736
1737Py_UCS4*
1738PyUnicode_AsUCS4Copy(PyObject *string)
1739{
1740 return as_ucs4(string, NULL, 0, 1);
1741}
1742
1743#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00001744
Alexander Belopolsky40018472011-02-26 01:02:56 +00001745PyObject *
1746PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001747{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001748 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +00001749 if (size == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001750 return PyUnicode_New(0, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00001751 PyErr_BadInternalCall();
1752 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001753 }
1754
Martin v. Löwis790465f2008-04-05 20:41:37 +00001755 if (size == -1) {
1756 size = wcslen(w);
1757 }
1758
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001759 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001760}
1761
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001762#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00001763
Walter Dörwald346737f2007-05-31 10:44:43 +00001764static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001765makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
1766 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00001767{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001768 *fmt++ = '%';
1769 if (width) {
1770 if (zeropad)
1771 *fmt++ = '0';
1772 fmt += sprintf(fmt, "%d", width);
1773 }
1774 if (precision)
1775 fmt += sprintf(fmt, ".%d", precision);
1776 if (longflag)
1777 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001778 else if (longlongflag) {
1779 /* longlongflag should only ever be nonzero on machines with
1780 HAVE_LONG_LONG defined */
1781#ifdef HAVE_LONG_LONG
1782 char *f = PY_FORMAT_LONG_LONG;
1783 while (*f)
1784 *fmt++ = *f++;
1785#else
1786 /* we shouldn't ever get here */
1787 assert(0);
1788 *fmt++ = 'l';
1789#endif
1790 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001791 else if (size_tflag) {
1792 char *f = PY_FORMAT_SIZE_T;
1793 while (*f)
1794 *fmt++ = *f++;
1795 }
1796 *fmt++ = c;
1797 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00001798}
1799
/* Helper for PyUnicode_FromFormatV(): parse the width, precision and
   length modifier of one conversion specification.

   f must point at the '%' introducing the specification.  Each of the
   p_* output pointers may be NULL, in which case that value is not
   reported.

   *p_width, *p_precision  -- decimal values parsed from "width.precision"
                              (0 when absent).
   *p_longflag             -- set for "l" followed by d, u or i.
   *p_longlongflag         -- set for "ll" followed by d, u or i (only
                              when HAVE_LONG_LONG is defined).
   *p_size_tflag           -- set for "z" followed by d, u or i.

   Returns a pointer to the character following the parsed part, i.e. the
   conversion character for a well-formed specification.  For the bogus
   inputs handled below the pointer is moved back by one character. */

static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int width = 0;
    int precision = 0;
    int longflag = 0;
    int longlongflag = 0;
    int size_tflag = 0;

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    f++;
    for (; Py_ISDIGIT((unsigned)*f); f++)
        width = (width*10) + *f - '0';
    if (*f == '.') {
        f++;
        for (; Py_ISDIGIT((unsigned)*f); f++)
            precision = (precision*10) + *f - '0';
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        f--;
    }

    /* Length modifiers: %ld, %lu, %li, %lld, %llu, %lli, %zd, %zu, %zi.
       A modifier is only consumed when a matching conversion follows. */
    switch (*f) {
    case 'l':
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            f += 1;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
#endif
        break;
    case 'z':
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            size_tflag = 1;
            f += 1;
        }
        break;
    default:
        break;
    }

    /* Report only the values the caller asked for. */
    if (p_width != NULL)
        *p_width = width;
    if (p_precision != NULL)
        *p_precision = precision;
    if (p_longflag != NULL)
        *p_longflag = longflag;
    if (p_longlongflag != NULL)
        *p_longlongflag = longlongflag;
    if (p_size_tflag != NULL)
        *p_size_tflag = size_tflag;
    return f;
}
1864
/* Buffer space reserved for one number printed with %ld: 21 characters
   are enough for a 64-bit integer in decimal plus an optional sign. */
#define MAX_LONG_CHARS 21
/* Buffer space reserved for one number printed with %lld.  The value
   takes at most ceil(log10(256)*SIZEOF_LONG_LONG) digits, plus 1 for the
   sign; 53/22 serves as an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
1872
Walter Dörwaldd2034312007-05-18 16:29:38 +00001873PyObject *
1874PyUnicode_FromFormatV(const char *format, va_list vargs)
1875{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001876 va_list count;
1877 Py_ssize_t callcount = 0;
1878 PyObject **callresults = NULL;
1879 PyObject **callresult = NULL;
1880 Py_ssize_t n = 0;
1881 int width = 0;
1882 int precision = 0;
1883 int zeropad;
1884 const char* f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001885 PyUnicodeObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001886 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001887 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001888 Py_UCS4 maxchar = 127; /* result is ASCII by default */
1889 Py_UCS4 argmaxchar;
1890 Py_ssize_t numbersize = 0;
1891 char *numberresults = NULL;
1892 char *numberresult = NULL;
1893 Py_ssize_t i;
1894 int kind;
1895 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001896
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001897 Py_VA_COPY(count, vargs);
    /* step 1: count the number of %S/%R/%A/%s format specifications
     * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
     * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
     * result in an array)
     * also estimate an upper bound for all the number formats in the string,
     * numbers will be formatted in step 3 and be kept in a '\0'-separated
     * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00001905 for (f = format; *f; f++) {
1906 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00001907 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001908 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
1909 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
1910 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
1911 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001912
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001913 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001914#ifdef HAVE_LONG_LONG
1915 if (longlongflag) {
1916 if (width < MAX_LONG_LONG_CHARS)
1917 width = MAX_LONG_LONG_CHARS;
1918 }
1919 else
1920#endif
1921 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
1922 including sign. Decimal takes the most space. This
1923 isn't enough for octal. If a width is specified we
1924 need more (which we allocate later). */
1925 if (width < MAX_LONG_CHARS)
1926 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001927
1928 /* account for the size + '\0' to separate numbers
1929 inside of the numberresults buffer */
1930 numbersize += (width + 1);
1931 }
1932 }
1933 else if ((unsigned char)*f > 127) {
1934 PyErr_Format(PyExc_ValueError,
1935 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
1936 "string, got a non-ASCII byte: 0x%02x",
1937 (unsigned char)*f);
1938 return NULL;
1939 }
1940 }
1941 /* step 2: allocate memory for the results of
1942 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
1943 if (callcount) {
1944 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
1945 if (!callresults) {
1946 PyErr_NoMemory();
1947 return NULL;
1948 }
1949 callresult = callresults;
1950 }
1951 /* step 2.5: allocate memory for the results of formating numbers */
1952 if (numbersize) {
1953 numberresults = PyObject_Malloc(numbersize);
1954 if (!numberresults) {
1955 PyErr_NoMemory();
1956 goto fail;
1957 }
1958 numberresult = numberresults;
1959 }
1960
1961 /* step 3: format numbers and figure out how large a buffer we need */
1962 for (f = format; *f; f++) {
1963 if (*f == '%') {
1964 const char* p;
1965 int longflag;
1966 int longlongflag;
1967 int size_tflag;
1968 int numprinted;
1969
1970 p = f;
1971 zeropad = (f[1] == '0');
1972 f = parse_format_flags(f, &width, &precision,
1973 &longflag, &longlongflag, &size_tflag);
1974 switch (*f) {
1975 case 'c':
1976 {
1977 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001978 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001979 n++;
1980 break;
1981 }
1982 case '%':
1983 n++;
1984 break;
1985 case 'i':
1986 case 'd':
1987 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1988 width, precision, *f);
1989 if (longflag)
1990 numprinted = sprintf(numberresult, fmt,
1991 va_arg(count, long));
1992#ifdef HAVE_LONG_LONG
1993 else if (longlongflag)
1994 numprinted = sprintf(numberresult, fmt,
1995 va_arg(count, PY_LONG_LONG));
1996#endif
1997 else if (size_tflag)
1998 numprinted = sprintf(numberresult, fmt,
1999 va_arg(count, Py_ssize_t));
2000 else
2001 numprinted = sprintf(numberresult, fmt,
2002 va_arg(count, int));
2003 n += numprinted;
2004 /* advance by +1 to skip over the '\0' */
2005 numberresult += (numprinted + 1);
2006 assert(*(numberresult - 1) == '\0');
2007 assert(*(numberresult - 2) != '\0');
2008 assert(numprinted >= 0);
2009 assert(numberresult <= numberresults + numbersize);
2010 break;
2011 case 'u':
2012 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2013 width, precision, 'u');
2014 if (longflag)
2015 numprinted = sprintf(numberresult, fmt,
2016 va_arg(count, unsigned long));
2017#ifdef HAVE_LONG_LONG
2018 else if (longlongflag)
2019 numprinted = sprintf(numberresult, fmt,
2020 va_arg(count, unsigned PY_LONG_LONG));
2021#endif
2022 else if (size_tflag)
2023 numprinted = sprintf(numberresult, fmt,
2024 va_arg(count, size_t));
2025 else
2026 numprinted = sprintf(numberresult, fmt,
2027 va_arg(count, unsigned int));
2028 n += numprinted;
2029 numberresult += (numprinted + 1);
2030 assert(*(numberresult - 1) == '\0');
2031 assert(*(numberresult - 2) != '\0');
2032 assert(numprinted >= 0);
2033 assert(numberresult <= numberresults + numbersize);
2034 break;
2035 case 'x':
2036 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2037 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2038 n += numprinted;
2039 numberresult += (numprinted + 1);
2040 assert(*(numberresult - 1) == '\0');
2041 assert(*(numberresult - 2) != '\0');
2042 assert(numprinted >= 0);
2043 assert(numberresult <= numberresults + numbersize);
2044 break;
2045 case 'p':
2046 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2047 /* %p is ill-defined: ensure leading 0x. */
2048 if (numberresult[1] == 'X')
2049 numberresult[1] = 'x';
2050 else if (numberresult[1] != 'x') {
2051 memmove(numberresult + 2, numberresult,
2052 strlen(numberresult) + 1);
2053 numberresult[0] = '0';
2054 numberresult[1] = 'x';
2055 numprinted += 2;
2056 }
2057 n += numprinted;
2058 numberresult += (numprinted + 1);
2059 assert(*(numberresult - 1) == '\0');
2060 assert(*(numberresult - 2) != '\0');
2061 assert(numprinted >= 0);
2062 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002063 break;
2064 case 's':
2065 {
2066 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002067 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002068 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
2069 if (!str)
2070 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002071 /* since PyUnicode_DecodeUTF8 returns already flexible
2072 unicode objects, there is no need to call ready on them */
2073 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002074 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002075 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002076 /* Remember the str and switch to the next slot */
2077 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002078 break;
2079 }
2080 case 'U':
2081 {
2082 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002083 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002084 if (PyUnicode_READY(obj) == -1)
2085 goto fail;
2086 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002087 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002088 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002089 break;
2090 }
2091 case 'V':
2092 {
2093 PyObject *obj = va_arg(count, PyObject *);
2094 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002095 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002096 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002097 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002098 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002099 if (PyUnicode_READY(obj) == -1)
2100 goto fail;
2101 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002102 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002103 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002104 *callresult++ = NULL;
2105 }
2106 else {
2107 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
2108 if (!str_obj)
2109 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002110 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002111 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002112 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002113 *callresult++ = str_obj;
2114 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002115 break;
2116 }
2117 case 'S':
2118 {
2119 PyObject *obj = va_arg(count, PyObject *);
2120 PyObject *str;
2121 assert(obj);
2122 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002123 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002124 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002125 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002126 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002127 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002128 /* Remember the str and switch to the next slot */
2129 *callresult++ = str;
2130 break;
2131 }
2132 case 'R':
2133 {
2134 PyObject *obj = va_arg(count, PyObject *);
2135 PyObject *repr;
2136 assert(obj);
2137 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002138 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002139 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002140 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002141 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002142 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002143 /* Remember the repr and switch to the next slot */
2144 *callresult++ = repr;
2145 break;
2146 }
2147 case 'A':
2148 {
2149 PyObject *obj = va_arg(count, PyObject *);
2150 PyObject *ascii;
2151 assert(obj);
2152 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002153 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002154 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002155 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002156 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002157 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002158 /* Remember the repr and switch to the next slot */
2159 *callresult++ = ascii;
2160 break;
2161 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002162 default:
2163 /* if we stumble upon an unknown
2164 formatting code, copy the rest of
2165 the format string to the output
2166 string. (we cannot just skip the
2167 code, since there's no way to know
2168 what's in the argument list) */
2169 n += strlen(p);
2170 goto expand;
2171 }
2172 } else
2173 n++;
2174 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002175 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002176 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002177 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002178 we don't have to resize the string.
2179 There can be no errors beyond this point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002180 string = (PyUnicodeObject *)PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002181 if (!string)
2182 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002183 kind = PyUnicode_KIND(string);
2184 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002185 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002186 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002187
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002188 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002189 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002190 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002191
2192 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002193 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2194 /* checking for == because the last argument could be a empty
2195 string, which causes i to point to end, the assert at the end of
2196 the loop */
2197 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002198
Benjamin Peterson14339b62009-01-31 16:36:08 +00002199 switch (*f) {
2200 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002201 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002202 const int ordinal = va_arg(vargs, int);
2203 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002204 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002205 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002206 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002207 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002208 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002209 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002210 case 'p':
2211 /* unused, since we already have the result */
2212 if (*f == 'p')
2213 (void) va_arg(vargs, void *);
2214 else
2215 (void) va_arg(vargs, int);
2216 /* extract the result from numberresults and append. */
2217 for (; *numberresult; ++i, ++numberresult)
2218 PyUnicode_WRITE(kind, data, i, *numberresult);
2219 /* skip over the separating '\0' */
2220 assert(*numberresult == '\0');
2221 numberresult++;
2222 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002223 break;
2224 case 's':
2225 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002226 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002227 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002228 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002229 size = PyUnicode_GET_LENGTH(*callresult);
2230 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002231 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2232 *callresult, 0,
2233 size) < 0)
2234 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002235 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002236 /* We're done with the unicode()/repr() => forget it */
2237 Py_DECREF(*callresult);
2238 /* switch to next unicode()/repr() result */
2239 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002240 break;
2241 }
2242 case 'U':
2243 {
2244 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002245 Py_ssize_t size;
2246 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2247 size = PyUnicode_GET_LENGTH(obj);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002248 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2249 obj, 0,
2250 size) < 0)
2251 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002252 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002253 break;
2254 }
2255 case 'V':
2256 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002257 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002258 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002259 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002260 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002261 size = PyUnicode_GET_LENGTH(obj);
2262 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002263 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2264 obj, 0,
2265 size) < 0)
2266 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002267 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002268 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002269 size = PyUnicode_GET_LENGTH(*callresult);
2270 assert(PyUnicode_KIND(*callresult) <=
2271 PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002272 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2273 *callresult,
2274 0, size) < 0)
2275 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002276 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002277 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002278 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002279 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002280 break;
2281 }
2282 case 'S':
2283 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002284 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002285 {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002286 /* unused, since we already have the result */
2287 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002288 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002289 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2290 *callresult, 0,
2291 PyUnicode_GET_LENGTH(*callresult)) < 0)
2292 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002293 i += PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002294 /* We're done with the unicode()/repr() => forget it */
2295 Py_DECREF(*callresult);
2296 /* switch to next unicode()/repr() result */
2297 ++callresult;
2298 break;
2299 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002300 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002301 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002302 break;
2303 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002304 for (; *p; ++p, ++i)
2305 PyUnicode_WRITE(kind, data, i, *p);
2306 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002307 goto end;
2308 }
Victor Stinner1205f272010-09-11 00:54:47 +00002309 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002310 else {
2311 assert(i < PyUnicode_GET_LENGTH(string));
2312 PyUnicode_WRITE(kind, data, i++, *f);
2313 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002314 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002315 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002316
Benjamin Peterson29060642009-01-31 22:14:21 +00002317 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002318 if (callresults)
2319 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002320 if (numberresults)
2321 PyObject_Free(numberresults);
2322 return (PyObject *)string;
Benjamin Peterson29060642009-01-31 22:14:21 +00002323 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002324 if (callresults) {
2325 PyObject **callresult2 = callresults;
2326 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002327 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002328 ++callresult2;
2329 }
2330 PyObject_Free(callresults);
2331 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002332 if (numberresults)
2333 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002334 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002335}
2336
Walter Dörwaldd2034312007-05-18 16:29:38 +00002337PyObject *
2338PyUnicode_FromFormat(const char *format, ...)
2339{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002340 PyObject* ret;
2341 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002342
2343#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002344 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002345#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002346 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002347#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002348 ret = PyUnicode_FromFormatV(format, vargs);
2349 va_end(vargs);
2350 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002351}
2352
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002353#ifdef HAVE_WCHAR_H
2354
Victor Stinner5593d8a2010-10-02 11:11:27 +00002355/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2356 convert a Unicode object to a wide character string.
2357
Victor Stinnerd88d9832011-09-06 02:00:05 +02002358 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002359 character) required to convert the unicode object. Ignore size argument.
2360
Victor Stinnerd88d9832011-09-06 02:00:05 +02002361 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002362 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002363 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002364static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00002365unicode_aswidechar(PyUnicodeObject *unicode,
2366 wchar_t *w,
2367 Py_ssize_t size)
2368{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002369 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002370 const wchar_t *wstr;
2371
2372 wstr = PyUnicode_AsUnicodeAndSize((PyObject *)unicode, &res);
2373 if (wstr == NULL)
2374 return -1;
2375
Victor Stinner5593d8a2010-10-02 11:11:27 +00002376 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002377 if (size > res)
2378 size = res + 1;
2379 else
2380 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002381 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002382 return res;
2383 }
2384 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002385 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002386}
2387
2388Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002389PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002390 wchar_t *w,
2391 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002392{
2393 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002394 PyErr_BadInternalCall();
2395 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002396 }
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002397 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002398}
2399
Victor Stinner137c34c2010-09-29 10:25:54 +00002400wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002401PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002402 Py_ssize_t *size)
2403{
2404 wchar_t* buffer;
2405 Py_ssize_t buflen;
2406
2407 if (unicode == NULL) {
2408 PyErr_BadInternalCall();
2409 return NULL;
2410 }
2411
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002412 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002413 if (buflen == -1)
2414 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002415 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002416 PyErr_NoMemory();
2417 return NULL;
2418 }
2419
Victor Stinner137c34c2010-09-29 10:25:54 +00002420 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2421 if (buffer == NULL) {
2422 PyErr_NoMemory();
2423 return NULL;
2424 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002425 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002426 if (buflen == -1)
2427 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002428 if (size != NULL)
2429 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002430 return buffer;
2431}
2432
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002433#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002434
Alexander Belopolsky40018472011-02-26 01:02:56 +00002435PyObject *
2436PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002437{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002438 PyObject *v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002439 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002440 PyErr_SetString(PyExc_ValueError,
2441 "chr() arg not in range(0x110000)");
2442 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002443 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002444
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002445 if (ordinal < 256)
2446 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002447
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002448 v = PyUnicode_New(1, ordinal);
2449 if (v == NULL)
2450 return NULL;
2451 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
2452 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002453}
2454
Alexander Belopolsky40018472011-02-26 01:02:56 +00002455PyObject *
2456PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002457{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002458 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002459 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002460 if (PyUnicode_CheckExact(obj)) {
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002461 if (PyUnicode_READY(obj))
2462 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002463 Py_INCREF(obj);
2464 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002465 }
2466 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002467 /* For a Unicode subtype that's not a Unicode object,
2468 return a true Unicode object with the same data. */
Victor Stinner2219e0a2011-10-01 01:16:59 +02002469 return PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002470 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002471 PyErr_Format(PyExc_TypeError,
2472 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002473 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002474 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002475}
2476
Alexander Belopolsky40018472011-02-26 01:02:56 +00002477PyObject *
2478PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002479 const char *encoding,
2480 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002481{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002482 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002483 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002484
Guido van Rossumd57fd912000-03-10 22:53:23 +00002485 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002486 PyErr_BadInternalCall();
2487 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002488 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002489
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002490 /* Decoding bytes objects is the most common case and should be fast */
2491 if (PyBytes_Check(obj)) {
2492 if (PyBytes_GET_SIZE(obj) == 0) {
2493 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002494 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002495 }
2496 else {
2497 v = PyUnicode_Decode(
2498 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2499 encoding, errors);
2500 }
2501 return v;
2502 }
2503
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002504 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002505 PyErr_SetString(PyExc_TypeError,
2506 "decoding str is not supported");
2507 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002508 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002509
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002510 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2511 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2512 PyErr_Format(PyExc_TypeError,
2513 "coercing to str: need bytes, bytearray "
2514 "or buffer-like object, %.80s found",
2515 Py_TYPE(obj)->tp_name);
2516 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002517 }
Tim Petersced69f82003-09-16 20:30:58 +00002518
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002519 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002520 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002521 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002522 }
Tim Petersced69f82003-09-16 20:30:58 +00002523 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002524 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002525
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002526 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002527 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002528}
2529
/* Copy an encoding name into lower, folding upper-case letters to lower
   case and turning every '_' into '-', so that spellings such as UTF_8
   are recognised.

   lower_len is the capacity of lower, including room for the terminating
   null.  Returns 1 on success and 0 when the name does not fit (it is
   longer than lower_len-1 characters); in the latter case lower is left
   without a terminator. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src = encoding;
    char *dst = lower;
    /* Last slot of the output buffer, reserved for the null. */
    char *const limit = lower + (lower_len - 1);

    for (; *src != '\0'; src++) {
        char c = *src;

        if (dst == limit)
            return 0;
        if (Py_ISUPPER(c))
            c = Py_TOLOWER(c);
        else if (c == '_')
            c = '-';
        *dst++ = c;
    }
    *dst = '\0';
    return 1;
}
2562
Alexander Belopolsky40018472011-02-26 01:02:56 +00002563PyObject *
2564PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002565 Py_ssize_t size,
2566 const char *encoding,
2567 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002568{
2569 PyObject *buffer = NULL, *unicode;
2570 Py_buffer info;
2571 char lower[11]; /* Enough for any encoding shortcut */
2572
2573 if (encoding == NULL)
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002574 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00002575
2576 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002577 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002578 if ((strcmp(lower, "utf-8") == 0) ||
2579 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002580 return PyUnicode_DecodeUTF8(s, size, errors);
2581 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002582 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002583 (strcmp(lower, "iso-8859-1") == 0))
2584 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002585#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002586 else if (strcmp(lower, "mbcs") == 0)
2587 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002588#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002589 else if (strcmp(lower, "ascii") == 0)
2590 return PyUnicode_DecodeASCII(s, size, errors);
2591 else if (strcmp(lower, "utf-16") == 0)
2592 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2593 else if (strcmp(lower, "utf-32") == 0)
2594 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2595 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002596
2597 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002598 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002599 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002600 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002601 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002602 if (buffer == NULL)
2603 goto onError;
2604 unicode = PyCodec_Decode(buffer, encoding, errors);
2605 if (unicode == NULL)
2606 goto onError;
2607 if (!PyUnicode_Check(unicode)) {
2608 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002609 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002610 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002611 Py_DECREF(unicode);
2612 goto onError;
2613 }
2614 Py_DECREF(buffer);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02002615 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002616 Py_DECREF(unicode);
2617 return NULL;
2618 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002619 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002620
Benjamin Peterson29060642009-01-31 22:14:21 +00002621 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002622 Py_XDECREF(buffer);
2623 return NULL;
2624}
2625
Alexander Belopolsky40018472011-02-26 01:02:56 +00002626PyObject *
2627PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002628 const char *encoding,
2629 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002630{
2631 PyObject *v;
2632
2633 if (!PyUnicode_Check(unicode)) {
2634 PyErr_BadArgument();
2635 goto onError;
2636 }
2637
2638 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002639 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002640
2641 /* Decode via the codec registry */
2642 v = PyCodec_Decode(unicode, encoding, errors);
2643 if (v == NULL)
2644 goto onError;
2645 return v;
2646
Benjamin Peterson29060642009-01-31 22:14:21 +00002647 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002648 return NULL;
2649}
2650
Alexander Belopolsky40018472011-02-26 01:02:56 +00002651PyObject *
2652PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002653 const char *encoding,
2654 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002655{
2656 PyObject *v;
2657
2658 if (!PyUnicode_Check(unicode)) {
2659 PyErr_BadArgument();
2660 goto onError;
2661 }
2662
2663 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002664 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002665
2666 /* Decode via the codec registry */
2667 v = PyCodec_Decode(unicode, encoding, errors);
2668 if (v == NULL)
2669 goto onError;
2670 if (!PyUnicode_Check(v)) {
2671 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002672 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002673 Py_TYPE(v)->tp_name);
2674 Py_DECREF(v);
2675 goto onError;
2676 }
2677 return v;
2678
Benjamin Peterson29060642009-01-31 22:14:21 +00002679 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002680 return NULL;
2681}
2682
Alexander Belopolsky40018472011-02-26 01:02:56 +00002683PyObject *
2684PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002685 Py_ssize_t size,
2686 const char *encoding,
2687 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002688{
2689 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002690
Guido van Rossumd57fd912000-03-10 22:53:23 +00002691 unicode = PyUnicode_FromUnicode(s, size);
2692 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002693 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002694 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
2695 Py_DECREF(unicode);
2696 return v;
2697}
2698
Alexander Belopolsky40018472011-02-26 01:02:56 +00002699PyObject *
2700PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002701 const char *encoding,
2702 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002703{
2704 PyObject *v;
2705
2706 if (!PyUnicode_Check(unicode)) {
2707 PyErr_BadArgument();
2708 goto onError;
2709 }
2710
2711 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002712 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002713
2714 /* Encode via the codec registry */
2715 v = PyCodec_Encode(unicode, encoding, errors);
2716 if (v == NULL)
2717 goto onError;
2718 return v;
2719
Benjamin Peterson29060642009-01-31 22:14:21 +00002720 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002721 return NULL;
2722}
2723
Victor Stinnerad158722010-10-27 00:25:46 +00002724PyObject *
2725PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00002726{
Victor Stinner99b95382011-07-04 14:23:54 +02002727#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002728 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2729 PyUnicode_GET_SIZE(unicode),
2730 NULL);
2731#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002732 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00002733#else
Victor Stinner793b5312011-04-27 00:24:21 +02002734 PyInterpreterState *interp = PyThreadState_GET()->interp;
2735 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2736 cannot use it to encode and decode filenames before it is loaded. Load
2737 the Python codec requires to encode at least its own filename. Use the C
2738 version of the locale codec until the codec registry is initialized and
2739 the Python codec is loaded.
2740
2741 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2742 cannot only rely on it: check also interp->fscodec_initialized for
2743 subinterpreters. */
2744 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00002745 return PyUnicode_AsEncodedString(unicode,
2746 Py_FileSystemDefaultEncoding,
2747 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00002748 }
2749 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002750 /* locale encoding with surrogateescape */
2751 wchar_t *wchar;
2752 char *bytes;
2753 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00002754 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002755
2756 wchar = PyUnicode_AsWideCharString(unicode, NULL);
2757 if (wchar == NULL)
2758 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002759 bytes = _Py_wchar2char(wchar, &error_pos);
2760 if (bytes == NULL) {
2761 if (error_pos != (size_t)-1) {
2762 char *errmsg = strerror(errno);
2763 PyObject *exc = NULL;
2764 if (errmsg == NULL)
2765 errmsg = "Py_wchar2char() failed";
2766 raise_encode_exception(&exc,
2767 "filesystemencoding",
2768 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
2769 error_pos, error_pos+1,
2770 errmsg);
2771 Py_XDECREF(exc);
2772 }
2773 else
2774 PyErr_NoMemory();
2775 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002776 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002777 }
2778 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002779
2780 bytes_obj = PyBytes_FromString(bytes);
2781 PyMem_Free(bytes);
2782 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00002783 }
Victor Stinnerad158722010-10-27 00:25:46 +00002784#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00002785}
2786
Alexander Belopolsky40018472011-02-26 01:02:56 +00002787PyObject *
2788PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002789 const char *encoding,
2790 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002791{
2792 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00002793 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00002794
Guido van Rossumd57fd912000-03-10 22:53:23 +00002795 if (!PyUnicode_Check(unicode)) {
2796 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002797 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002798 }
Fred Drakee4315f52000-05-09 19:53:39 +00002799
Victor Stinner2f283c22011-03-02 01:21:46 +00002800 if (encoding == NULL) {
2801 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002802 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002803 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002804 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner2f283c22011-03-02 01:21:46 +00002805 }
Fred Drakee4315f52000-05-09 19:53:39 +00002806
2807 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002808 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002809 if ((strcmp(lower, "utf-8") == 0) ||
2810 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00002811 {
Victor Stinner2f283c22011-03-02 01:21:46 +00002812 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002813 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002814 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002815 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00002816 }
Victor Stinner37296e82010-06-10 13:36:23 +00002817 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002818 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002819 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002820 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002821#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002822 else if (strcmp(lower, "mbcs") == 0)
2823 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2824 PyUnicode_GET_SIZE(unicode),
2825 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002826#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002827 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002828 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00002829 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002830
2831 /* Encode via the codec registry */
2832 v = PyCodec_Encode(unicode, encoding, errors);
2833 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002834 return NULL;
2835
2836 /* The normal path */
2837 if (PyBytes_Check(v))
2838 return v;
2839
2840 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002841 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002842 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002843 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002844
2845 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
2846 "encoder %s returned bytearray instead of bytes",
2847 encoding);
2848 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002849 Py_DECREF(v);
2850 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002851 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002852
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002853 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
2854 Py_DECREF(v);
2855 return b;
2856 }
2857
2858 PyErr_Format(PyExc_TypeError,
2859 "encoder did not return a bytes object (type=%.400s)",
2860 Py_TYPE(v)->tp_name);
2861 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002862 return NULL;
2863}
2864
Alexander Belopolsky40018472011-02-26 01:02:56 +00002865PyObject *
2866PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002867 const char *encoding,
2868 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002869{
2870 PyObject *v;
2871
2872 if (!PyUnicode_Check(unicode)) {
2873 PyErr_BadArgument();
2874 goto onError;
2875 }
2876
2877 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002878 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002879
2880 /* Encode via the codec registry */
2881 v = PyCodec_Encode(unicode, encoding, errors);
2882 if (v == NULL)
2883 goto onError;
2884 if (!PyUnicode_Check(v)) {
2885 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002886 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002887 Py_TYPE(v)->tp_name);
2888 Py_DECREF(v);
2889 goto onError;
2890 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002891 return v;
Tim Petersced69f82003-09-16 20:30:58 +00002892
Benjamin Peterson29060642009-01-31 22:14:21 +00002893 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002894 return NULL;
2895}
2896
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002897PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00002898PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002899 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00002900 return PyUnicode_DecodeFSDefaultAndSize(s, size);
2901}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002902
Christian Heimes5894ba72007-11-04 11:43:14 +00002903PyObject*
2904PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
2905{
Victor Stinner99b95382011-07-04 14:23:54 +02002906#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002907 return PyUnicode_DecodeMBCS(s, size, NULL);
2908#elif defined(__APPLE__)
2909 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
2910#else
Victor Stinner793b5312011-04-27 00:24:21 +02002911 PyInterpreterState *interp = PyThreadState_GET()->interp;
2912 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2913 cannot use it to encode and decode filenames before it is loaded. Load
2914 the Python codec requires to encode at least its own filename. Use the C
2915 version of the locale codec until the codec registry is initialized and
2916 the Python codec is loaded.
2917
2918 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2919 cannot only rely on it: check also interp->fscodec_initialized for
2920 subinterpreters. */
2921 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002922 return PyUnicode_Decode(s, size,
2923 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00002924 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002925 }
2926 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002927 /* locale encoding with surrogateescape */
2928 wchar_t *wchar;
2929 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00002930 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002931
2932 if (s[size] != '\0' || size != strlen(s)) {
2933 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
2934 return NULL;
2935 }
2936
Victor Stinner168e1172010-10-16 23:16:16 +00002937 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002938 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00002939 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002940
Victor Stinner168e1172010-10-16 23:16:16 +00002941 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002942 PyMem_Free(wchar);
2943 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002944 }
Victor Stinnerad158722010-10-27 00:25:46 +00002945#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002946}
2947
Martin v. Löwis011e8422009-05-05 04:43:17 +00002948
2949int
2950PyUnicode_FSConverter(PyObject* arg, void* addr)
2951{
2952 PyObject *output = NULL;
2953 Py_ssize_t size;
2954 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00002955 if (arg == NULL) {
2956 Py_DECREF(*(PyObject**)addr);
2957 return 1;
2958 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00002959 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00002960 output = arg;
2961 Py_INCREF(output);
2962 }
2963 else {
2964 arg = PyUnicode_FromObject(arg);
2965 if (!arg)
2966 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00002967 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00002968 Py_DECREF(arg);
2969 if (!output)
2970 return 0;
2971 if (!PyBytes_Check(output)) {
2972 Py_DECREF(output);
2973 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
2974 return 0;
2975 }
2976 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00002977 size = PyBytes_GET_SIZE(output);
2978 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00002979 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05002980 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00002981 Py_DECREF(output);
2982 return 0;
2983 }
2984 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00002985 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00002986}
2987
2988
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002989int
2990PyUnicode_FSDecoder(PyObject* arg, void* addr)
2991{
2992 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002993 if (arg == NULL) {
2994 Py_DECREF(*(PyObject**)addr);
2995 return 1;
2996 }
2997 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002998 if (PyUnicode_READY(arg))
2999 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003000 output = arg;
3001 Py_INCREF(output);
3002 }
3003 else {
3004 arg = PyBytes_FromObject(arg);
3005 if (!arg)
3006 return 0;
3007 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3008 PyBytes_GET_SIZE(arg));
3009 Py_DECREF(arg);
3010 if (!output)
3011 return 0;
3012 if (!PyUnicode_Check(output)) {
3013 Py_DECREF(output);
3014 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3015 return 0;
3016 }
3017 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003018 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
3019 PyUnicode_GET_LENGTH(output), 0, 1)) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003020 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3021 Py_DECREF(output);
3022 return 0;
3023 }
3024 *(PyObject**)addr = output;
3025 return Py_CLEANUP_SUPPORTED;
3026}
3027
3028
Martin v. Löwis5b222132007-06-10 09:51:05 +00003029char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003030PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003031{
Christian Heimesf3863112007-11-22 07:46:41 +00003032 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003033 PyUnicodeObject *u = (PyUnicodeObject *)unicode;
3034
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003035 if (!PyUnicode_Check(unicode)) {
3036 PyErr_BadArgument();
3037 return NULL;
3038 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003039 if (PyUnicode_READY(u) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003040 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003041
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003042 if (PyUnicode_UTF8(unicode) == NULL) {
3043 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003044 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3045 if (bytes == NULL)
3046 return NULL;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003047 _PyUnicode_UTF8(u) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3048 if (_PyUnicode_UTF8(u) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003049 Py_DECREF(bytes);
3050 return NULL;
3051 }
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003052 _PyUnicode_UTF8_LENGTH(u) = PyBytes_GET_SIZE(bytes);
3053 Py_MEMCPY(_PyUnicode_UTF8(u), PyBytes_AS_STRING(bytes), _PyUnicode_UTF8_LENGTH(u) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003054 Py_DECREF(bytes);
3055 }
3056
3057 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003058 *psize = PyUnicode_UTF8_LENGTH(unicode);
3059 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003060}
3061
3062char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003063PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003064{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003065 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3066}
3067
#ifdef Py_DEBUG
/* Debug-build counter: incremented each time PyUnicode_AsUnicodeAndSize()
   has to build the wchar_t representation of an object. */
int unicode_as_unicode_calls = 0;
#endif


/* Return the wchar_t (Py_UNICODE) representation of `unicode`.

   If the object has no wstr buffer yet, one is allocated with
   PyObject_MALLOC(), filled from the object's 1-, 2- or 4-byte data,
   NUL-terminated and stored in the object so later calls reuse it.

   unicode: must be a unicode object, otherwise PyErr_BadArgument() is
            raised and NULL is returned.
   size:    if not NULL, receives PyUnicode_WSTR_LENGTH(u).

   Returns NULL with MemoryError set if the buffer cannot be allocated. */
Py_UNICODE *
PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
{
    PyUnicodeObject *u;
    const unsigned char *one_byte;
#if SIZEOF_WCHAR_T == 4
    const Py_UCS2 *two_bytes;
#else
    const Py_UCS4 *four_bytes;
    const Py_UCS4 *ucs4_end;
    Py_ssize_t num_surrogates;
#endif
    wchar_t *w;
    wchar_t *wchar_end;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }
    u = (PyUnicodeObject*)unicode;
    if (_PyUnicode_WSTR(u) == NULL) {
        /* Non-ASCII compact unicode object */
        assert(_PyUnicode_KIND(u) != 0);
        assert(PyUnicode_IS_READY(u));

#ifdef Py_DEBUG
        ++unicode_as_unicode_calls;
#endif

        if (PyUnicode_KIND(u) == PyUnicode_4BYTE_KIND) {
#if SIZEOF_WCHAR_T == 2
            /* First pass: count code points above 0xFFFF, each of which
               needs two wchar_t units (a surrogate pair). */
            four_bytes = PyUnicode_4BYTE_DATA(u);
            ucs4_end = four_bytes + _PyUnicode_LENGTH(u);
            num_surrogates = 0;

            for (; four_bytes < ucs4_end; ++four_bytes) {
                if (*four_bytes > 0xFFFF)
                    ++num_surrogates;
            }

            /* One unit per code point, one extra per surrogate pair, plus
               the terminating NUL. */
            _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(
                    sizeof(wchar_t) * (_PyUnicode_LENGTH(u) + 1 + num_surrogates));
            if (!_PyUnicode_WSTR(u)) {
                PyErr_NoMemory();
                return NULL;
            }
            _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u) + num_surrogates;

            /* Second pass: copy, splitting large code points. */
            w = _PyUnicode_WSTR(u);
            wchar_end = w + _PyUnicode_WSTR_LENGTH(u);
            four_bytes = PyUnicode_4BYTE_DATA(u);
            for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
                if (*four_bytes > 0xFFFF) {
                    /* encode surrogate pair in this case */
                    *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10);
                    *w   = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF);
                }
                else
                    *w = *four_bytes;

                /* Sanity check: only active when assertions are enabled. */
                if (w > wchar_end) {
                    assert(0 && "Miscalculated string end");
                }
            }
            *w = 0;
#else
            /* sizeof(wchar_t) == 4 */
            Py_FatalError("Impossible unicode object state, wstr and str "
                          "should share memory already.");
            return NULL;
#endif
        }
        else {
            /* 1-byte or 2-byte data: one wchar_t per character plus the
               terminating NUL. */
            _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
                                                  (_PyUnicode_LENGTH(u) + 1));
            if (!_PyUnicode_WSTR(u)) {
                PyErr_NoMemory();
                return NULL;
            }
            /* The wstr length is only stored for objects that are not
               compact ASCII.
               NOTE(review): presumably compact ASCII objects have no
               separate wstr_length field -- confirm against the struct
               layout. */
            if (!PyUnicode_IS_COMPACT_ASCII(u))
                _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u);
            w = _PyUnicode_WSTR(u);
            wchar_end = w + _PyUnicode_LENGTH(u);

            if (PyUnicode_KIND(u) == PyUnicode_1BYTE_KIND) {
                /* Widen each byte to a wchar_t. */
                one_byte = PyUnicode_1BYTE_DATA(u);
                for (; w < wchar_end; ++one_byte, ++w)
                    *w = *one_byte;
                /* null-terminate the wstr */
                *w = 0;
            }
            else if (PyUnicode_KIND(u) == PyUnicode_2BYTE_KIND) {
#if SIZEOF_WCHAR_T == 4
                /* Widen each 2-byte unit to a 4-byte wchar_t. */
                two_bytes = PyUnicode_2BYTE_DATA(u);
                for (; w < wchar_end; ++two_bytes, ++w)
                    *w = *two_bytes;
                /* null-terminate the wstr */
                *w = 0;
#else
                /* sizeof(wchar_t) == 2 */
                /* Release the buffer allocated above before aborting. */
                PyObject_FREE(_PyUnicode_WSTR(u));
                _PyUnicode_WSTR(u) = NULL;
                Py_FatalError("Impossible unicode object state, wstr "
                              "and str should share memory already.");
                return NULL;
#endif
            }
            else {
                assert(0 && "This should never happen.");
            }
        }
    }
    if (size != NULL)
        *size = PyUnicode_WSTR_LENGTH(u);
    return _PyUnicode_WSTR(u);
}
3189
Alexander Belopolsky40018472011-02-26 01:02:56 +00003190Py_UNICODE *
3191PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003192{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003193 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003194}
3195
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003196
Alexander Belopolsky40018472011-02-26 01:02:56 +00003197Py_ssize_t
3198PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003199{
3200 if (!PyUnicode_Check(unicode)) {
3201 PyErr_BadArgument();
3202 goto onError;
3203 }
3204 return PyUnicode_GET_SIZE(unicode);
3205
Benjamin Peterson29060642009-01-31 22:14:21 +00003206 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003207 return -1;
3208}
3209
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003210Py_ssize_t
3211PyUnicode_GetLength(PyObject *unicode)
3212{
Victor Stinner5a706cf2011-10-02 00:36:53 +02003213 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003214 PyErr_BadArgument();
3215 return -1;
3216 }
3217
3218 return PyUnicode_GET_LENGTH(unicode);
3219}
3220
3221Py_UCS4
3222PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3223{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003224 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3225 PyErr_BadArgument();
3226 return (Py_UCS4)-1;
3227 }
3228 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3229 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003230 return (Py_UCS4)-1;
3231 }
3232 return PyUnicode_READ_CHAR(unicode, index);
3233}
3234
3235int
3236PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3237{
3238 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003239 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003240 return -1;
3241 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02003242 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3243 PyErr_SetString(PyExc_IndexError, "string index out of range");
3244 return -1;
3245 }
3246 if (_PyUnicode_Dirty(unicode))
3247 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003248 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3249 index, ch);
3250 return 0;
3251}
3252
/* Return the name of the default encoding, which is always "utf-8".
   The string has static storage and must not be modified or freed. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3258
Victor Stinner554f3f02010-06-16 23:33:54 +00003259/* create or adjust a UnicodeDecodeError */
3260static void
3261make_decode_exception(PyObject **exceptionObject,
3262 const char *encoding,
3263 const char *input, Py_ssize_t length,
3264 Py_ssize_t startpos, Py_ssize_t endpos,
3265 const char *reason)
3266{
3267 if (*exceptionObject == NULL) {
3268 *exceptionObject = PyUnicodeDecodeError_Create(
3269 encoding, input, length, startpos, endpos, reason);
3270 }
3271 else {
3272 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3273 goto onError;
3274 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3275 goto onError;
3276 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3277 goto onError;
3278 }
3279 return;
3280
3281onError:
3282 Py_DECREF(*exceptionObject);
3283 *exceptionObject = NULL;
3284}
3285
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003286/* error handling callback helper:
3287 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00003288 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003289 and adjust various state variables.
3290 return 0 on success, -1 on error
3291*/
3292
Alexander Belopolsky40018472011-02-26 01:02:56 +00003293static int
3294unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003295 const char *encoding, const char *reason,
3296 const char **input, const char **inend, Py_ssize_t *startinpos,
3297 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
3298 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003299{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003300 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003301
3302 PyObject *restuple = NULL;
3303 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003304 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003305 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003306 Py_ssize_t requiredsize;
3307 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003308 const Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003309 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003310 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003311 int res = -1;
3312
3313 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003314 *errorHandler = PyCodec_LookupError(errors);
3315 if (*errorHandler == NULL)
3316 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003317 }
3318
Victor Stinner554f3f02010-06-16 23:33:54 +00003319 make_decode_exception(exceptionObject,
3320 encoding,
3321 *input, *inend - *input,
3322 *startinpos, *endinpos,
3323 reason);
3324 if (*exceptionObject == NULL)
3325 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003326
3327 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3328 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003329 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003330 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00003331 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00003332 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003333 }
3334 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003335 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003336
3337 /* Copy back the bytes variables, which might have been modified by the
3338 callback */
3339 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3340 if (!inputobj)
3341 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00003342 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003343 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00003344 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003345 *input = PyBytes_AS_STRING(inputobj);
3346 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003347 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00003348 /* we can DECREF safely, as the exception has another reference,
3349 so the object won't go away. */
3350 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003351
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003352 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003353 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003354 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003355 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3356 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003357 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003358
3359 /* need more space? (at least enough for what we
3360 have+the replacement+the rest of the string (starting
3361 at the new input position), so we won't have to check space
3362 when there are no errors in the rest of the string) */
3363 repptr = PyUnicode_AS_UNICODE(repunicode);
3364 repsize = PyUnicode_GET_SIZE(repunicode);
3365 requiredsize = *outpos + repsize + insize-newpos;
3366 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003367 if (requiredsize<2*outsize)
3368 requiredsize = 2*outsize;
Victor Stinnerfe226c02011-10-03 03:52:20 +02003369 if (PyUnicode_Resize((PyObject**)output, requiredsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003370 goto onError;
3371 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003372 }
3373 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003374 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003375 Py_UNICODE_COPY(*outptr, repptr, repsize);
3376 *outptr += repsize;
3377 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003378
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003379 /* we made it! */
3380 res = 0;
3381
Benjamin Peterson29060642009-01-31 22:14:21 +00003382 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003383 Py_XDECREF(restuple);
3384 return res;
3385}
3386
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003387/* --- UTF-7 Codec -------------------------------------------------------- */
3388
Antoine Pitrou244651a2009-05-04 18:56:13 +00003389/* See RFC2152 for details. We encode conservatively and decode liberally. */
3390
/* Three simple macros defining base-64. */

/* Is c a base-64 character?
   True for 'A'-'Z', 'a'-'z', '0'-'9', '+' and '/'.  The argument is
   evaluated several times, so it must not have side effects. */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value?
   'A'-'Z' map to 0-25, 'a'-'z' to 26-51, '0'-'9' to 52-61, '+' to 62 and
   anything else to 63; the macro does not itself verify that c is valid,
   so callers are expected to test IS_BASE64(c) first. */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n?
   Higher bits of n are masked off with 0x3f before the table lookup. */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')
3421
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code (0-127) and is never written to,
 * hence const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c:        character code; only values 1..127 can be direct (NUL and
 *           non-ASCII never are).  Evaluated several times, so it must
 *           not have side effects.
 * directO:  non-zero if Set O characters are written as themselves.
 * directWS: non-zero if whitespace characters are written as themselves.
 *
 * The flag arguments are parenthesized so that an expression argument
 * such as `a || b` cannot re-associate with the && next to it. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003467
Alexander Belopolsky40018472011-02-26 01:02:56 +00003468PyObject *
3469PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003470 Py_ssize_t size,
3471 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003472{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003473 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3474}
3475
/* The decoder.  The only state we preserve is our read position,
 * i.e. how many characters we have consumed.  So if we end in the
 * middle of a shift sequence we have to back off the read position
 * and the output to the beginning of the sequence, otherwise we lose
 * all the shift state (seen bits, number of bits seen, high
 * surrogate).
 *
 * s, size:  input bytes.
 * errors:   name of the error handler, passed on to
 *           unicode_decode_call_errorhandler().
 * consumed: if not NULL, receives the number of input bytes that were
 *           fully decoded; an unfinished shift sequence at the end of the
 *           input is then not an error but is left unconsumed.  If NULL,
 *           the input is treated as complete.
 *
 * Returns the decoded object, or NULL on error. */

PyObject *
PyUnicode_DecodeUTF7Stateful(const char *s,
                             Py_ssize_t size,
                             const char *errors,
                             Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
    const char *e;
    PyUnicodeObject *unicode;
    Py_UNICODE *p;
    const char *errmsg = "";
    int inShift = 0;                  /* non-zero inside a '+...' section */
    Py_UNICODE *shiftOutStart;        /* output position where the current
                                         shift sequence started */
    unsigned int base64bits = 0;      /* number of pending bits */
    unsigned long base64buffer = 0;   /* pending bits */
    Py_UNICODE surrogate = 0;         /* pending first surrogate, if any */
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    /* The output is allocated with one slot per input byte and shrunk to
       its final length at the end. */
    unicode = _PyUnicode_New(size);
    if (!unicode)
        return NULL;
    if (size == 0) {
        if (consumed)
            *consumed = 0;
        return (PyObject *)unicode;
    }

    p = PyUnicode_AS_UNICODE(unicode);
    shiftOutStart = p;
    e = s + size;

    while (s < e) {
        Py_UNICODE ch;
      restart:
        ch = (unsigned char) *s;

        if (inShift) { /* in a base-64 section */
            if (IS_BASE64(ch)) { /* consume a base-64 character */
                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
                base64bits += 6;
                s++;
                if (base64bits >= 16) {
                    /* we have enough bits for a UTF-16 value */
                    Py_UNICODE outCh = (Py_UNICODE)
                                       (base64buffer >> (base64bits-16));
                    base64bits -= 16;
                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
                    if (surrogate) {
                        /* expecting a second surrogate */
                        if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
#ifdef Py_UNICODE_WIDE
                            /* combine the pair into one code point */
                            *p++ = (((surrogate & 0x3FF)<<10)
                                    | (outCh & 0x3FF)) + 0x10000;
#else
                            /* keep the pair as two UTF-16 units */
                            *p++ = surrogate;
                            *p++ = outCh;
#endif
                            surrogate = 0;
                        }
                        else {
                            surrogate = 0;
                            errmsg = "second surrogate missing";
                            goto utf7Error;
                        }
                    }
                    else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
                        /* first surrogate */
                        surrogate = outCh;
                    }
                    else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
                        errmsg = "unexpected second surrogate";
                        goto utf7Error;
                    }
                    else {
                        *p++ = outCh;
                    }
                }
            }
            else { /* now leaving a base-64 section */
                inShift = 0;
                s++;
                if (surrogate) {
                    errmsg = "second surrogate missing at end of shift sequence";
                    goto utf7Error;
                }
                if (base64bits > 0) { /* left-over bits */
                    if (base64bits >= 6) {
                        /* We've seen at least one base-64 character */
                        errmsg = "partial character in shift sequence";
                        goto utf7Error;
                    }
                    else {
                        /* Some bits remain; they should be zero */
                        if (base64buffer != 0) {
                            errmsg = "non-zero padding bits in shift sequence";
                            goto utf7Error;
                        }
                    }
                }
                if (ch != '-') {
                    /* '-' is absorbed; other terminating
                       characters are preserved */
                    *p++ = ch;
                }
            }
        }
        else if ( ch == '+' ) {
            /* remember where the sequence starts for error reporting and
               for backing off in stateful mode */
            startinpos = s-starts;
            s++; /* consume '+' */
            if (s < e && *s == '-') { /* '+-' encodes '+' */
                s++;
                *p++ = '+';
            }
            else { /* begin base64-encoded section */
                inShift = 1;
                shiftOutStart = p;
                base64bits = 0;
            }
        }
        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
            *p++ = ch;
            s++;
        }
        else {
            startinpos = s-starts;
            s++;
            errmsg = "unexpected special character";
            goto utf7Error;
        }
        continue;
utf7Error:
        /* Hand the error to the handler; it may resize the output and
           replace the input, updating starts, e, s, unicode and p. */
        outpos = p-PyUnicode_AS_UNICODE(unicode);
        endinpos = s-starts;
        if (unicode_decode_call_errorhandler(
                errors, &errorHandler,
                "utf7", errmsg,
                &starts, &e, &startinpos, &endinpos, &exc, &s,
                &unicode, &outpos, &p))
            goto onError;
    }

    /* end of string */

    if (inShift && !consumed) { /* in shift sequence, no more to follow */
        /* if we're in an inconsistent state, that's an error */
        if (surrogate ||
                (base64bits >= 6) ||
                (base64bits > 0 && base64buffer != 0)) {
            outpos = p-PyUnicode_AS_UNICODE(unicode);
            endinpos = size;
            if (unicode_decode_call_errorhandler(
                    errors, &errorHandler,
                    "utf7", "unterminated shift sequence",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    &unicode, &outpos, &p))
                goto onError;
            /* the handler may have moved the read position back into the
               input; if so, resume decoding inside the loop */
            if (s < e)
                goto restart;
        }
    }

    /* return state */
    if (consumed) {
        if (inShift) {
            p = shiftOutStart; /* back off output */
            *consumed = startinpos;
        }
        else {
            *consumed = s-starts;
        }
    }

    /* shrink the output to the number of characters actually written */
    if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
        goto onError;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    if (_PyUnicode_READY_REPLACE(&unicode)) {
        Py_DECREF(unicode);
        return NULL;
    }
    return (PyObject *)unicode;

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_DECREF(unicode);
    return NULL;
}
3676
3677
Alexander Belopolsky40018472011-02-26 01:02:56 +00003678PyObject *
3679PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003680 Py_ssize_t size,
3681 int base64SetO,
3682 int base64WhiteSpace,
3683 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003684{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003685 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003686 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003687 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003688 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003689 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003690 unsigned int base64bits = 0;
3691 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003692 char * out;
3693 char * start;
3694
3695 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003696 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003697
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003698 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003699 return PyErr_NoMemory();
3700
Antoine Pitrou244651a2009-05-04 18:56:13 +00003701 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003702 if (v == NULL)
3703 return NULL;
3704
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003705 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003706 for (;i < size; ++i) {
3707 Py_UNICODE ch = s[i];
3708
Antoine Pitrou244651a2009-05-04 18:56:13 +00003709 if (inShift) {
3710 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3711 /* shifting out */
3712 if (base64bits) { /* output remaining bits */
3713 *out++ = TO_BASE64(base64buffer << (6-base64bits));
3714 base64buffer = 0;
3715 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003716 }
3717 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003718 /* Characters not in the BASE64 set implicitly unshift the sequence
3719 so no '-' is required, except if the character is itself a '-' */
3720 if (IS_BASE64(ch) || ch == '-') {
3721 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003722 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003723 *out++ = (char) ch;
3724 }
3725 else {
3726 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00003727 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003728 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003729 else { /* not in a shift sequence */
3730 if (ch == '+') {
3731 *out++ = '+';
3732 *out++ = '-';
3733 }
3734 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3735 *out++ = (char) ch;
3736 }
3737 else {
3738 *out++ = '+';
3739 inShift = 1;
3740 goto encode_char;
3741 }
3742 }
3743 continue;
3744encode_char:
3745#ifdef Py_UNICODE_WIDE
3746 if (ch >= 0x10000) {
3747 /* code first surrogate */
3748 base64bits += 16;
3749 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
3750 while (base64bits >= 6) {
3751 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3752 base64bits -= 6;
3753 }
3754 /* prepare second surrogate */
3755 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
3756 }
3757#endif
3758 base64bits += 16;
3759 base64buffer = (base64buffer << 16) | ch;
3760 while (base64bits >= 6) {
3761 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3762 base64bits -= 6;
3763 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00003764 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003765 if (base64bits)
3766 *out++= TO_BASE64(base64buffer << (6-base64bits) );
3767 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003768 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003769 if (_PyBytes_Resize(&v, out - start) < 0)
3770 return NULL;
3771 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003772}
3773
Antoine Pitrou244651a2009-05-04 18:56:13 +00003774#undef IS_BASE64
3775#undef FROM_BASE64
3776#undef TO_BASE64
3777#undef DECODE_DIRECT
3778#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003779
Guido van Rossumd57fd912000-03-10 22:53:23 +00003780/* --- UTF-8 Codec -------------------------------------------------------- */
3781
/* Lookup table indexed by the first byte of a UTF-8 sequence; the entry is
   the total length in bytes of the sequence that byte introduces.  An entry
   of zero marks a byte that cannot start a sequence.  See RFC 3629. */
static
char utf8_code_length[256] = {
    /* 00-0F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 10-1F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 20-2F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 30-3F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 40-4F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 50-5F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 60-6F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 70-7F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 80-8F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 90-9F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* A0-AF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* B0-BF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* C0-C1 are illegal, C2-CF start two-byte sequences */
    /* C0-CF */ 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* D0-DF */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* E0-EF */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* F0-F4 start four-byte sequences, F5-FF are illegal */
    /* F0-FF */ 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
3803
Alexander Belopolsky40018472011-02-26 01:02:56 +00003804PyObject *
3805PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003806 Py_ssize_t size,
3807 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003808{
Walter Dörwald69652032004-09-07 20:24:22 +00003809 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3810}
3811
Antoine Pitrouab868312009-01-10 15:40:25 +00003812/* Mask to check or force alignment of a pointer to C 'long' boundaries */
3813#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
3814
3815/* Mask to quickly check whether a C 'long' contains a
3816 non-ASCII, UTF8-encoded char. */
3817#if (SIZEOF_LONG == 8)
3818# define ASCII_CHAR_MASK 0x8080808080808080L
3819#elif (SIZEOF_LONG == 4)
3820# define ASCII_CHAR_MASK 0x80808080L
3821#else
3822# error C 'long' size should be either 4 or 8!
3823#endif
3824
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003825/* Scans a UTF-8 string and returns the maximum character to be expected,
3826 the size of the decoded unicode string and if any major errors were
3827 encountered.
3828
3829 This function does check basic UTF-8 sanity, it does however NOT CHECK
3830 if the string contains surrogates, and if all continuation bytes are
3831 within the correct ranges, these checks are performed in
3832 PyUnicode_DecodeUTF8Stateful.
3833
3834 If it sets has_errors to 1, it means the value of unicode_size and max_char
3835 will be bogus and you should not rely on useful information in them.
3836 */
3837static Py_UCS4
3838utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
3839 Py_ssize_t *unicode_size, Py_ssize_t* consumed,
3840 int *has_errors)
3841{
3842 Py_ssize_t n;
3843 Py_ssize_t char_count = 0;
3844 Py_UCS4 max_char = 127, new_max;
3845 Py_UCS4 upper_bound;
3846 const unsigned char *p = (const unsigned char *)s;
3847 const unsigned char *end = p + string_size;
3848 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
3849 int err = 0;
3850
3851 for (; p < end && !err; ++p, ++char_count) {
3852 /* Only check value if it's not a ASCII char... */
3853 if (*p < 0x80) {
3854 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
3855 an explanation. */
3856 if (!((size_t) p & LONG_PTR_MASK)) {
3857 /* Help register allocation */
3858 register const unsigned char *_p = p;
3859 while (_p < aligned_end) {
3860 unsigned long value = *(unsigned long *) _p;
3861 if (value & ASCII_CHAR_MASK)
3862 break;
3863 _p += SIZEOF_LONG;
3864 char_count += SIZEOF_LONG;
3865 }
3866 p = _p;
3867 if (p == end)
3868 break;
3869 }
3870 }
3871 if (*p >= 0x80) {
3872 n = utf8_code_length[*p];
3873 new_max = max_char;
3874 switch (n) {
3875 /* invalid start byte */
3876 case 0:
3877 err = 1;
3878 break;
3879 case 2:
3880 /* Code points between 0x00FF and 0x07FF inclusive.
3881 Approximate the upper bound of the code point,
3882 if this flips over 255 we can be sure it will be more
3883 than 255 and the string will need 2 bytes per code coint,
3884 if it stays under or equal to 255, we can be sure 1 byte
3885 is enough.
3886 ((*p & 0b00011111) << 6) | 0b00111111 */
3887 upper_bound = ((*p & 0x1F) << 6) | 0x3F;
3888 if (max_char < upper_bound)
3889 new_max = upper_bound;
3890 /* Ensure we track at least that we left ASCII space. */
3891 if (new_max < 128)
3892 new_max = 128;
3893 break;
3894 case 3:
3895 /* Between 0x0FFF and 0xFFFF inclusive, so values are
3896 always > 255 and <= 65535 and will always need 2 bytes. */
3897 if (max_char < 65535)
3898 new_max = 65535;
3899 break;
3900 case 4:
3901 /* Code point will be above 0xFFFF for sure in this case. */
3902 new_max = 65537;
3903 break;
3904 /* Internal error, this should be caught by the first if */
3905 case 1:
3906 default:
3907 assert(0 && "Impossible case in utf8_max_char_and_size");
3908 err = 1;
3909 }
3910 /* Instead of number of overall bytes for this code point,
3911 n containts the number of following bytes: */
3912 --n;
3913 /* Check if the follow up chars are all valid continuation bytes */
3914 if (n >= 1) {
3915 const unsigned char *cont;
3916 if ((p + n) >= end) {
3917 if (consumed == 0)
3918 /* incomplete data, non-incremental decoding */
3919 err = 1;
3920 break;
3921 }
3922 for (cont = p + 1; cont < (p + n); ++cont) {
3923 if ((*cont & 0xc0) != 0x80) {
3924 err = 1;
3925 break;
3926 }
3927 }
3928 p += n;
3929 }
3930 else
3931 err = 1;
3932 max_char = new_max;
3933 }
3934 }
3935
3936 if (unicode_size)
3937 *unicode_size = char_count;
3938 if (has_errors)
3939 *has_errors = err;
3940 return max_char;
3941}
3942
/* Store `value` at position `index` of the character buffer `buf`, casting
   both the buffer and the value to the element type selected by `kind`:
   Py_UNICODE for PyUnicode_WCHAR_KIND (the wstr field of the legacy
   representation), unsigned char for PyUnicode_1BYTE_KIND, Py_UCS2 for
   PyUnicode_2BYTE_KIND and Py_UCS4 for any other kind.

   `kind` is evaluated exactly once; `index` and `value` are evaluated once,
   in whichever branch is taken. */
#define WRITE_FLEXIBLE_OR_WSTR(kind, buf, index, value)                     \
    do {                                                                    \
        const int target_kind_ = (kind);                                    \
        if (target_kind_ == PyUnicode_WCHAR_KIND) {                         \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value);           \
        }                                                                   \
        else if (target_kind_ == PyUnicode_1BYTE_KIND) {                    \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value);     \
        }                                                                   \
        else if (target_kind_ == PyUnicode_2BYTE_KIND) {                    \
            ((Py_UCS2 *)(buf))[(index)] = (Py_UCS2)(value);                 \
        }                                                                   \
        else {                                                              \
            ((Py_UCS4 *)(buf))[(index)] = (Py_UCS4)(value);                 \
        }                                                                   \
    } while (0)
3957
Alexander Belopolsky40018472011-02-26 01:02:56 +00003958PyObject *
3959PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003960 Py_ssize_t size,
3961 const char *errors,
3962 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00003963{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003964 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003965 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00003966 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003967 Py_ssize_t startinpos;
3968 Py_ssize_t endinpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00003969 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003970 PyUnicodeObject *unicode;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003971 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003972 PyObject *errorHandler = NULL;
3973 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003974 Py_UCS4 maxchar = 0;
3975 Py_ssize_t unicode_size;
3976 Py_ssize_t i;
3977 int kind;
3978 void *data;
3979 int has_errors;
3980 Py_UNICODE *error_outptr;
3981#if SIZEOF_WCHAR_T == 2
3982 Py_ssize_t wchar_offset = 0;
3983#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003984
Walter Dörwald69652032004-09-07 20:24:22 +00003985 if (size == 0) {
3986 if (consumed)
3987 *consumed = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003988 return (PyObject *)PyUnicode_New(0, 0);
Walter Dörwald69652032004-09-07 20:24:22 +00003989 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003990 maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size,
3991 consumed, &has_errors);
3992 if (has_errors) {
3993 unicode = _PyUnicode_New(size);
3994 if (!unicode)
3995 return NULL;
3996 kind = PyUnicode_WCHAR_KIND;
3997 data = PyUnicode_AS_UNICODE(unicode);
3998 assert(data != NULL);
3999 }
4000 else {
4001 unicode = (PyUnicodeObject *)PyUnicode_New(unicode_size, maxchar);
4002 if (!unicode)
4003 return NULL;
4004 /* When the string is ASCII only, just use memcpy and return.
4005 unicode_size may be != size if there is an incomplete UTF-8
4006 sequence at the end of the ASCII block. */
4007 if (maxchar < 128 && size == unicode_size) {
4008 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
4009 return (PyObject *)unicode;
4010 }
4011 kind = PyUnicode_KIND(unicode);
4012 data = PyUnicode_DATA(unicode);
4013 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004014 /* Unpack UTF-8 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004015 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004016 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00004017 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004018
4019 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004020 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004021
4022 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00004023 /* Fast path for runs of ASCII characters. Given that common UTF-8
4024 input will consist of an overwhelming majority of ASCII
4025 characters, we try to optimize for this case by checking
4026 as many characters as a C 'long' can contain.
4027 First, check if we can do an aligned read, as most CPUs have
4028 a penalty for unaligned reads.
4029 */
4030 if (!((size_t) s & LONG_PTR_MASK)) {
4031 /* Help register allocation */
4032 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004033 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004034 while (_s < aligned_end) {
4035 /* Read a whole long at a time (either 4 or 8 bytes),
4036 and do a fast unrolled copy if it only contains ASCII
4037 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004038 unsigned long value = *(unsigned long *) _s;
4039 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00004040 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004041 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+0, _s[0]);
4042 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+1, _s[1]);
4043 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+2, _s[2]);
4044 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004045#if (SIZEOF_LONG == 8)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004046 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+4, _s[4]);
4047 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+5, _s[5]);
4048 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+6, _s[6]);
4049 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004050#endif
4051 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004052 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00004053 }
4054 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004055 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004056 if (s == e)
4057 break;
4058 ch = (unsigned char)*s;
4059 }
4060 }
4061
4062 if (ch < 0x80) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004063 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004064 s++;
4065 continue;
4066 }
4067
4068 n = utf8_code_length[ch];
4069
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004070 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004071 if (consumed)
4072 break;
4073 else {
4074 errmsg = "unexpected end of data";
4075 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004076 endinpos = startinpos+1;
4077 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
4078 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004079 goto utf8Error;
4080 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004081 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004082
4083 switch (n) {
4084
4085 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00004086 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004087 startinpos = s-starts;
4088 endinpos = startinpos+1;
4089 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004090
4091 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004092 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00004093 startinpos = s-starts;
4094 endinpos = startinpos+1;
4095 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004096
4097 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004098 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00004099 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004100 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004101 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00004102 goto utf8Error;
4103 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004104 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004105 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004106 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004107 break;
4108
4109 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00004110 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4111 will result in surrogates in range d800-dfff. Surrogates are
4112 not valid UTF-8 so they are rejected.
4113 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4114 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00004115 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004116 (s[2] & 0xc0) != 0x80 ||
4117 ((unsigned char)s[0] == 0xE0 &&
4118 (unsigned char)s[1] < 0xA0) ||
4119 ((unsigned char)s[0] == 0xED &&
4120 (unsigned char)s[1] > 0x9F)) {
4121 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004122 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004123 endinpos = startinpos + 1;
4124
4125 /* if s[1] first two bits are 1 and 0, then the invalid
4126 continuation byte is s[2], so increment endinpos by 1,
4127 if not, s[1] is invalid and endinpos doesn't need to
4128 be incremented. */
4129 if ((s[1] & 0xC0) == 0x80)
4130 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004131 goto utf8Error;
4132 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004133 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004134 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004135 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004136 break;
4137
4138 case 4:
4139 if ((s[1] & 0xc0) != 0x80 ||
4140 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004141 (s[3] & 0xc0) != 0x80 ||
4142 ((unsigned char)s[0] == 0xF0 &&
4143 (unsigned char)s[1] < 0x90) ||
4144 ((unsigned char)s[0] == 0xF4 &&
4145 (unsigned char)s[1] > 0x8F)) {
4146 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004147 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004148 endinpos = startinpos + 1;
4149 if ((s[1] & 0xC0) == 0x80) {
4150 endinpos++;
4151 if ((s[2] & 0xC0) == 0x80)
4152 endinpos++;
4153 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004154 goto utf8Error;
4155 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004156 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00004157 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4158 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4159
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004160 /* If the string is flexible or we have native UCS-4, write
4161 directly.. */
4162 if (sizeof(Py_UNICODE) > 2 || kind != PyUnicode_WCHAR_KIND)
4163 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Tim Petersced69f82003-09-16 20:30:58 +00004164
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004165 else {
4166 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00004167
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004168 /* translate from 10000..10FFFF to 0..FFFF */
4169 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00004170
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004171 /* high surrogate = top 10 bits added to D800 */
4172 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4173 (Py_UNICODE)(0xD800 + (ch >> 10)));
4174
4175 /* low surrogate = bottom 10 bits added to DC00 */
4176 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4177 (Py_UNICODE)(0xDC00 + (ch & 0x03FF)));
4178 }
4179#if SIZEOF_WCHAR_T == 2
4180 wchar_offset++;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004181#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00004182 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004183 }
4184 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00004185 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004186
Benjamin Peterson29060642009-01-31 22:14:21 +00004187 utf8Error:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004188 /* If this is not yet a resizable string, make it one.. */
4189 if (kind != PyUnicode_WCHAR_KIND) {
4190 const Py_UNICODE *u;
4191 PyUnicodeObject *new_unicode = _PyUnicode_New(size);
4192 if (!new_unicode)
4193 goto onError;
4194 u = PyUnicode_AsUnicode((PyObject *)unicode);
4195 if (!u)
4196 goto onError;
4197#if SIZEOF_WCHAR_T == 2
4198 i += wchar_offset;
4199#endif
4200 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(new_unicode), u, i);
4201 Py_DECREF(unicode);
4202 unicode = new_unicode;
4203 kind = 0;
4204 data = PyUnicode_AS_UNICODE(new_unicode);
4205 assert(data != NULL);
4206 }
4207 error_outptr = PyUnicode_AS_UNICODE(unicode) + i;
Benjamin Peterson29060642009-01-31 22:14:21 +00004208 if (unicode_decode_call_errorhandler(
4209 errors, &errorHandler,
4210 "utf8", errmsg,
4211 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004212 &unicode, &i, &error_outptr))
Benjamin Peterson29060642009-01-31 22:14:21 +00004213 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004214 /* Update data because unicode_decode_call_errorhandler might have
4215 re-created or resized the unicode object. */
4216 data = PyUnicode_AS_UNICODE(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00004217 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004218 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004219 /* Ensure the unicode_size calculation above was correct: */
4220 assert(kind == PyUnicode_WCHAR_KIND || i == unicode_size);
4221
Walter Dörwald69652032004-09-07 20:24:22 +00004222 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004223 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004224
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004225 /* Adjust length and ready string when it contained errors and
4226 is of the old resizable kind. */
4227 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004228 if (PyUnicode_Resize((PyObject**)&unicode, i) < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004229 goto onError;
4230 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004231
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004232 Py_XDECREF(errorHandler);
4233 Py_XDECREF(exc);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004234 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004235 Py_DECREF(unicode);
4236 return NULL;
4237 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004238 return (PyObject *)unicode;
4239
Benjamin Peterson29060642009-01-31 22:14:21 +00004240 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004241 Py_XDECREF(errorHandler);
4242 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004243 Py_DECREF(unicode);
4244 return NULL;
4245}
4246
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004247#undef WRITE_FLEXIBLE_OR_WSTR
Antoine Pitrouab868312009-01-10 15:40:25 +00004248
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004249#ifdef __APPLE__
4250
4251/* Simplified UTF-8 decoder using surrogateescape error handler,
4252 used to decode the command line arguments on Mac OS X. */
4253
4254wchar_t*
4255_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
4256{
4257 int n;
4258 const char *e;
4259 wchar_t *unicode, *p;
4260
4261 /* Note: size will always be longer than the resulting Unicode
4262 character count */
4263 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
4264 PyErr_NoMemory();
4265 return NULL;
4266 }
4267 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
4268 if (!unicode)
4269 return NULL;
4270
4271 /* Unpack UTF-8 encoded data */
4272 p = unicode;
4273 e = s + size;
4274 while (s < e) {
4275 Py_UCS4 ch = (unsigned char)*s;
4276
4277 if (ch < 0x80) {
4278 *p++ = (wchar_t)ch;
4279 s++;
4280 continue;
4281 }
4282
4283 n = utf8_code_length[ch];
4284 if (s + n > e) {
4285 goto surrogateescape;
4286 }
4287
4288 switch (n) {
4289 case 0:
4290 case 1:
4291 goto surrogateescape;
4292
4293 case 2:
4294 if ((s[1] & 0xc0) != 0x80)
4295 goto surrogateescape;
4296 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
4297 assert ((ch > 0x007F) && (ch <= 0x07FF));
4298 *p++ = (wchar_t)ch;
4299 break;
4300
4301 case 3:
4302 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4303 will result in surrogates in range d800-dfff. Surrogates are
4304 not valid UTF-8 so they are rejected.
4305 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4306 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
4307 if ((s[1] & 0xc0) != 0x80 ||
4308 (s[2] & 0xc0) != 0x80 ||
4309 ((unsigned char)s[0] == 0xE0 &&
4310 (unsigned char)s[1] < 0xA0) ||
4311 ((unsigned char)s[0] == 0xED &&
4312 (unsigned char)s[1] > 0x9F)) {
4313
4314 goto surrogateescape;
4315 }
4316 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
4317 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004318 *p++ = (wchar_t)ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004319 break;
4320
4321 case 4:
4322 if ((s[1] & 0xc0) != 0x80 ||
4323 (s[2] & 0xc0) != 0x80 ||
4324 (s[3] & 0xc0) != 0x80 ||
4325 ((unsigned char)s[0] == 0xF0 &&
4326 (unsigned char)s[1] < 0x90) ||
4327 ((unsigned char)s[0] == 0xF4 &&
4328 (unsigned char)s[1] > 0x8F)) {
4329 goto surrogateescape;
4330 }
4331 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
4332 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4333 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4334
4335#if SIZEOF_WCHAR_T == 4
4336 *p++ = (wchar_t)ch;
4337#else
4338 /* compute and append the two surrogates: */
4339
4340 /* translate from 10000..10FFFF to 0..FFFF */
4341 ch -= 0x10000;
4342
4343 /* high surrogate = top 10 bits added to D800 */
4344 *p++ = (wchar_t)(0xD800 + (ch >> 10));
4345
4346 /* low surrogate = bottom 10 bits added to DC00 */
4347 *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
4348#endif
4349 break;
4350 }
4351 s += n;
4352 continue;
4353
4354 surrogateescape:
4355 *p++ = 0xDC00 + ch;
4356 s++;
4357 }
4358 *p = L'\0';
4359 return unicode;
4360}
4361
4362#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004363
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004364/* Primary internal function which creates utf8 encoded bytes objects.
4365
4366 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004367 and allocate exactly as much space needed at the end. Else allocate the
4368 maximum possible needed (4 result bytes per Unicode character), and return
4369 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004370*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004371PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004372_PyUnicode_AsUTF8String(PyObject *obj, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004373{
Tim Peters602f7402002-04-27 18:03:26 +00004374#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00004375
Guido van Rossum98297ee2007-11-06 21:34:58 +00004376 Py_ssize_t i; /* index into s of next input byte */
4377 PyObject *result; /* result string object */
4378 char *p; /* next free byte in output buffer */
4379 Py_ssize_t nallocated; /* number of result bytes allocated */
4380 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00004381 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004382 PyObject *errorHandler = NULL;
4383 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004384 int kind;
4385 void *data;
4386 Py_ssize_t size;
4387 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
4388#if SIZEOF_WCHAR_T == 2
4389 Py_ssize_t wchar_offset = 0;
4390#endif
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004391
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004392 if (!PyUnicode_Check(unicode)) {
4393 PyErr_BadArgument();
4394 return NULL;
4395 }
4396
4397 if (PyUnicode_READY(unicode) == -1)
4398 return NULL;
4399
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004400 if (PyUnicode_UTF8(unicode))
4401 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4402 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004403
4404 kind = PyUnicode_KIND(unicode);
4405 data = PyUnicode_DATA(unicode);
4406 size = PyUnicode_GET_LENGTH(unicode);
4407
Tim Peters602f7402002-04-27 18:03:26 +00004408 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004409
Tim Peters602f7402002-04-27 18:03:26 +00004410 if (size <= MAX_SHORT_UNICHARS) {
4411 /* Write into the stack buffer; nallocated can't overflow.
4412 * At the end, we'll allocate exactly as much heap space as it
4413 * turns out we need.
4414 */
4415 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004416 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00004417 p = stackbuf;
4418 }
4419 else {
4420 /* Overallocate on the heap, and give the excess back at the end. */
4421 nallocated = size * 4;
4422 if (nallocated / 4 != size) /* overflow! */
4423 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00004424 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004425 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00004426 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00004427 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004428 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004429
Tim Peters602f7402002-04-27 18:03:26 +00004430 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004431 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004432
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004433 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00004434 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004435 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004436
Guido van Rossumd57fd912000-03-10 22:53:23 +00004437 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00004438 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00004439 *p++ = (char)(0xc0 | (ch >> 6));
4440 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00004441 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004442 Py_ssize_t newpos;
4443 PyObject *rep;
4444 Py_ssize_t repsize, k, startpos;
4445 startpos = i-1;
4446#if SIZEOF_WCHAR_T == 2
4447 startpos += wchar_offset;
Victor Stinner445a6232010-04-22 20:01:57 +00004448#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004449 rep = unicode_encode_call_errorhandler(
4450 errors, &errorHandler, "utf-8", "surrogates not allowed",
4451 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
4452 &exc, startpos, startpos+1, &newpos);
4453 if (!rep)
4454 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004455
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004456 if (PyBytes_Check(rep))
4457 repsize = PyBytes_GET_SIZE(rep);
4458 else
4459 repsize = PyUnicode_GET_SIZE(rep);
4460
4461 if (repsize > 4) {
4462 Py_ssize_t offset;
4463
4464 if (result == NULL)
4465 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004466 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004467 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004468
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004469 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4470 /* integer overflow */
4471 PyErr_NoMemory();
4472 goto error;
4473 }
4474 nallocated += repsize - 4;
4475 if (result != NULL) {
4476 if (_PyBytes_Resize(&result, nallocated) < 0)
4477 goto error;
4478 } else {
4479 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004480 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004481 goto error;
4482 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4483 }
4484 p = PyBytes_AS_STRING(result) + offset;
4485 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004486
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004487 if (PyBytes_Check(rep)) {
4488 char *prep = PyBytes_AS_STRING(rep);
4489 for(k = repsize; k > 0; k--)
4490 *p++ = *prep++;
4491 } else /* rep is unicode */ {
4492 const Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
4493 Py_UNICODE c;
4494
4495 for(k=0; k<repsize; k++) {
4496 c = prep[k];
4497 if (0x80 <= c) {
4498 raise_encode_exception(&exc, "utf-8",
4499 PyUnicode_AS_UNICODE(unicode),
4500 size, i-1, i,
4501 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004502 goto error;
4503 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004504 *p++ = (char)prep[k];
Victor Stinner31be90b2010-04-22 19:38:16 +00004505 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004506 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004507 Py_DECREF(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004508 } else if (ch < 0x10000) {
4509 *p++ = (char)(0xe0 | (ch >> 12));
4510 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4511 *p++ = (char)(0x80 | (ch & 0x3f));
4512 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00004513 /* Encode UCS4 Unicode ordinals */
4514 *p++ = (char)(0xf0 | (ch >> 18));
4515 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4516 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4517 *p++ = (char)(0x80 | (ch & 0x3f));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004518#if SIZEOF_WCHAR_T == 2
4519 wchar_offset++;
4520#endif
Tim Peters602f7402002-04-27 18:03:26 +00004521 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004522 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004523
Guido van Rossum98297ee2007-11-06 21:34:58 +00004524 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004525 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004526 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004527 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004528 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004529 }
4530 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004531 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004532 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004533 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004534 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004535 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004536
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004537 Py_XDECREF(errorHandler);
4538 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004539 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004540 error:
4541 Py_XDECREF(errorHandler);
4542 Py_XDECREF(exc);
4543 Py_XDECREF(result);
4544 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004545
Tim Peters602f7402002-04-27 18:03:26 +00004546#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004547}
4548
Alexander Belopolsky40018472011-02-26 01:02:56 +00004549PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004550PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4551 Py_ssize_t size,
4552 const char *errors)
4553{
4554 PyObject *v, *unicode;
4555
4556 unicode = PyUnicode_FromUnicode(s, size);
4557 if (unicode == NULL)
4558 return NULL;
4559 v = _PyUnicode_AsUTF8String(unicode, errors);
4560 Py_DECREF(unicode);
4561 return v;
4562}
4563
4564PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004565PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004566{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004567 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004568}
4569
Walter Dörwald41980ca2007-08-16 21:55:45 +00004570/* --- UTF-32 Codec ------------------------------------------------------- */
4571
4572PyObject *
4573PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004574 Py_ssize_t size,
4575 const char *errors,
4576 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004577{
4578 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4579}
4580
4581PyObject *
4582PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004583 Py_ssize_t size,
4584 const char *errors,
4585 int *byteorder,
4586 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004587{
4588 const char *starts = s;
4589 Py_ssize_t startinpos;
4590 Py_ssize_t endinpos;
4591 Py_ssize_t outpos;
4592 PyUnicodeObject *unicode;
4593 Py_UNICODE *p;
4594#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004595 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004596 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004597#else
4598 const int pairs = 0;
4599#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00004600 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004601 int bo = 0; /* assume native ordering by default */
4602 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004603 /* Offsets from q for retrieving bytes in the right order. */
4604#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4605 int iorder[] = {0, 1, 2, 3};
4606#else
4607 int iorder[] = {3, 2, 1, 0};
4608#endif
4609 PyObject *errorHandler = NULL;
4610 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004611
Walter Dörwald41980ca2007-08-16 21:55:45 +00004612 q = (unsigned char *)s;
4613 e = q + size;
4614
4615 if (byteorder)
4616 bo = *byteorder;
4617
4618 /* Check for BOM marks (U+FEFF) in the input and adjust current
4619 byte order setting accordingly. In native mode, the leading BOM
4620 mark is skipped, in all other modes, it is copied to the output
4621 stream as-is (giving a ZWNBSP character). */
4622 if (bo == 0) {
4623 if (size >= 4) {
4624 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004625 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004626#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004627 if (bom == 0x0000FEFF) {
4628 q += 4;
4629 bo = -1;
4630 }
4631 else if (bom == 0xFFFE0000) {
4632 q += 4;
4633 bo = 1;
4634 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004635#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004636 if (bom == 0x0000FEFF) {
4637 q += 4;
4638 bo = 1;
4639 }
4640 else if (bom == 0xFFFE0000) {
4641 q += 4;
4642 bo = -1;
4643 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004644#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004645 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004646 }
4647
4648 if (bo == -1) {
4649 /* force LE */
4650 iorder[0] = 0;
4651 iorder[1] = 1;
4652 iorder[2] = 2;
4653 iorder[3] = 3;
4654 }
4655 else if (bo == 1) {
4656 /* force BE */
4657 iorder[0] = 3;
4658 iorder[1] = 2;
4659 iorder[2] = 1;
4660 iorder[3] = 0;
4661 }
4662
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004663 /* On narrow builds we split characters outside the BMP into two
4664 codepoints => count how much extra space we need. */
4665#ifndef Py_UNICODE_WIDE
4666 for (qq = q; qq < e; qq += 4)
4667 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
4668 pairs++;
4669#endif
4670
4671 /* This might be one to much, because of a BOM */
4672 unicode = _PyUnicode_New((size+3)/4+pairs);
4673 if (!unicode)
4674 return NULL;
4675 if (size == 0)
4676 return (PyObject *)unicode;
4677
4678 /* Unpack UTF-32 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004679 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004680
Walter Dörwald41980ca2007-08-16 21:55:45 +00004681 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004682 Py_UCS4 ch;
4683 /* remaining bytes at the end? (size should be divisible by 4) */
4684 if (e-q<4) {
4685 if (consumed)
4686 break;
4687 errmsg = "truncated data";
4688 startinpos = ((const char *)q)-starts;
4689 endinpos = ((const char *)e)-starts;
4690 goto utf32Error;
4691 /* The remaining input chars are ignored if the callback
4692 chooses to skip the input */
4693 }
4694 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4695 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004696
Benjamin Peterson29060642009-01-31 22:14:21 +00004697 if (ch >= 0x110000)
4698 {
4699 errmsg = "codepoint not in range(0x110000)";
4700 startinpos = ((const char *)q)-starts;
4701 endinpos = startinpos+4;
4702 goto utf32Error;
4703 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004704#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004705 if (ch >= 0x10000)
4706 {
4707 *p++ = 0xD800 | ((ch-0x10000) >> 10);
4708 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
4709 }
4710 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00004711#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004712 *p++ = ch;
4713 q += 4;
4714 continue;
4715 utf32Error:
4716 outpos = p-PyUnicode_AS_UNICODE(unicode);
4717 if (unicode_decode_call_errorhandler(
4718 errors, &errorHandler,
4719 "utf32", errmsg,
4720 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
4721 &unicode, &outpos, &p))
4722 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004723 }
4724
4725 if (byteorder)
4726 *byteorder = bo;
4727
4728 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004729 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004730
4731 /* Adjust length */
Victor Stinnerfe226c02011-10-03 03:52:20 +02004732 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004733 goto onError;
4734
4735 Py_XDECREF(errorHandler);
4736 Py_XDECREF(exc);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004737 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004738 Py_DECREF(unicode);
4739 return NULL;
4740 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004741 return (PyObject *)unicode;
4742
Benjamin Peterson29060642009-01-31 22:14:21 +00004743 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00004744 Py_DECREF(unicode);
4745 Py_XDECREF(errorHandler);
4746 Py_XDECREF(exc);
4747 return NULL;
4748}
4749
4750PyObject *
4751PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004752 Py_ssize_t size,
4753 const char *errors,
4754 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004755{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004756 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004757 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004758 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004759#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004760 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004761#else
4762 const int pairs = 0;
4763#endif
4764 /* Offsets from p for storing byte pairs in the right order. */
4765#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4766 int iorder[] = {0, 1, 2, 3};
4767#else
4768 int iorder[] = {3, 2, 1, 0};
4769#endif
4770
Benjamin Peterson29060642009-01-31 22:14:21 +00004771#define STORECHAR(CH) \
4772 do { \
4773 p[iorder[3]] = ((CH) >> 24) & 0xff; \
4774 p[iorder[2]] = ((CH) >> 16) & 0xff; \
4775 p[iorder[1]] = ((CH) >> 8) & 0xff; \
4776 p[iorder[0]] = (CH) & 0xff; \
4777 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00004778 } while(0)
4779
4780 /* In narrow builds we can output surrogate pairs as one codepoint,
4781 so we need less space. */
4782#ifndef Py_UNICODE_WIDE
4783 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00004784 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
4785 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
4786 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004787#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004788 nsize = (size - pairs + (byteorder == 0));
4789 bytesize = nsize * 4;
4790 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004791 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004792 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004793 if (v == NULL)
4794 return NULL;
4795
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004796 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004797 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004798 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004799 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004800 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004801
4802 if (byteorder == -1) {
4803 /* force LE */
4804 iorder[0] = 0;
4805 iorder[1] = 1;
4806 iorder[2] = 2;
4807 iorder[3] = 3;
4808 }
4809 else if (byteorder == 1) {
4810 /* force BE */
4811 iorder[0] = 3;
4812 iorder[1] = 2;
4813 iorder[2] = 1;
4814 iorder[3] = 0;
4815 }
4816
4817 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004818 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004819#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004820 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
4821 Py_UCS4 ch2 = *s;
4822 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
4823 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
4824 s++;
4825 size--;
4826 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004827 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004828#endif
4829 STORECHAR(ch);
4830 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004831
4832 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004833 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004834#undef STORECHAR
4835}
4836
Alexander Belopolsky40018472011-02-26 01:02:56 +00004837PyObject *
4838PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004839{
4840 if (!PyUnicode_Check(unicode)) {
4841 PyErr_BadArgument();
4842 return NULL;
4843 }
4844 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004845 PyUnicode_GET_SIZE(unicode),
4846 NULL,
4847 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004848}
4849
Guido van Rossumd57fd912000-03-10 22:53:23 +00004850/* --- UTF-16 Codec ------------------------------------------------------- */
4851
Tim Peters772747b2001-08-09 22:21:55 +00004852PyObject *
4853PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004854 Py_ssize_t size,
4855 const char *errors,
4856 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004857{
Walter Dörwald69652032004-09-07 20:24:22 +00004858 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
4859}
4860
Antoine Pitrouab868312009-01-10 15:40:25 +00004861/* Two masks for fast checking of whether a C 'long' may contain
4862 UTF16-encoded surrogate characters. This is an efficient heuristic,
4863 assuming that non-surrogate characters with a code point >= 0x8000 are
4864 rare in most input.
4865 FAST_CHAR_MASK is used when the input is in native byte ordering,
4866 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00004867*/
Antoine Pitrouab868312009-01-10 15:40:25 +00004868#if (SIZEOF_LONG == 8)
4869# define FAST_CHAR_MASK 0x8000800080008000L
4870# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
4871#elif (SIZEOF_LONG == 4)
4872# define FAST_CHAR_MASK 0x80008000L
4873# define SWAPPED_FAST_CHAR_MASK 0x00800080L
4874#else
4875# error C 'long' size should be either 4 or 8!
4876#endif
4877
Walter Dörwald69652032004-09-07 20:24:22 +00004878PyObject *
4879PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004880 Py_ssize_t size,
4881 const char *errors,
4882 int *byteorder,
4883 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00004884{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004885 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004886 Py_ssize_t startinpos;
4887 Py_ssize_t endinpos;
4888 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004889 PyUnicodeObject *unicode;
4890 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004891 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00004892 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00004893 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004894 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00004895 /* Offsets from q for retrieving byte pairs in the right order. */
4896#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4897 int ihi = 1, ilo = 0;
4898#else
4899 int ihi = 0, ilo = 1;
4900#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004901 PyObject *errorHandler = NULL;
4902 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004903
4904 /* Note: size will always be longer than the resulting Unicode
4905 character count */
4906 unicode = _PyUnicode_New(size);
4907 if (!unicode)
4908 return NULL;
4909 if (size == 0)
4910 return (PyObject *)unicode;
4911
4912 /* Unpack UTF-16 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004913 p = PyUnicode_AS_UNICODE(unicode);
Tim Peters772747b2001-08-09 22:21:55 +00004914 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00004915 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004916
4917 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00004918 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004919
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004920 /* Check for BOM marks (U+FEFF) in the input and adjust current
4921 byte order setting accordingly. In native mode, the leading BOM
4922 mark is skipped, in all other modes, it is copied to the output
4923 stream as-is (giving a ZWNBSP character). */
4924 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00004925 if (size >= 2) {
4926 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004927#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004928 if (bom == 0xFEFF) {
4929 q += 2;
4930 bo = -1;
4931 }
4932 else if (bom == 0xFFFE) {
4933 q += 2;
4934 bo = 1;
4935 }
Tim Petersced69f82003-09-16 20:30:58 +00004936#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004937 if (bom == 0xFEFF) {
4938 q += 2;
4939 bo = 1;
4940 }
4941 else if (bom == 0xFFFE) {
4942 q += 2;
4943 bo = -1;
4944 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004945#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004946 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004947 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004948
Tim Peters772747b2001-08-09 22:21:55 +00004949 if (bo == -1) {
4950 /* force LE */
4951 ihi = 1;
4952 ilo = 0;
4953 }
4954 else if (bo == 1) {
4955 /* force BE */
4956 ihi = 0;
4957 ilo = 1;
4958 }
Antoine Pitrouab868312009-01-10 15:40:25 +00004959#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4960 native_ordering = ilo < ihi;
4961#else
4962 native_ordering = ilo > ihi;
4963#endif
Tim Peters772747b2001-08-09 22:21:55 +00004964
Antoine Pitrouab868312009-01-10 15:40:25 +00004965 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00004966 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004967 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00004968 /* First check for possible aligned read of a C 'long'. Unaligned
4969 reads are more expensive, better to defer to another iteration. */
4970 if (!((size_t) q & LONG_PTR_MASK)) {
4971 /* Fast path for runs of non-surrogate chars. */
4972 register const unsigned char *_q = q;
4973 Py_UNICODE *_p = p;
4974 if (native_ordering) {
4975 /* Native ordering is simple: as long as the input cannot
4976 possibly contain a surrogate char, do an unrolled copy
4977 of several 16-bit code points to the target object.
4978 The non-surrogate check is done on several input bytes
4979 at a time (as many as a C 'long' can contain). */
4980 while (_q < aligned_end) {
4981 unsigned long data = * (unsigned long *) _q;
4982 if (data & FAST_CHAR_MASK)
4983 break;
4984 _p[0] = ((unsigned short *) _q)[0];
4985 _p[1] = ((unsigned short *) _q)[1];
4986#if (SIZEOF_LONG == 8)
4987 _p[2] = ((unsigned short *) _q)[2];
4988 _p[3] = ((unsigned short *) _q)[3];
4989#endif
4990 _q += SIZEOF_LONG;
4991 _p += SIZEOF_LONG / 2;
4992 }
4993 }
4994 else {
4995 /* Byteswapped ordering is similar, but we must decompose
4996 the copy bytewise, and take care of zero'ing out the
4997 upper bytes if the target object is in 32-bit units
4998 (that is, in UCS-4 builds). */
4999 while (_q < aligned_end) {
5000 unsigned long data = * (unsigned long *) _q;
5001 if (data & SWAPPED_FAST_CHAR_MASK)
5002 break;
5003 /* Zero upper bytes in UCS-4 builds */
5004#if (Py_UNICODE_SIZE > 2)
5005 _p[0] = 0;
5006 _p[1] = 0;
5007#if (SIZEOF_LONG == 8)
5008 _p[2] = 0;
5009 _p[3] = 0;
5010#endif
5011#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00005012 /* Issue #4916; UCS-4 builds on big endian machines must
5013 fill the two last bytes of each 4-byte unit. */
5014#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
5015# define OFF 2
5016#else
5017# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00005018#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00005019 ((unsigned char *) _p)[OFF + 1] = _q[0];
5020 ((unsigned char *) _p)[OFF + 0] = _q[1];
5021 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
5022 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
5023#if (SIZEOF_LONG == 8)
5024 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
5025 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
5026 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
5027 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
5028#endif
5029#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00005030 _q += SIZEOF_LONG;
5031 _p += SIZEOF_LONG / 2;
5032 }
5033 }
5034 p = _p;
5035 q = _q;
5036 if (q >= e)
5037 break;
5038 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005039 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005040
Benjamin Peterson14339b62009-01-31 16:36:08 +00005041 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00005042
5043 if (ch < 0xD800 || ch > 0xDFFF) {
5044 *p++ = ch;
5045 continue;
5046 }
5047
5048 /* UTF-16 code pair: */
5049 if (q > e) {
5050 errmsg = "unexpected end of data";
5051 startinpos = (((const char *)q) - 2) - starts;
5052 endinpos = ((const char *)e) + 1 - starts;
5053 goto utf16Error;
5054 }
5055 if (0xD800 <= ch && ch <= 0xDBFF) {
5056 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
5057 q += 2;
5058 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00005059#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005060 *p++ = ch;
5061 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005062#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005063 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005064#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005065 continue;
5066 }
5067 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005068 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00005069 startinpos = (((const char *)q)-4)-starts;
5070 endinpos = startinpos+2;
5071 goto utf16Error;
5072 }
5073
Benjamin Peterson14339b62009-01-31 16:36:08 +00005074 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005075 errmsg = "illegal encoding";
5076 startinpos = (((const char *)q)-2)-starts;
5077 endinpos = startinpos+2;
5078 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005079
Benjamin Peterson29060642009-01-31 22:14:21 +00005080 utf16Error:
5081 outpos = p - PyUnicode_AS_UNICODE(unicode);
5082 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005083 errors,
5084 &errorHandler,
5085 "utf16", errmsg,
5086 &starts,
5087 (const char **)&e,
5088 &startinpos,
5089 &endinpos,
5090 &exc,
5091 (const char **)&q,
5092 &unicode,
5093 &outpos,
5094 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00005095 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005096 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005097 /* remaining byte at the end? (size should be even) */
5098 if (e == q) {
5099 if (!consumed) {
5100 errmsg = "truncated data";
5101 startinpos = ((const char *)q) - starts;
5102 endinpos = ((const char *)e) + 1 - starts;
5103 outpos = p - PyUnicode_AS_UNICODE(unicode);
5104 if (unicode_decode_call_errorhandler(
5105 errors,
5106 &errorHandler,
5107 "utf16", errmsg,
5108 &starts,
5109 (const char **)&e,
5110 &startinpos,
5111 &endinpos,
5112 &exc,
5113 (const char **)&q,
5114 &unicode,
5115 &outpos,
5116 &p))
5117 goto onError;
5118 /* The remaining input chars are ignored if the callback
5119 chooses to skip the input */
5120 }
5121 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005122
5123 if (byteorder)
5124 *byteorder = bo;
5125
Walter Dörwald69652032004-09-07 20:24:22 +00005126 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005127 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005128
Guido van Rossumd57fd912000-03-10 22:53:23 +00005129 /* Adjust length */
Victor Stinnerfe226c02011-10-03 03:52:20 +02005130 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005131 goto onError;
5132
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005133 Py_XDECREF(errorHandler);
5134 Py_XDECREF(exc);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005135 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005136 Py_DECREF(unicode);
5137 return NULL;
5138 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005139 return (PyObject *)unicode;
5140
Benjamin Peterson29060642009-01-31 22:14:21 +00005141 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005142 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005143 Py_XDECREF(errorHandler);
5144 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005145 return NULL;
5146}
5147
Antoine Pitrouab868312009-01-10 15:40:25 +00005148#undef FAST_CHAR_MASK
5149#undef SWAPPED_FAST_CHAR_MASK
5150
Tim Peters772747b2001-08-09 22:21:55 +00005151PyObject *
5152PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005153 Py_ssize_t size,
5154 const char *errors,
5155 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005156{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005157 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00005158 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005159 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005160#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005161 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005162#else
5163 const int pairs = 0;
5164#endif
Tim Peters772747b2001-08-09 22:21:55 +00005165 /* Offsets from p for storing byte pairs in the right order. */
5166#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5167 int ihi = 1, ilo = 0;
5168#else
5169 int ihi = 0, ilo = 1;
5170#endif
5171
Benjamin Peterson29060642009-01-31 22:14:21 +00005172#define STORECHAR(CH) \
5173 do { \
5174 p[ihi] = ((CH) >> 8) & 0xff; \
5175 p[ilo] = (CH) & 0xff; \
5176 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00005177 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005178
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005179#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005180 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00005181 if (s[i] >= 0x10000)
5182 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005183#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005184 /* 2 * (size + pairs + (byteorder == 0)) */
5185 if (size > PY_SSIZE_T_MAX ||
5186 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005187 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005188 nsize = size + pairs + (byteorder == 0);
5189 bytesize = nsize * 2;
5190 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005191 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005192 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005193 if (v == NULL)
5194 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005195
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005196 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005197 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005198 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005199 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005200 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005201
5202 if (byteorder == -1) {
5203 /* force LE */
5204 ihi = 1;
5205 ilo = 0;
5206 }
5207 else if (byteorder == 1) {
5208 /* force BE */
5209 ihi = 0;
5210 ilo = 1;
5211 }
5212
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005213 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005214 Py_UNICODE ch = *s++;
5215 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005216#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005217 if (ch >= 0x10000) {
5218 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
5219 ch = 0xD800 | ((ch-0x10000) >> 10);
5220 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005221#endif
Tim Peters772747b2001-08-09 22:21:55 +00005222 STORECHAR(ch);
5223 if (ch2)
5224 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005225 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005226
5227 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005228 return v;
Tim Peters772747b2001-08-09 22:21:55 +00005229#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005230}
5231
Alexander Belopolsky40018472011-02-26 01:02:56 +00005232PyObject *
5233PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005234{
5235 if (!PyUnicode_Check(unicode)) {
5236 PyErr_BadArgument();
5237 return NULL;
5238 }
5239 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005240 PyUnicode_GET_SIZE(unicode),
5241 NULL,
5242 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005243}
5244
5245/* --- Unicode Escape Codec ----------------------------------------------- */
5246
/* Helper function for PyUnicode_DecodeUnicodeEscape: determines whether
   the escaped input decodes to a pure ASCII string and, if so, how many
   characters the decoded result contains.

   s    -- input bytes (read only within [s, s + size))
   size -- number of bytes in s

   Returns the length of the buffer required to hold the decoded string,
   or -1 if:
     - size is negative,
     - the input contains a byte above 127,
     - the input ends directly after a backslash,
     - the input contains an octal, \x, \u, \U or \N escape (their result
       is not guaranteed to be ASCII).
 */
Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *p = (const unsigned char *)s;
    const unsigned char *end;
    Py_ssize_t length = 0;

    /* Reject a negative size before forming the end pointer; p + size
       would otherwise point in front of the buffer. */
    if (size < 0)
        return -1;
    end = p + size;

    for (; p < end; ++p) {
        if (*p > 127) {
            /* Non-ASCII */
            return -1;
        }
        else if (*p != '\\') {
            /* Normal character */
            ++length;
        }
        else {
            /* Backslash-escape, check next char */
            ++p;
            /* Escape sequence reaches till end of string or
               non-ASCII follow-up. */
            if (p >= end || *p > 127)
                return -1;
            switch (*p) {
            case '\n':
                /* backslash + \n result in zero characters */
                break;
            case '\\': case '\'': case '\"':
            case 'b': case 'f': case 't':
            case 'n': case 'r': case 'v': case 'a':
                /* two input bytes collapse into one character */
                ++length;
                break;
            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7':
            case 'x': case 'u': case 'U': case 'N':
                /* these do not guarantee ASCII characters */
                return -1;
            default:
                /* unknown escape: count the backslash + the other
                   character, both are kept in the output */
                length += 2;
                break;
            }
        }
    }
    return length;
}
5301
5302/* Similar to PyUnicode_WRITE but either write into wstr field
5303 or treat string as ASCII. */
5304#define WRITE_ASCII_OR_WSTR(kind, buf, index, value) \
5305 do { \
5306 if ((kind) != PyUnicode_WCHAR_KIND) \
5307 ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
5308 else \
5309 ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \
5310 } while (0)
5311
5312#define WRITE_WSTR(buf, index, value) \
5313 assert(kind == PyUnicode_WCHAR_KIND), \
5314 ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value)
5315
5316
Fredrik Lundh06d12682001-01-24 07:59:11 +00005317static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005318
Alexander Belopolsky40018472011-02-26 01:02:56 +00005319PyObject *
5320PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005321 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005322 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005323{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005324 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005325 Py_ssize_t startinpos;
5326 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005327 int j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005328 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005329 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005330 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005331 char* message;
5332 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005333 PyObject *errorHandler = NULL;
5334 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005335 Py_ssize_t ascii_length;
5336 Py_ssize_t i;
5337 int kind;
5338 void *data;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005339
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005340 ascii_length = length_of_escaped_ascii_string(s, size);
5341
5342 /* After length_of_escaped_ascii_string() there are two alternatives,
5343 either the string is pure ASCII with named escapes like \n, etc.
5344 and we determined it's exact size (common case)
5345 or it contains \x, \u, ... escape sequences. then we create a
5346 legacy wchar string and resize it at the end of this function. */
5347 if (ascii_length >= 0) {
5348 v = (PyUnicodeObject *)PyUnicode_New(ascii_length, 127);
5349 if (!v)
5350 goto onError;
5351 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
5352 kind = PyUnicode_1BYTE_KIND;
5353 data = PyUnicode_DATA(v);
5354 }
5355 else {
5356 /* Escaped strings will always be longer than the resulting
5357 Unicode string, so we start with size here and then reduce the
5358 length after conversion to the true value.
5359 (but if the error callback returns a long replacement string
5360 we'll have to allocate more space) */
5361 v = _PyUnicode_New(size);
5362 if (!v)
5363 goto onError;
5364 kind = PyUnicode_WCHAR_KIND;
5365 data = PyUnicode_AS_UNICODE(v);
5366 }
5367
Guido van Rossumd57fd912000-03-10 22:53:23 +00005368 if (size == 0)
5369 return (PyObject *)v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005370 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005371 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005372
Guido van Rossumd57fd912000-03-10 22:53:23 +00005373 while (s < end) {
5374 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005375 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005376 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005377
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005378 if (kind == PyUnicode_WCHAR_KIND) {
5379 assert(i < _PyUnicode_WSTR_LENGTH(v));
5380 }
5381 else {
5382 /* The only case in which i == ascii_length is a backslash
5383 followed by a newline. */
5384 assert(i <= ascii_length);
5385 }
5386
Guido van Rossumd57fd912000-03-10 22:53:23 +00005387 /* Non-escape characters are interpreted as Unicode ordinals */
5388 if (*s != '\\') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005389 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char) *s++);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005390 continue;
5391 }
5392
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005393 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005394 /* \ - Escapes */
5395 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005396 c = *s++;
5397 if (s > end)
5398 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005399
5400 if (kind == PyUnicode_WCHAR_KIND) {
5401 assert(i < _PyUnicode_WSTR_LENGTH(v));
5402 }
5403 else {
5404 /* The only case in which i == ascii_length is a backslash
5405 followed by a newline. */
5406 assert(i < ascii_length || (i == ascii_length && c == '\n'));
5407 }
5408
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005409 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005410
Benjamin Peterson29060642009-01-31 22:14:21 +00005411 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005412 case '\n': break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005413 case '\\': WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); break;
5414 case '\'': WRITE_ASCII_OR_WSTR(kind, data, i++, '\''); break;
5415 case '\"': WRITE_ASCII_OR_WSTR(kind, data, i++, '\"'); break;
5416 case 'b': WRITE_ASCII_OR_WSTR(kind, data, i++, '\b'); break;
5417 /* FF */
5418 case 'f': WRITE_ASCII_OR_WSTR(kind, data, i++, '\014'); break;
5419 case 't': WRITE_ASCII_OR_WSTR(kind, data, i++, '\t'); break;
5420 case 'n': WRITE_ASCII_OR_WSTR(kind, data, i++, '\n'); break;
5421 case 'r': WRITE_ASCII_OR_WSTR(kind, data, i++, '\r'); break;
5422 /* VT */
5423 case 'v': WRITE_ASCII_OR_WSTR(kind, data, i++, '\013'); break;
5424 /* BEL, not classic C */
5425 case 'a': WRITE_ASCII_OR_WSTR(kind, data, i++, '\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005426
Benjamin Peterson29060642009-01-31 22:14:21 +00005427 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005428 case '0': case '1': case '2': case '3':
5429 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005430 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005431 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005432 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005433 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005434 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005435 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005436 WRITE_WSTR(data, i++, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005437 break;
5438
Benjamin Peterson29060642009-01-31 22:14:21 +00005439 /* hex escapes */
5440 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005441 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005442 digits = 2;
5443 message = "truncated \\xXX escape";
5444 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005445
Benjamin Peterson29060642009-01-31 22:14:21 +00005446 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005447 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005448 digits = 4;
5449 message = "truncated \\uXXXX escape";
5450 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005451
Benjamin Peterson29060642009-01-31 22:14:21 +00005452 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005453 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005454 digits = 8;
5455 message = "truncated \\UXXXXXXXX escape";
5456 hexescape:
5457 chr = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005458 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005459 if (s+digits>end) {
5460 endinpos = size;
5461 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005462 errors, &errorHandler,
5463 "unicodeescape", "end of string in escape sequence",
5464 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005465 &v, &i, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005466 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005467 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005468 goto nextByte;
5469 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005470 for (j = 0; j < digits; ++j) {
5471 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005472 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005473 endinpos = (s+j+1)-starts;
5474 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005475 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005476 errors, &errorHandler,
5477 "unicodeescape", message,
5478 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005479 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005480 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005481 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005482 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005483 }
5484 chr = (chr<<4) & ~0xF;
5485 if (c >= '0' && c <= '9')
5486 chr += c - '0';
5487 else if (c >= 'a' && c <= 'f')
5488 chr += 10 + c - 'a';
5489 else
5490 chr += 10 + c - 'A';
5491 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005492 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005493 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005494 /* _decoding_error will have already written into the
5495 target buffer. */
5496 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005497 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005498 /* when we get here, chr is a 32-bit unicode character */
5499 if (chr <= 0xffff)
5500 /* UCS-2 character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005501 WRITE_WSTR(data, i++, chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005502 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005503 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00005504 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00005505#ifdef Py_UNICODE_WIDE
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005506 WRITE_WSTR(data, i++, chr);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005507#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00005508 chr -= 0x10000L;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005509 WRITE_WSTR(data, i++, 0xD800 + (Py_UNICODE) (chr >> 10));
5510 WRITE_WSTR(data, i++, 0xDC00 + (Py_UNICODE) (chr & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005511#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00005512 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005513 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005514 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005515 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005516 errors, &errorHandler,
5517 "unicodeescape", "illegal Unicode character",
5518 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005519 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005520 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005521 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005522 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005523 break;
5524
Benjamin Peterson29060642009-01-31 22:14:21 +00005525 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005526 case 'N':
5527 message = "malformed \\N character escape";
5528 if (ucnhash_CAPI == NULL) {
5529 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005530 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5531 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005532 if (ucnhash_CAPI == NULL)
5533 goto ucnhashError;
5534 }
5535 if (*s == '{') {
5536 const char *start = s+1;
5537 /* look for the closing brace */
5538 while (*s != '}' && s < end)
5539 s++;
5540 if (s > start && s < end && *s == '}') {
5541 /* found a name. look it up in the unicode database */
5542 message = "unknown Unicode character name";
5543 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005544 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
5545 &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005546 goto store;
5547 }
5548 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005549 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005550 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005551 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005552 errors, &errorHandler,
5553 "unicodeescape", message,
5554 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005555 &v, &i, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005556 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005557 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005558 break;
5559
5560 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005561 if (s > end) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005562 assert(kind == PyUnicode_WCHAR_KIND);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005563 message = "\\ at end of string";
5564 s--;
5565 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005566 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005567 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005568 errors, &errorHandler,
5569 "unicodeescape", message,
5570 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005571 &v, &i, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00005572 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005573 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005574 }
5575 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005576 WRITE_ASCII_OR_WSTR(kind, data, i++, '\\');
5577 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005578 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005579 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005580 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005581 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005582 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005583 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005584 /* Ensure the length prediction worked in case of ASCII strings */
5585 assert(kind == PyUnicode_WCHAR_KIND || i == ascii_length);
5586
Victor Stinnerfe226c02011-10-03 03:52:20 +02005587 if (kind == PyUnicode_WCHAR_KIND)
5588 {
5589 if (PyUnicode_Resize((PyObject**)&v, i) < 0)
5590 goto onError;
Victor Stinnerfe226c02011-10-03 03:52:20 +02005591 }
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005592 Py_XDECREF(errorHandler);
5593 Py_XDECREF(exc);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005594 if (_PyUnicode_READY_REPLACE(&v)) {
5595 Py_DECREF(v);
5596 return NULL;
5597 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005598 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00005599
Benjamin Peterson29060642009-01-31 22:14:21 +00005600 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005601 PyErr_SetString(
5602 PyExc_UnicodeError,
5603 "\\N escapes not supported (can't load unicodedata module)"
5604 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005605 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005606 Py_XDECREF(errorHandler);
5607 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005608 return NULL;
5609
Benjamin Peterson29060642009-01-31 22:14:21 +00005610 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005611 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005612 Py_XDECREF(errorHandler);
5613 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005614 return NULL;
5615}
5616
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005617#undef WRITE_ASCII_OR_WSTR
5618#undef WRITE_WSTR
5619
Guido van Rossumd57fd912000-03-10 22:53:23 +00005620/* Return a Unicode-Escape string version of the Unicode object.
5621
5622 If quotes is true, the string is enclosed in u"" or u'' quotes as
5623 appropriate.
5624
5625*/
5626
Walter Dörwald79e913e2007-05-12 11:08:06 +00005627static const char *hexdigits = "0123456789abcdef";
5628
Alexander Belopolsky40018472011-02-26 01:02:56 +00005629PyObject *
5630PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005631 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005632{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005633 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005634 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005635
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005636#ifdef Py_UNICODE_WIDE
5637 const Py_ssize_t expandsize = 10;
5638#else
5639 const Py_ssize_t expandsize = 6;
5640#endif
5641
Thomas Wouters89f507f2006-12-13 04:49:30 +00005642 /* XXX(nnorwitz): rather than over-allocating, it would be
5643 better to choose a different scheme. Perhaps scan the
5644 first N-chars of the string and allocate based on that size.
5645 */
5646 /* Initial allocation is based on the longest-possible unichr
5647 escape.
5648
5649 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5650 unichr, so in this case it's the longest unichr escape. In
5651 narrow (UTF-16) builds this is five chars per source unichr
5652 since there are two unichrs in the surrogate pair, so in narrow
5653 (UTF-16) builds it's not the longest unichr escape.
5654
5655 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5656 so in the narrow (UTF-16) build case it's the longest unichr
5657 escape.
5658 */
5659
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005660 if (size == 0)
5661 return PyBytes_FromStringAndSize(NULL, 0);
5662
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005663 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005664 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005665
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005666 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005667 2
5668 + expandsize*size
5669 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005670 if (repr == NULL)
5671 return NULL;
5672
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005673 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005674
Guido van Rossumd57fd912000-03-10 22:53:23 +00005675 while (size-- > 0) {
5676 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005677
Walter Dörwald79e913e2007-05-12 11:08:06 +00005678 /* Escape backslashes */
5679 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005680 *p++ = '\\';
5681 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005682 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005683 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005684
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00005685#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005686 /* Map 21-bit characters to '\U00xxxxxx' */
5687 else if (ch >= 0x10000) {
5688 *p++ = '\\';
5689 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005690 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
5691 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
5692 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
5693 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
5694 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
5695 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
5696 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
5697 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005698 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005699 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005700#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005701 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5702 else if (ch >= 0xD800 && ch < 0xDC00) {
5703 Py_UNICODE ch2;
5704 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00005705
Benjamin Peterson29060642009-01-31 22:14:21 +00005706 ch2 = *s++;
5707 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005708 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005709 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5710 *p++ = '\\';
5711 *p++ = 'U';
5712 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
5713 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
5714 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
5715 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
5716 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
5717 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
5718 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
5719 *p++ = hexdigits[ucs & 0x0000000F];
5720 continue;
5721 }
5722 /* Fall through: isolated surrogates are copied as-is */
5723 s--;
5724 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005725 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005726#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005727
Guido van Rossumd57fd912000-03-10 22:53:23 +00005728 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005729 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005730 *p++ = '\\';
5731 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005732 *p++ = hexdigits[(ch >> 12) & 0x000F];
5733 *p++ = hexdigits[(ch >> 8) & 0x000F];
5734 *p++ = hexdigits[(ch >> 4) & 0x000F];
5735 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005736 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005737
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005738 /* Map special whitespace to '\t', \n', '\r' */
5739 else if (ch == '\t') {
5740 *p++ = '\\';
5741 *p++ = 't';
5742 }
5743 else if (ch == '\n') {
5744 *p++ = '\\';
5745 *p++ = 'n';
5746 }
5747 else if (ch == '\r') {
5748 *p++ = '\\';
5749 *p++ = 'r';
5750 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005751
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005752 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005753 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005754 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005755 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005756 *p++ = hexdigits[(ch >> 4) & 0x000F];
5757 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005758 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005759
Guido van Rossumd57fd912000-03-10 22:53:23 +00005760 /* Copy everything else as-is */
5761 else
5762 *p++ = (char) ch;
5763 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005764
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005765 assert(p - PyBytes_AS_STRING(repr) > 0);
5766 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5767 return NULL;
5768 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005769}
5770
Alexander Belopolsky40018472011-02-26 01:02:56 +00005771PyObject *
5772PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005773{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005774 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005775 if (!PyUnicode_Check(unicode)) {
5776 PyErr_BadArgument();
5777 return NULL;
5778 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00005779 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5780 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005781 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005782}
5783
5784/* --- Raw Unicode Escape Codec ------------------------------------------- */
5785
Alexander Belopolsky40018472011-02-26 01:02:56 +00005786PyObject *
5787PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005788 Py_ssize_t size,
5789 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005790{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005791 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005792 Py_ssize_t startinpos;
5793 Py_ssize_t endinpos;
5794 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005795 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005796 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005797 const char *end;
5798 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005799 PyObject *errorHandler = NULL;
5800 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005801
Guido van Rossumd57fd912000-03-10 22:53:23 +00005802 /* Escaped strings will always be longer than the resulting
5803 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005804 length after conversion to the true value. (But decoding error
5805 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005806 v = _PyUnicode_New(size);
5807 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005808 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005809 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005810 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005811 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005812 end = s + size;
5813 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005814 unsigned char c;
5815 Py_UCS4 x;
5816 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005817 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005818
Benjamin Peterson29060642009-01-31 22:14:21 +00005819 /* Non-escape characters are interpreted as Unicode ordinals */
5820 if (*s != '\\') {
5821 *p++ = (unsigned char)*s++;
5822 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005823 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005824 startinpos = s-starts;
5825
5826 /* \u-escapes are only interpreted iff the number of leading
5827 backslashes if odd */
5828 bs = s;
5829 for (;s < end;) {
5830 if (*s != '\\')
5831 break;
5832 *p++ = (unsigned char)*s++;
5833 }
5834 if (((s - bs) & 1) == 0 ||
5835 s >= end ||
5836 (*s != 'u' && *s != 'U')) {
5837 continue;
5838 }
5839 p--;
5840 count = *s=='u' ? 4 : 8;
5841 s++;
5842
5843 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
5844 outpos = p-PyUnicode_AS_UNICODE(v);
5845 for (x = 0, i = 0; i < count; ++i, ++s) {
5846 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00005847 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005848 endinpos = s-starts;
5849 if (unicode_decode_call_errorhandler(
5850 errors, &errorHandler,
5851 "rawunicodeescape", "truncated \\uXXXX",
5852 &starts, &end, &startinpos, &endinpos, &exc, &s,
5853 &v, &outpos, &p))
5854 goto onError;
5855 goto nextByte;
5856 }
5857 x = (x<<4) & ~0xF;
5858 if (c >= '0' && c <= '9')
5859 x += c - '0';
5860 else if (c >= 'a' && c <= 'f')
5861 x += 10 + c - 'a';
5862 else
5863 x += 10 + c - 'A';
5864 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00005865 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00005866 /* UCS-2 character */
5867 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005868 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005869 /* UCS-4 character. Either store directly, or as
5870 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00005871#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005872 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005873#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005874 x -= 0x10000L;
5875 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
5876 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00005877#endif
5878 } else {
5879 endinpos = s-starts;
5880 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005881 if (unicode_decode_call_errorhandler(
5882 errors, &errorHandler,
5883 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00005884 &starts, &end, &startinpos, &endinpos, &exc, &s,
5885 &v, &outpos, &p))
5886 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005887 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005888 nextByte:
5889 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005890 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02005891 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005892 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005893 Py_XDECREF(errorHandler);
5894 Py_XDECREF(exc);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005895 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005896 Py_DECREF(v);
5897 return NULL;
5898 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005899 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00005900
Benjamin Peterson29060642009-01-31 22:14:21 +00005901 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005902 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005903 Py_XDECREF(errorHandler);
5904 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005905 return NULL;
5906}
5907
Alexander Belopolsky40018472011-02-26 01:02:56 +00005908PyObject *
5909PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005910 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005911{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005912 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005913 char *p;
5914 char *q;
5915
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005916#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005917 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005918#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005919 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005920#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00005921
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005922 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005923 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00005924
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005925 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005926 if (repr == NULL)
5927 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005928 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005929 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005930
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005931 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005932 while (size-- > 0) {
5933 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005934#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005935 /* Map 32-bit characters to '\Uxxxxxxxx' */
5936 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005937 *p++ = '\\';
5938 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00005939 *p++ = hexdigits[(ch >> 28) & 0xf];
5940 *p++ = hexdigits[(ch >> 24) & 0xf];
5941 *p++ = hexdigits[(ch >> 20) & 0xf];
5942 *p++ = hexdigits[(ch >> 16) & 0xf];
5943 *p++ = hexdigits[(ch >> 12) & 0xf];
5944 *p++ = hexdigits[(ch >> 8) & 0xf];
5945 *p++ = hexdigits[(ch >> 4) & 0xf];
5946 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00005947 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005948 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00005949#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005950 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5951 if (ch >= 0xD800 && ch < 0xDC00) {
5952 Py_UNICODE ch2;
5953 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005954
Benjamin Peterson29060642009-01-31 22:14:21 +00005955 ch2 = *s++;
5956 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005957 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005958 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5959 *p++ = '\\';
5960 *p++ = 'U';
5961 *p++ = hexdigits[(ucs >> 28) & 0xf];
5962 *p++ = hexdigits[(ucs >> 24) & 0xf];
5963 *p++ = hexdigits[(ucs >> 20) & 0xf];
5964 *p++ = hexdigits[(ucs >> 16) & 0xf];
5965 *p++ = hexdigits[(ucs >> 12) & 0xf];
5966 *p++ = hexdigits[(ucs >> 8) & 0xf];
5967 *p++ = hexdigits[(ucs >> 4) & 0xf];
5968 *p++ = hexdigits[ucs & 0xf];
5969 continue;
5970 }
5971 /* Fall through: isolated surrogates are copied as-is */
5972 s--;
5973 size++;
5974 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005975#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005976 /* Map 16-bit characters to '\uxxxx' */
5977 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005978 *p++ = '\\';
5979 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00005980 *p++ = hexdigits[(ch >> 12) & 0xf];
5981 *p++ = hexdigits[(ch >> 8) & 0xf];
5982 *p++ = hexdigits[(ch >> 4) & 0xf];
5983 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005984 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005985 /* Copy everything else as-is */
5986 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00005987 *p++ = (char) ch;
5988 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005989 size = p - q;
5990
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005991 assert(size > 0);
5992 if (_PyBytes_Resize(&repr, size) < 0)
5993 return NULL;
5994 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005995}
5996
Alexander Belopolsky40018472011-02-26 01:02:56 +00005997PyObject *
5998PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005999{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006000 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006001 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00006002 PyErr_BadArgument();
6003 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006004 }
Walter Dörwald711005d2007-05-12 12:03:26 +00006005 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
6006 PyUnicode_GET_SIZE(unicode));
6007
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006008 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006009}
6010
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006011/* --- Unicode Internal Codec ------------------------------------------- */
6012
Alexander Belopolsky40018472011-02-26 01:02:56 +00006013PyObject *
6014_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006015 Py_ssize_t size,
6016 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006017{
6018 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006019 Py_ssize_t startinpos;
6020 Py_ssize_t endinpos;
6021 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006022 PyUnicodeObject *v;
6023 Py_UNICODE *p;
6024 const char *end;
6025 const char *reason;
6026 PyObject *errorHandler = NULL;
6027 PyObject *exc = NULL;
6028
Neal Norwitzd43069c2006-01-08 01:12:10 +00006029#ifdef Py_UNICODE_WIDE
6030 Py_UNICODE unimax = PyUnicode_GetMax();
6031#endif
6032
Thomas Wouters89f507f2006-12-13 04:49:30 +00006033 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006034 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
6035 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006036 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006037 /* Intentionally PyUnicode_GET_SIZE instead of PyUnicode_GET_LENGTH
6038 as string was created with the old API. */
6039 if (PyUnicode_GET_SIZE(v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006040 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006041 p = PyUnicode_AS_UNICODE(v);
6042 end = s + size;
6043
6044 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006045 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006046 /* We have to sanity check the raw data, otherwise doom looms for
6047 some malformed UCS-4 data. */
6048 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006049#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006050 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006051#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006052 end-s < Py_UNICODE_SIZE
6053 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006054 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006055 startinpos = s - starts;
6056 if (end-s < Py_UNICODE_SIZE) {
6057 endinpos = end-starts;
6058 reason = "truncated input";
6059 }
6060 else {
6061 endinpos = s - starts + Py_UNICODE_SIZE;
6062 reason = "illegal code point (> 0x10FFFF)";
6063 }
6064 outpos = p - PyUnicode_AS_UNICODE(v);
6065 if (unicode_decode_call_errorhandler(
6066 errors, &errorHandler,
6067 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006068 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00006069 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006070 goto onError;
6071 }
6072 }
6073 else {
6074 p++;
6075 s += Py_UNICODE_SIZE;
6076 }
6077 }
6078
Victor Stinnerfe226c02011-10-03 03:52:20 +02006079 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006080 goto onError;
6081 Py_XDECREF(errorHandler);
6082 Py_XDECREF(exc);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006083 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006084 Py_DECREF(v);
6085 return NULL;
6086 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006087 return (PyObject *)v;
6088
Benjamin Peterson29060642009-01-31 22:14:21 +00006089 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006090 Py_XDECREF(v);
6091 Py_XDECREF(errorHandler);
6092 Py_XDECREF(exc);
6093 return NULL;
6094}
6095
Guido van Rossumd57fd912000-03-10 22:53:23 +00006096/* --- Latin-1 Codec ------------------------------------------------------ */
6097
Alexander Belopolsky40018472011-02-26 01:02:56 +00006098PyObject *
6099PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006100 Py_ssize_t size,
6101 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006102{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006103 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006104 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006105}
6106
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006107/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006108static void
6109make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006110 const char *encoding,
6111 const Py_UNICODE *unicode, Py_ssize_t size,
6112 Py_ssize_t startpos, Py_ssize_t endpos,
6113 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006114{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006115 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006116 *exceptionObject = PyUnicodeEncodeError_Create(
6117 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006118 }
6119 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006120 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6121 goto onError;
6122 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6123 goto onError;
6124 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6125 goto onError;
6126 return;
6127 onError:
6128 Py_DECREF(*exceptionObject);
6129 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006130 }
6131}
6132
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006133/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006134static void
6135raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006136 const char *encoding,
6137 const Py_UNICODE *unicode, Py_ssize_t size,
6138 Py_ssize_t startpos, Py_ssize_t endpos,
6139 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006140{
6141 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006142 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006143 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006144 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006145}
6146
6147/* error handling callback helper:
6148 build arguments, call the callback and check the arguments,
6149 put the result into newpos and return the replacement string, which
6150 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006151static PyObject *
6152unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006153 PyObject **errorHandler,
6154 const char *encoding, const char *reason,
6155 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
6156 Py_ssize_t startpos, Py_ssize_t endpos,
6157 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006158{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006159 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006160
6161 PyObject *restuple;
6162 PyObject *resunicode;
6163
6164 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006165 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006166 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006167 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006168 }
6169
6170 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006171 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006172 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006173 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006174
6175 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006176 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006177 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006178 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006179 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006180 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006181 Py_DECREF(restuple);
6182 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006183 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006184 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006185 &resunicode, newpos)) {
6186 Py_DECREF(restuple);
6187 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006188 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006189 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6190 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6191 Py_DECREF(restuple);
6192 return NULL;
6193 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006194 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006195 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006196 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006197 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6198 Py_DECREF(restuple);
6199 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006200 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006201 Py_INCREF(resunicode);
6202 Py_DECREF(restuple);
6203 return resunicode;
6204}
6205
Alexander Belopolsky40018472011-02-26 01:02:56 +00006206static PyObject *
6207unicode_encode_ucs1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006208 Py_ssize_t size,
6209 const char *errors,
6210 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006211{
6212 /* output object */
6213 PyObject *res;
6214 /* pointers to the beginning and end+1 of input */
6215 const Py_UNICODE *startp = p;
6216 const Py_UNICODE *endp = p + size;
6217 /* pointer to the beginning of the unencodable characters */
6218 /* const Py_UNICODE *badp = NULL; */
6219 /* pointer into the output */
6220 char *str;
6221 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006222 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006223 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6224 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006225 PyObject *errorHandler = NULL;
6226 PyObject *exc = NULL;
6227 /* the following variable is used for caching string comparisons
6228 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6229 int known_errorHandler = -1;
6230
6231 /* allocate enough for a simple encoding without
6232 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006233 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006234 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006235 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006236 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006237 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006238 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006239 ressize = size;
6240
6241 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006242 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006243
Benjamin Peterson29060642009-01-31 22:14:21 +00006244 /* can we encode this? */
6245 if (c<limit) {
6246 /* no overflow check, because we know that the space is enough */
6247 *str++ = (char)c;
6248 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006249 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006250 else {
6251 Py_ssize_t unicodepos = p-startp;
6252 Py_ssize_t requiredsize;
6253 PyObject *repunicode;
6254 Py_ssize_t repsize;
6255 Py_ssize_t newpos;
6256 Py_ssize_t respos;
6257 Py_UNICODE *uni2;
6258 /* startpos for collecting unencodable chars */
6259 const Py_UNICODE *collstart = p;
6260 const Py_UNICODE *collend = p;
6261 /* find all unecodable characters */
6262 while ((collend < endp) && ((*collend)>=limit))
6263 ++collend;
6264 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6265 if (known_errorHandler==-1) {
6266 if ((errors==NULL) || (!strcmp(errors, "strict")))
6267 known_errorHandler = 1;
6268 else if (!strcmp(errors, "replace"))
6269 known_errorHandler = 2;
6270 else if (!strcmp(errors, "ignore"))
6271 known_errorHandler = 3;
6272 else if (!strcmp(errors, "xmlcharrefreplace"))
6273 known_errorHandler = 4;
6274 else
6275 known_errorHandler = 0;
6276 }
6277 switch (known_errorHandler) {
6278 case 1: /* strict */
6279 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
6280 goto onError;
6281 case 2: /* replace */
6282 while (collstart++<collend)
6283 *str++ = '?'; /* fall through */
6284 case 3: /* ignore */
6285 p = collend;
6286 break;
6287 case 4: /* xmlcharrefreplace */
6288 respos = str - PyBytes_AS_STRING(res);
6289 /* determine replacement size (temporarily (mis)uses p) */
6290 for (p = collstart, repsize = 0; p < collend; ++p) {
6291 if (*p<10)
6292 repsize += 2+1+1;
6293 else if (*p<100)
6294 repsize += 2+2+1;
6295 else if (*p<1000)
6296 repsize += 2+3+1;
6297 else if (*p<10000)
6298 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006299#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006300 else
6301 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006302#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006303 else if (*p<100000)
6304 repsize += 2+5+1;
6305 else if (*p<1000000)
6306 repsize += 2+6+1;
6307 else
6308 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00006309#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00006310 }
6311 requiredsize = respos+repsize+(endp-collend);
6312 if (requiredsize > ressize) {
6313 if (requiredsize<2*ressize)
6314 requiredsize = 2*ressize;
6315 if (_PyBytes_Resize(&res, requiredsize))
6316 goto onError;
6317 str = PyBytes_AS_STRING(res) + respos;
6318 ressize = requiredsize;
6319 }
6320 /* generate replacement (temporarily (mis)uses p) */
6321 for (p = collstart; p < collend; ++p) {
6322 str += sprintf(str, "&#%d;", (int)*p);
6323 }
6324 p = collend;
6325 break;
6326 default:
6327 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6328 encoding, reason, startp, size, &exc,
6329 collstart-startp, collend-startp, &newpos);
6330 if (repunicode == NULL)
6331 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006332 if (PyBytes_Check(repunicode)) {
6333 /* Directly copy bytes result to output. */
6334 repsize = PyBytes_Size(repunicode);
6335 if (repsize > 1) {
6336 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006337 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006338 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6339 Py_DECREF(repunicode);
6340 goto onError;
6341 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006342 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006343 ressize += repsize-1;
6344 }
6345 memcpy(str, PyBytes_AsString(repunicode), repsize);
6346 str += repsize;
6347 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006348 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006349 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006350 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006351 /* need more space? (at least enough for what we
6352 have+the replacement+the rest of the string, so
6353 we won't have to check space for encodable characters) */
6354 respos = str - PyBytes_AS_STRING(res);
6355 repsize = PyUnicode_GET_SIZE(repunicode);
6356 requiredsize = respos+repsize+(endp-collend);
6357 if (requiredsize > ressize) {
6358 if (requiredsize<2*ressize)
6359 requiredsize = 2*ressize;
6360 if (_PyBytes_Resize(&res, requiredsize)) {
6361 Py_DECREF(repunicode);
6362 goto onError;
6363 }
6364 str = PyBytes_AS_STRING(res) + respos;
6365 ressize = requiredsize;
6366 }
6367 /* check if there is anything unencodable in the replacement
6368 and copy it to the output */
6369 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
6370 c = *uni2;
6371 if (c >= limit) {
6372 raise_encode_exception(&exc, encoding, startp, size,
6373 unicodepos, unicodepos+1, reason);
6374 Py_DECREF(repunicode);
6375 goto onError;
6376 }
6377 *str = (char)c;
6378 }
6379 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006380 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006381 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006382 }
6383 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006384 /* Resize if we allocated to much */
6385 size = str - PyBytes_AS_STRING(res);
6386 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006387 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006388 if (_PyBytes_Resize(&res, size) < 0)
6389 goto onError;
6390 }
6391
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006392 Py_XDECREF(errorHandler);
6393 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006394 return res;
6395
6396 onError:
6397 Py_XDECREF(res);
6398 Py_XDECREF(errorHandler);
6399 Py_XDECREF(exc);
6400 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006401}
6402
Alexander Belopolsky40018472011-02-26 01:02:56 +00006403PyObject *
6404PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006405 Py_ssize_t size,
6406 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006407{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006408 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006409}
6410
Alexander Belopolsky40018472011-02-26 01:02:56 +00006411PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006412_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006413{
6414 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006415 PyErr_BadArgument();
6416 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006417 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006418 if (PyUnicode_READY(unicode) == -1)
6419 return NULL;
6420 /* Fast path: if it is a one-byte string, construct
6421 bytes object directly. */
6422 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6423 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6424 PyUnicode_GET_LENGTH(unicode));
6425 /* Non-Latin-1 characters present. Defer to above function to
6426 raise the exception. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006427 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006428 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006429 errors);
6430}
6431
6432PyObject*
6433PyUnicode_AsLatin1String(PyObject *unicode)
6434{
6435 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006436}
6437
6438/* --- 7-bit ASCII Codec -------------------------------------------------- */
6439
Alexander Belopolsky40018472011-02-26 01:02:56 +00006440PyObject *
6441PyUnicode_DecodeASCII(const char *s,
6442 Py_ssize_t size,
6443 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006444{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006445 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006446 PyUnicodeObject *v;
6447 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006448 Py_ssize_t startinpos;
6449 Py_ssize_t endinpos;
6450 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006451 const char *e;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006452 unsigned char* d;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006453 PyObject *errorHandler = NULL;
6454 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006455 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00006456
Guido van Rossumd57fd912000-03-10 22:53:23 +00006457 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006458 if (size == 1 && *(unsigned char*)s < 128)
6459 return PyUnicode_FromOrdinal(*(unsigned char*)s);
6460
6461 /* Fast path. Assume the input actually *is* ASCII, and allocate
6462 a single-block Unicode object with that assumption. If there is
6463 an error, drop the object and start over. */
6464 v = (PyUnicodeObject*)PyUnicode_New(size, 127);
6465 if (v == NULL)
6466 goto onError;
6467 d = PyUnicode_1BYTE_DATA(v);
6468 for (i = 0; i < size; i++) {
6469 unsigned char ch = ((unsigned char*)s)[i];
6470 if (ch < 128)
6471 d[i] = ch;
6472 else
6473 break;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006474 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006475 if (i == size)
6476 return (PyObject*)v;
6477 Py_DECREF(v); /* start over */
Tim Petersced69f82003-09-16 20:30:58 +00006478
Guido van Rossumd57fd912000-03-10 22:53:23 +00006479 v = _PyUnicode_New(size);
6480 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006481 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006482 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006483 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006484 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006485 e = s + size;
6486 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006487 register unsigned char c = (unsigned char)*s;
6488 if (c < 128) {
6489 *p++ = c;
6490 ++s;
6491 }
6492 else {
6493 startinpos = s-starts;
6494 endinpos = startinpos + 1;
6495 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
6496 if (unicode_decode_call_errorhandler(
6497 errors, &errorHandler,
6498 "ascii", "ordinal not in range(128)",
6499 &starts, &e, &startinpos, &endinpos, &exc, &s,
6500 &v, &outpos, &p))
6501 goto onError;
6502 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006503 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00006504 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Victor Stinnerfe226c02011-10-03 03:52:20 +02006505 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006506 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006507 Py_XDECREF(errorHandler);
6508 Py_XDECREF(exc);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006509 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006510 Py_DECREF(v);
6511 return NULL;
6512 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006513 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006514
Benjamin Peterson29060642009-01-31 22:14:21 +00006515 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006516 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006517 Py_XDECREF(errorHandler);
6518 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006519 return NULL;
6520}
6521
Alexander Belopolsky40018472011-02-26 01:02:56 +00006522PyObject *
6523PyUnicode_EncodeASCII(const Py_UNICODE *p,
6524 Py_ssize_t size,
6525 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006526{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006527 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006528}
6529
Alexander Belopolsky40018472011-02-26 01:02:56 +00006530PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006531_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006532{
6533 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006534 PyErr_BadArgument();
6535 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006536 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006537 if (PyUnicode_READY(unicode) == -1)
6538 return NULL;
6539 /* Fast path: if it is an ASCII-only string, construct bytes object
6540 directly. Else defer to above function to raise the exception. */
6541 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6542 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6543 PyUnicode_GET_LENGTH(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006544 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006545 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006546 errors);
6547}
6548
6549PyObject *
6550PyUnicode_AsASCIIString(PyObject *unicode)
6551{
6552 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006553}
6554
Victor Stinner99b95382011-07-04 14:23:54 +02006555#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006556
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006557/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006558
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006559#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006560#define NEED_RETRY
6561#endif
6562
6563/* XXX This code is limited to "true" double-byte encodings, as
6564 a) it assumes an incomplete character consists of a single byte, and
6565 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00006566 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006567
/* Return non-zero when the byte at s[offset] is a DBCS lead byte that
   starts a character, as opposed to being the trail byte of the
   preceding one.  CharPrev() is used to find where the previous
   character starts. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
6579
6580/*
6581 * Decode MBCS string into unicode object. If 'final' is set, converts
6582 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
6583 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006584static int
6585decode_mbcs(PyUnicodeObject **v,
6586 const char *s, /* MBCS string */
6587 int size, /* sizeof MBCS string */
6588 int final,
6589 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006590{
6591 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00006592 Py_ssize_t n;
6593 DWORD usize;
6594 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006595
6596 assert(size >= 0);
6597
Victor Stinner554f3f02010-06-16 23:33:54 +00006598 /* check and handle 'errors' arg */
6599 if (errors==NULL || strcmp(errors, "strict")==0)
6600 flags = MB_ERR_INVALID_CHARS;
6601 else if (strcmp(errors, "ignore")==0)
6602 flags = 0;
6603 else {
6604 PyErr_Format(PyExc_ValueError,
6605 "mbcs encoding does not support errors='%s'",
6606 errors);
6607 return -1;
6608 }
6609
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006610 /* Skip trailing lead-byte unless 'final' is set */
6611 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006612 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006613
6614 /* First get the size of the result */
6615 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006616 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
6617 if (usize==0)
6618 goto mbcs_decode_error;
6619 } else
6620 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006621
6622 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006623 /* Create unicode object */
6624 *v = _PyUnicode_New(usize);
6625 if (*v == NULL)
6626 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006627 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006628 }
6629 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006630 /* Extend unicode object */
6631 n = PyUnicode_GET_SIZE(*v);
Victor Stinner2fd82272011-10-03 04:06:05 +02006632 if (PyUnicode_Resize((PyObject**)v, n + usize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006633 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006634 }
6635
6636 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00006637 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006638 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006639 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
6640 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00006641 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006642 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006643 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00006644
6645mbcs_decode_error:
6646 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
6647 we raise a UnicodeDecodeError - else it is a 'generic'
6648 windows error
6649 */
6650 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
6651 /* Ideally, we should get reason from FormatMessage - this
6652 is the Windows 2000 English version of the message
6653 */
6654 PyObject *exc = NULL;
6655 const char *reason = "No mapping for the Unicode character exists "
6656 "in the target multi-byte code page.";
6657 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
6658 if (exc != NULL) {
6659 PyCodec_StrictErrors(exc);
6660 Py_DECREF(exc);
6661 }
6662 } else {
6663 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6664 }
6665 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006666}
6667
Alexander Belopolsky40018472011-02-26 01:02:56 +00006668PyObject *
6669PyUnicode_DecodeMBCSStateful(const char *s,
6670 Py_ssize_t size,
6671 const char *errors,
6672 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006673{
6674 PyUnicodeObject *v = NULL;
6675 int done;
6676
6677 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006678 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006679
6680#ifdef NEED_RETRY
6681 retry:
6682 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006683 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006684 else
6685#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006686 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006687
6688 if (done < 0) {
6689 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006690 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006691 }
6692
6693 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006694 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006695
6696#ifdef NEED_RETRY
6697 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006698 s += done;
6699 size -= done;
6700 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006701 }
6702#endif
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006703 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006704 Py_DECREF(v);
6705 return NULL;
6706 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006707 return (PyObject *)v;
6708}
6709
Alexander Belopolsky40018472011-02-26 01:02:56 +00006710PyObject *
6711PyUnicode_DecodeMBCS(const char *s,
6712 Py_ssize_t size,
6713 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006714{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006715 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
6716}
6717
6718/*
6719 * Convert unicode into string object (MBCS).
6720 * Returns 0 if succeed, -1 otherwise.
6721 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006722static int
6723encode_mbcs(PyObject **repr,
6724 const Py_UNICODE *p, /* unicode */
6725 int size, /* size of unicode */
6726 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006727{
Victor Stinner554f3f02010-06-16 23:33:54 +00006728 BOOL usedDefaultChar = FALSE;
6729 BOOL *pusedDefaultChar;
6730 int mbcssize;
6731 Py_ssize_t n;
6732 PyObject *exc = NULL;
6733 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006734
6735 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006736
Victor Stinner554f3f02010-06-16 23:33:54 +00006737 /* check and handle 'errors' arg */
6738 if (errors==NULL || strcmp(errors, "strict")==0) {
6739 flags = WC_NO_BEST_FIT_CHARS;
6740 pusedDefaultChar = &usedDefaultChar;
6741 } else if (strcmp(errors, "replace")==0) {
6742 flags = 0;
6743 pusedDefaultChar = NULL;
6744 } else {
6745 PyErr_Format(PyExc_ValueError,
6746 "mbcs encoding does not support errors='%s'",
6747 errors);
6748 return -1;
6749 }
6750
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006751 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006752 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006753 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
6754 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00006755 if (mbcssize == 0) {
6756 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6757 return -1;
6758 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006759 /* If we used a default char, then we failed! */
6760 if (pusedDefaultChar && *pusedDefaultChar)
6761 goto mbcs_encode_error;
6762 } else {
6763 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006764 }
6765
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006766 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006767 /* Create string object */
6768 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
6769 if (*repr == NULL)
6770 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006771 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006772 }
6773 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006774 /* Extend string object */
6775 n = PyBytes_Size(*repr);
6776 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
6777 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006778 }
6779
6780 /* Do the conversion */
6781 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006782 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006783 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
6784 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006785 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6786 return -1;
6787 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006788 if (pusedDefaultChar && *pusedDefaultChar)
6789 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006790 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006791 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00006792
6793mbcs_encode_error:
6794 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
6795 Py_XDECREF(exc);
6796 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006797}
6798
Alexander Belopolsky40018472011-02-26 01:02:56 +00006799PyObject *
6800PyUnicode_EncodeMBCS(const Py_UNICODE *p,
6801 Py_ssize_t size,
6802 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006803{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006804 PyObject *repr = NULL;
6805 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00006806
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006807#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00006808 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006809 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006810 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006811 else
6812#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006813 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006814
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006815 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006816 Py_XDECREF(repr);
6817 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006818 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006819
6820#ifdef NEED_RETRY
6821 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006822 p += INT_MAX;
6823 size -= INT_MAX;
6824 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006825 }
6826#endif
6827
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006828 return repr;
6829}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006830
Alexander Belopolsky40018472011-02-26 01:02:56 +00006831PyObject *
6832PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006833{
6834 if (!PyUnicode_Check(unicode)) {
6835 PyErr_BadArgument();
6836 return NULL;
6837 }
6838 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006839 PyUnicode_GET_SIZE(unicode),
6840 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006841}
6842
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006843#undef NEED_RETRY
6844
Victor Stinner99b95382011-07-04 14:23:54 +02006845#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006846
Guido van Rossumd57fd912000-03-10 22:53:23 +00006847/* --- Character Mapping Codec -------------------------------------------- */
6848
Alexander Belopolsky40018472011-02-26 01:02:56 +00006849PyObject *
6850PyUnicode_DecodeCharmap(const char *s,
6851 Py_ssize_t size,
6852 PyObject *mapping,
6853 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006854{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006855 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006856 Py_ssize_t startinpos;
6857 Py_ssize_t endinpos;
6858 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006859 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006860 PyUnicodeObject *v;
6861 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006862 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006863 PyObject *errorHandler = NULL;
6864 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006865 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006866 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006867
Guido van Rossumd57fd912000-03-10 22:53:23 +00006868 /* Default to Latin-1 */
6869 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006870 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006871
6872 v = _PyUnicode_New(size);
6873 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006874 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006875 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006876 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006877 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006878 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006879 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006880 mapstring = PyUnicode_AS_UNICODE(mapping);
6881 maplen = PyUnicode_GET_SIZE(mapping);
6882 while (s < e) {
6883 unsigned char ch = *s;
6884 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006885
Benjamin Peterson29060642009-01-31 22:14:21 +00006886 if (ch < maplen)
6887 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006888
Benjamin Peterson29060642009-01-31 22:14:21 +00006889 if (x == 0xfffe) {
6890 /* undefined mapping */
6891 outpos = p-PyUnicode_AS_UNICODE(v);
6892 startinpos = s-starts;
6893 endinpos = startinpos+1;
6894 if (unicode_decode_call_errorhandler(
6895 errors, &errorHandler,
6896 "charmap", "character maps to <undefined>",
6897 &starts, &e, &startinpos, &endinpos, &exc, &s,
6898 &v, &outpos, &p)) {
6899 goto onError;
6900 }
6901 continue;
6902 }
6903 *p++ = x;
6904 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006905 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006906 }
6907 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006908 while (s < e) {
6909 unsigned char ch = *s;
6910 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006911
Benjamin Peterson29060642009-01-31 22:14:21 +00006912 /* Get mapping (char ordinal -> integer, Unicode char or None) */
6913 w = PyLong_FromLong((long)ch);
6914 if (w == NULL)
6915 goto onError;
6916 x = PyObject_GetItem(mapping, w);
6917 Py_DECREF(w);
6918 if (x == NULL) {
6919 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
6920 /* No mapping found means: mapping is undefined. */
6921 PyErr_Clear();
6922 x = Py_None;
6923 Py_INCREF(x);
6924 } else
6925 goto onError;
6926 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006927
Benjamin Peterson29060642009-01-31 22:14:21 +00006928 /* Apply mapping */
6929 if (PyLong_Check(x)) {
6930 long value = PyLong_AS_LONG(x);
6931 if (value < 0 || value > 65535) {
6932 PyErr_SetString(PyExc_TypeError,
6933 "character mapping must be in range(65536)");
6934 Py_DECREF(x);
6935 goto onError;
6936 }
6937 *p++ = (Py_UNICODE)value;
6938 }
6939 else if (x == Py_None) {
6940 /* undefined mapping */
6941 outpos = p-PyUnicode_AS_UNICODE(v);
6942 startinpos = s-starts;
6943 endinpos = startinpos+1;
6944 if (unicode_decode_call_errorhandler(
6945 errors, &errorHandler,
6946 "charmap", "character maps to <undefined>",
6947 &starts, &e, &startinpos, &endinpos, &exc, &s,
6948 &v, &outpos, &p)) {
6949 Py_DECREF(x);
6950 goto onError;
6951 }
6952 Py_DECREF(x);
6953 continue;
6954 }
6955 else if (PyUnicode_Check(x)) {
6956 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006957
Benjamin Peterson29060642009-01-31 22:14:21 +00006958 if (targetsize == 1)
6959 /* 1-1 mapping */
6960 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006961
Benjamin Peterson29060642009-01-31 22:14:21 +00006962 else if (targetsize > 1) {
6963 /* 1-n mapping */
6964 if (targetsize > extrachars) {
6965 /* resize first */
6966 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
6967 Py_ssize_t needed = (targetsize - extrachars) + \
6968 (targetsize << 2);
6969 extrachars += needed;
6970 /* XXX overflow detection missing */
Victor Stinnerfe226c02011-10-03 03:52:20 +02006971 if (PyUnicode_Resize((PyObject**)&v,
Benjamin Peterson29060642009-01-31 22:14:21 +00006972 PyUnicode_GET_SIZE(v) + needed) < 0) {
6973 Py_DECREF(x);
6974 goto onError;
6975 }
6976 p = PyUnicode_AS_UNICODE(v) + oldpos;
6977 }
6978 Py_UNICODE_COPY(p,
6979 PyUnicode_AS_UNICODE(x),
6980 targetsize);
6981 p += targetsize;
6982 extrachars -= targetsize;
6983 }
6984 /* 1-0 mapping: skip the character */
6985 }
6986 else {
6987 /* wrong return value */
6988 PyErr_SetString(PyExc_TypeError,
6989 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006990 Py_DECREF(x);
6991 goto onError;
6992 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006993 Py_DECREF(x);
6994 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006995 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006996 }
6997 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Victor Stinnerfe226c02011-10-03 03:52:20 +02006998 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006999 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007000 Py_XDECREF(errorHandler);
7001 Py_XDECREF(exc);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02007002 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007003 Py_DECREF(v);
7004 return NULL;
7005 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007006 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00007007
Benjamin Peterson29060642009-01-31 22:14:21 +00007008 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007009 Py_XDECREF(errorHandler);
7010 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007011 Py_XDECREF(v);
7012 return NULL;
7013}
7014
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007015/* Charmap encoding: the lookup table */
7016
Alexander Belopolsky40018472011-02-26 01:02:56 +00007017struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007018 PyObject_HEAD
7019 unsigned char level1[32];
7020 int count2, count3;
7021 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007022};
7023
7024static PyObject*
7025encoding_map_size(PyObject *obj, PyObject* args)
7026{
7027 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007028 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007029 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007030}
7031
7032static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007033 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007034 PyDoc_STR("Return the size (in bytes) of this object") },
7035 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007036};
7037
/* tp_dealloc slot of EncodingMapType: releases the object's memory with
   PyObject_FREE().  struct encoding_map contains only byte tables and
   counters, so there are no references to drop first. */
static void
encoding_map_dealloc(PyObject* o)
{
    PyObject_FREE(o);
}
7043
/* Type object for the charmap encoding trie (struct encoding_map) created
   by PyUnicode_BuildEncodingMap().  Apart from the name, basic size and
   default flags, only tp_dealloc and tp_methods are filled in.
   tp_itemsize is 0 although instances are allocated with extra trailing
   bytes; that extra space is managed by PyUnicode_BuildEncodingMap(). */
static PyTypeObject EncodingMapType = {
    PyVarObject_HEAD_INIT(NULL, 0)
    "EncodingMap",          /*tp_name*/
    sizeof(struct encoding_map),   /*tp_basicsize*/
    0,                      /*tp_itemsize*/
    /* methods */
    encoding_map_dealloc,   /*tp_dealloc*/
    0,                      /*tp_print*/
    0,                      /*tp_getattr*/
    0,                      /*tp_setattr*/
    0,                      /*tp_reserved*/
    0,                      /*tp_repr*/
    0,                      /*tp_as_number*/
    0,                      /*tp_as_sequence*/
    0,                      /*tp_as_mapping*/
    0,                      /*tp_hash*/
    0,                      /*tp_call*/
    0,                      /*tp_str*/
    0,                      /*tp_getattro*/
    0,                      /*tp_setattro*/
    0,                      /*tp_as_buffer*/
    Py_TPFLAGS_DEFAULT,     /*tp_flags*/
    0,                      /*tp_doc*/
    0,                      /*tp_traverse*/
    0,                      /*tp_clear*/
    0,                      /*tp_richcompare*/
    0,                      /*tp_weaklistoffset*/
    0,                      /*tp_iter*/
    0,                      /*tp_iternext*/
    encoding_map_methods,   /*tp_methods*/
    0,                      /*tp_members*/
    0,                      /*tp_getset*/
    0,                      /*tp_base*/
    0,                      /*tp_dict*/
    0,                      /*tp_descr_get*/
    0,                      /*tp_descr_set*/
    0,                      /*tp_dictoffset*/
    0,                      /*tp_init*/
    0,                      /*tp_alloc*/
    0,                      /*tp_new*/
    0,                      /*tp_free*/
    0,                      /*tp_is_gc*/
};
7087
7088PyObject*
7089PyUnicode_BuildEncodingMap(PyObject* string)
7090{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007091 PyObject *result;
7092 struct encoding_map *mresult;
7093 int i;
7094 int need_dict = 0;
7095 unsigned char level1[32];
7096 unsigned char level2[512];
7097 unsigned char *mlevel1, *mlevel2, *mlevel3;
7098 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007099 int kind;
7100 void *data;
7101 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007102
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007103 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007104 PyErr_BadArgument();
7105 return NULL;
7106 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007107 kind = PyUnicode_KIND(string);
7108 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007109 memset(level1, 0xFF, sizeof level1);
7110 memset(level2, 0xFF, sizeof level2);
7111
7112 /* If there isn't a one-to-one mapping of NULL to \0,
7113 or if there are non-BMP characters, we need to use
7114 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007115 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007116 need_dict = 1;
7117 for (i = 1; i < 256; i++) {
7118 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007119 ch = PyUnicode_READ(kind, data, i);
7120 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007121 need_dict = 1;
7122 break;
7123 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007124 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007125 /* unmapped character */
7126 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007127 l1 = ch >> 11;
7128 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007129 if (level1[l1] == 0xFF)
7130 level1[l1] = count2++;
7131 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007132 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007133 }
7134
7135 if (count2 >= 0xFF || count3 >= 0xFF)
7136 need_dict = 1;
7137
7138 if (need_dict) {
7139 PyObject *result = PyDict_New();
7140 PyObject *key, *value;
7141 if (!result)
7142 return NULL;
7143 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007144 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007145 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007146 if (!key || !value)
7147 goto failed1;
7148 if (PyDict_SetItem(result, key, value) == -1)
7149 goto failed1;
7150 Py_DECREF(key);
7151 Py_DECREF(value);
7152 }
7153 return result;
7154 failed1:
7155 Py_XDECREF(key);
7156 Py_XDECREF(value);
7157 Py_DECREF(result);
7158 return NULL;
7159 }
7160
7161 /* Create a three-level trie */
7162 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7163 16*count2 + 128*count3 - 1);
7164 if (!result)
7165 return PyErr_NoMemory();
7166 PyObject_Init(result, &EncodingMapType);
7167 mresult = (struct encoding_map*)result;
7168 mresult->count2 = count2;
7169 mresult->count3 = count3;
7170 mlevel1 = mresult->level1;
7171 mlevel2 = mresult->level23;
7172 mlevel3 = mresult->level23 + 16*count2;
7173 memcpy(mlevel1, level1, 32);
7174 memset(mlevel2, 0xFF, 16*count2);
7175 memset(mlevel3, 0, 128*count3);
7176 count3 = 0;
7177 for (i = 1; i < 256; i++) {
7178 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007179 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007180 /* unmapped character */
7181 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007182 o1 = PyUnicode_READ(kind, data, i)>>11;
7183 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007184 i2 = 16*mlevel1[o1] + o2;
7185 if (mlevel2[i2] == 0xFF)
7186 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007187 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007188 i3 = 128*mlevel2[i2] + o3;
7189 mlevel3[i3] = i;
7190 }
7191 return result;
7192}
7193
7194static int
7195encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
7196{
7197 struct encoding_map *map = (struct encoding_map*)mapping;
7198 int l1 = c>>11;
7199 int l2 = (c>>7) & 0xF;
7200 int l3 = c & 0x7F;
7201 int i;
7202
7203#ifdef Py_UNICODE_WIDE
7204 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007205 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007206 }
7207#endif
7208 if (c == 0)
7209 return 0;
7210 /* level 1*/
7211 i = map->level1[l1];
7212 if (i == 0xFF) {
7213 return -1;
7214 }
7215 /* level 2*/
7216 i = map->level23[16*i+l2];
7217 if (i == 0xFF) {
7218 return -1;
7219 }
7220 /* level 3 */
7221 i = map->level23[16*map->count2 + 128*i + l3];
7222 if (i == 0) {
7223 return -1;
7224 }
7225 return i;
7226}
7227
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007228/* Lookup the character ch in the mapping. If the character
7229 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007230 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007231static PyObject *
7232charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007233{
Christian Heimes217cfd12007-12-02 14:31:20 +00007234 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007235 PyObject *x;
7236
7237 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007238 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007239 x = PyObject_GetItem(mapping, w);
7240 Py_DECREF(w);
7241 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007242 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7243 /* No mapping found means: mapping is undefined. */
7244 PyErr_Clear();
7245 x = Py_None;
7246 Py_INCREF(x);
7247 return x;
7248 } else
7249 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007250 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007251 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007252 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007253 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007254 long value = PyLong_AS_LONG(x);
7255 if (value < 0 || value > 255) {
7256 PyErr_SetString(PyExc_TypeError,
7257 "character mapping must be in range(256)");
7258 Py_DECREF(x);
7259 return NULL;
7260 }
7261 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007262 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007263 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007264 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007265 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007266 /* wrong return value */
7267 PyErr_Format(PyExc_TypeError,
7268 "character mapping must return integer, bytes or None, not %.400s",
7269 x->ob_type->tp_name);
7270 Py_DECREF(x);
7271 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007272 }
7273}
7274
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007275static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00007276charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007277{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007278 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7279 /* exponentially overallocate to minimize reallocations */
7280 if (requiredsize < 2*outsize)
7281 requiredsize = 2*outsize;
7282 if (_PyBytes_Resize(outobj, requiredsize))
7283 return -1;
7284 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007285}
7286
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded into the output buffer */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* the lookup or the buffer resize reported an error */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007290/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00007291 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007292 space is available. Return a new reference to the object that
7293 was put in the output buffer, or Py_None, if the mapping was undefined
7294 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00007295 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007296static charmapencode_result
7297charmapencode_output(Py_UNICODE c, PyObject *mapping,
7298 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007299{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007300 PyObject *rep;
7301 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00007302 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007303
Christian Heimes90aa7642007-12-19 02:45:37 +00007304 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007305 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007306 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007307 if (res == -1)
7308 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00007309 if (outsize<requiredsize)
7310 if (charmapencode_resize(outobj, outpos, requiredsize))
7311 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00007312 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007313 outstart[(*outpos)++] = (char)res;
7314 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007315 }
7316
7317 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007318 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007319 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007320 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007321 Py_DECREF(rep);
7322 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007323 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007324 if (PyLong_Check(rep)) {
7325 Py_ssize_t requiredsize = *outpos+1;
7326 if (outsize<requiredsize)
7327 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7328 Py_DECREF(rep);
7329 return enc_EXCEPTION;
7330 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007331 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007332 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007333 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007334 else {
7335 const char *repchars = PyBytes_AS_STRING(rep);
7336 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
7337 Py_ssize_t requiredsize = *outpos+repsize;
7338 if (outsize<requiredsize)
7339 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7340 Py_DECREF(rep);
7341 return enc_EXCEPTION;
7342 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007343 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007344 memcpy(outstart + *outpos, repchars, repsize);
7345 *outpos += repsize;
7346 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007347 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007348 Py_DECREF(rep);
7349 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007350}
7351
7352/* handle an error in PyUnicode_EncodeCharmap
7353 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007354static int
7355charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00007356 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007357 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00007358 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00007359 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007360{
7361 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007362 Py_ssize_t repsize;
7363 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007364 Py_UNICODE *uni2;
7365 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007366 Py_ssize_t collstartpos = *inpos;
7367 Py_ssize_t collendpos = *inpos+1;
7368 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007369 char *encoding = "charmap";
7370 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007371 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007372
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007373 /* find all unencodable characters */
7374 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007375 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00007376 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007377 int res = encoding_map_lookup(p[collendpos], mapping);
7378 if (res != -1)
7379 break;
7380 ++collendpos;
7381 continue;
7382 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007383
Benjamin Peterson29060642009-01-31 22:14:21 +00007384 rep = charmapencode_lookup(p[collendpos], mapping);
7385 if (rep==NULL)
7386 return -1;
7387 else if (rep!=Py_None) {
7388 Py_DECREF(rep);
7389 break;
7390 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007391 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00007392 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007393 }
7394 /* cache callback name lookup
7395 * (if not done yet, i.e. it's the first error) */
7396 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007397 if ((errors==NULL) || (!strcmp(errors, "strict")))
7398 *known_errorHandler = 1;
7399 else if (!strcmp(errors, "replace"))
7400 *known_errorHandler = 2;
7401 else if (!strcmp(errors, "ignore"))
7402 *known_errorHandler = 3;
7403 else if (!strcmp(errors, "xmlcharrefreplace"))
7404 *known_errorHandler = 4;
7405 else
7406 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007407 }
7408 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007409 case 1: /* strict */
7410 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7411 return -1;
7412 case 2: /* replace */
7413 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007414 x = charmapencode_output('?', mapping, res, respos);
7415 if (x==enc_EXCEPTION) {
7416 return -1;
7417 }
7418 else if (x==enc_FAILED) {
7419 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7420 return -1;
7421 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007422 }
7423 /* fall through */
7424 case 3: /* ignore */
7425 *inpos = collendpos;
7426 break;
7427 case 4: /* xmlcharrefreplace */
7428 /* generate replacement (temporarily (mis)uses p) */
7429 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007430 char buffer[2+29+1+1];
7431 char *cp;
7432 sprintf(buffer, "&#%d;", (int)p[collpos]);
7433 for (cp = buffer; *cp; ++cp) {
7434 x = charmapencode_output(*cp, mapping, res, respos);
7435 if (x==enc_EXCEPTION)
7436 return -1;
7437 else if (x==enc_FAILED) {
7438 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7439 return -1;
7440 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007441 }
7442 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007443 *inpos = collendpos;
7444 break;
7445 default:
7446 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00007447 encoding, reason, p, size, exceptionObject,
7448 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007449 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007450 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00007451 if (PyBytes_Check(repunicode)) {
7452 /* Directly copy bytes result to output. */
7453 Py_ssize_t outsize = PyBytes_Size(*res);
7454 Py_ssize_t requiredsize;
7455 repsize = PyBytes_Size(repunicode);
7456 requiredsize = *respos + repsize;
7457 if (requiredsize > outsize)
7458 /* Make room for all additional bytes. */
7459 if (charmapencode_resize(res, respos, requiredsize)) {
7460 Py_DECREF(repunicode);
7461 return -1;
7462 }
7463 memcpy(PyBytes_AsString(*res) + *respos,
7464 PyBytes_AsString(repunicode), repsize);
7465 *respos += repsize;
7466 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007467 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00007468 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007469 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007470 /* generate replacement */
7471 repsize = PyUnicode_GET_SIZE(repunicode);
7472 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007473 x = charmapencode_output(*uni2, mapping, res, respos);
7474 if (x==enc_EXCEPTION) {
7475 return -1;
7476 }
7477 else if (x==enc_FAILED) {
7478 Py_DECREF(repunicode);
7479 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7480 return -1;
7481 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007482 }
7483 *inpos = newpos;
7484 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007485 }
7486 return 0;
7487}
7488
Alexander Belopolsky40018472011-02-26 01:02:56 +00007489PyObject *
7490PyUnicode_EncodeCharmap(const Py_UNICODE *p,
7491 Py_ssize_t size,
7492 PyObject *mapping,
7493 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007494{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007495 /* output object */
7496 PyObject *res = NULL;
7497 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007498 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007499 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007500 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007501 PyObject *errorHandler = NULL;
7502 PyObject *exc = NULL;
7503 /* the following variable is used for caching string comparisons
7504 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7505 * 3=ignore, 4=xmlcharrefreplace */
7506 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007507
7508 /* Default to Latin-1 */
7509 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007510 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007511
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007512 /* allocate enough for a simple encoding without
7513 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00007514 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007515 if (res == NULL)
7516 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00007517 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007518 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007519
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007520 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007521 /* try to encode it */
7522 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
7523 if (x==enc_EXCEPTION) /* error */
7524 goto onError;
7525 if (x==enc_FAILED) { /* unencodable character */
7526 if (charmap_encoding_error(p, size, &inpos, mapping,
7527 &exc,
7528 &known_errorHandler, &errorHandler, errors,
7529 &res, &respos)) {
7530 goto onError;
7531 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007532 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007533 else
7534 /* done with this character => adjust input position */
7535 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007536 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007537
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007538 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00007539 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007540 if (_PyBytes_Resize(&res, respos) < 0)
7541 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00007542
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007543 Py_XDECREF(exc);
7544 Py_XDECREF(errorHandler);
7545 return res;
7546
Benjamin Peterson29060642009-01-31 22:14:21 +00007547 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007548 Py_XDECREF(res);
7549 Py_XDECREF(exc);
7550 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007551 return NULL;
7552}
7553
Alexander Belopolsky40018472011-02-26 01:02:56 +00007554PyObject *
7555PyUnicode_AsCharmapString(PyObject *unicode,
7556 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007557{
7558 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007559 PyErr_BadArgument();
7560 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007561 }
7562 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00007563 PyUnicode_GET_SIZE(unicode),
7564 mapping,
7565 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007566}
7567
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007568/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007569static void
7570make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007571 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007572 Py_ssize_t startpos, Py_ssize_t endpos,
7573 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007574{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007575 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007576 *exceptionObject = _PyUnicodeTranslateError_Create(
7577 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007578 }
7579 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007580 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
7581 goto onError;
7582 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
7583 goto onError;
7584 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
7585 goto onError;
7586 return;
7587 onError:
7588 Py_DECREF(*exceptionObject);
7589 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007590 }
7591}
7592
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007593/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007594static void
7595raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007596 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007597 Py_ssize_t startpos, Py_ssize_t endpos,
7598 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007599{
7600 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007601 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007602 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007603 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007604}
7605
7606/* error handling callback helper:
7607 build arguments, call the callback and check the arguments,
7608 put the result into newpos and return the replacement string, which
7609 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007610static PyObject *
7611unicode_translate_call_errorhandler(const char *errors,
7612 PyObject **errorHandler,
7613 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007614 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007615 Py_ssize_t startpos, Py_ssize_t endpos,
7616 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007617{
Benjamin Peterson142957c2008-07-04 19:55:29 +00007618 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007619
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007620 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007621 PyObject *restuple;
7622 PyObject *resunicode;
7623
7624 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007625 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007626 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007627 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007628 }
7629
7630 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007631 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007632 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007633 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007634
7635 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00007636 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007637 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007638 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007639 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00007640 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00007641 Py_DECREF(restuple);
7642 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007643 }
7644 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00007645 &resunicode, &i_newpos)) {
7646 Py_DECREF(restuple);
7647 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007648 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007649 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007650 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007651 else
7652 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007653 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007654 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7655 Py_DECREF(restuple);
7656 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00007657 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007658 Py_INCREF(resunicode);
7659 Py_DECREF(restuple);
7660 return resunicode;
7661}
7662
7663/* Lookup the character ch in the mapping and put the result in result,
7664 which must be decrefed by the caller.
7665 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007666static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007667charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007668{
Christian Heimes217cfd12007-12-02 14:31:20 +00007669 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007670 PyObject *x;
7671
7672 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007673 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007674 x = PyObject_GetItem(mapping, w);
7675 Py_DECREF(w);
7676 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007677 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7678 /* No mapping found means: use 1:1 mapping. */
7679 PyErr_Clear();
7680 *result = NULL;
7681 return 0;
7682 } else
7683 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007684 }
7685 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007686 *result = x;
7687 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007688 }
Christian Heimes217cfd12007-12-02 14:31:20 +00007689 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007690 long value = PyLong_AS_LONG(x);
7691 long max = PyUnicode_GetMax();
7692 if (value < 0 || value > max) {
7693 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00007694 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00007695 Py_DECREF(x);
7696 return -1;
7697 }
7698 *result = x;
7699 return 0;
7700 }
7701 else if (PyUnicode_Check(x)) {
7702 *result = x;
7703 return 0;
7704 }
7705 else {
7706 /* wrong return value */
7707 PyErr_SetString(PyExc_TypeError,
7708 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007709 Py_DECREF(x);
7710 return -1;
7711 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007712}
7713/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00007714 if not reallocate and adjust various state variables.
7715 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007716static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007717charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00007718 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007719{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007720 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00007721 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007722 /* exponentially overallocate to minimize reallocations */
7723 if (requiredsize < 2 * oldsize)
7724 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007725 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
7726 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007727 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007728 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007729 }
7730 return 0;
7731}
7732/* lookup the character, put the result in the output string and adjust
7733 various state variables. Return a new reference to the object that
7734 was put in the output buffer in *result, or Py_None, if the mapping was
7735 undefined (in which case no character was written).
7736 The called must decref result.
7737 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007738static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007739charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
7740 PyObject *mapping, Py_UCS4 **output,
7741 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007742 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007743{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007744 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
7745 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00007746 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007747 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007748 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007749 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007750 }
7751 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007752 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00007753 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007754 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007755 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007756 }
7757 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007758 Py_ssize_t repsize;
7759 if (PyUnicode_READY(*res) == -1)
7760 return -1;
7761 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00007762 if (repsize==1) {
7763 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007764 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00007765 }
7766 else if (repsize!=0) {
7767 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007768 Py_ssize_t requiredsize = *opos +
7769 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00007770 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007771 Py_ssize_t i;
7772 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007773 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007774 for(i = 0; i < repsize; i++)
7775 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00007776 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007777 }
7778 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007779 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007780 return 0;
7781}
7782
Alexander Belopolsky40018472011-02-26 01:02:56 +00007783PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007784_PyUnicode_TranslateCharmap(PyObject *input,
7785 PyObject *mapping,
7786 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007787{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007788 /* input object */
7789 char *idata;
7790 Py_ssize_t size, i;
7791 int kind;
7792 /* output buffer */
7793 Py_UCS4 *output = NULL;
7794 Py_ssize_t osize;
7795 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007796 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007797 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007798 char *reason = "character maps to <undefined>";
7799 PyObject *errorHandler = NULL;
7800 PyObject *exc = NULL;
7801 /* the following variable is used for caching string comparisons
7802 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7803 * 3=ignore, 4=xmlcharrefreplace */
7804 int known_errorHandler = -1;
7805
Guido van Rossumd57fd912000-03-10 22:53:23 +00007806 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007807 PyErr_BadArgument();
7808 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007809 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007810
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007811 if (PyUnicode_READY(input) == -1)
7812 return NULL;
7813 idata = (char*)PyUnicode_DATA(input);
7814 kind = PyUnicode_KIND(input);
7815 size = PyUnicode_GET_LENGTH(input);
7816 i = 0;
7817
7818 if (size == 0) {
7819 Py_INCREF(input);
7820 return input;
7821 }
7822
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007823 /* allocate enough for a simple 1:1 translation without
7824 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007825 osize = size;
7826 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
7827 opos = 0;
7828 if (output == NULL) {
7829 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00007830 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007831 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007832
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007833 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007834 /* try to encode it */
7835 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007836 if (charmaptranslate_output(input, i, mapping,
7837 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007838 Py_XDECREF(x);
7839 goto onError;
7840 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007841 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00007842 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007843 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00007844 else { /* untranslatable character */
7845 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
7846 Py_ssize_t repsize;
7847 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007848 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00007849 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007850 Py_ssize_t collstart = i;
7851 Py_ssize_t collend = i+1;
7852 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007853
Benjamin Peterson29060642009-01-31 22:14:21 +00007854 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007855 while (collend < size) {
7856 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007857 goto onError;
7858 Py_XDECREF(x);
7859 if (x!=Py_None)
7860 break;
7861 ++collend;
7862 }
7863 /* cache callback name lookup
7864 * (if not done yet, i.e. it's the first error) */
7865 if (known_errorHandler==-1) {
7866 if ((errors==NULL) || (!strcmp(errors, "strict")))
7867 known_errorHandler = 1;
7868 else if (!strcmp(errors, "replace"))
7869 known_errorHandler = 2;
7870 else if (!strcmp(errors, "ignore"))
7871 known_errorHandler = 3;
7872 else if (!strcmp(errors, "xmlcharrefreplace"))
7873 known_errorHandler = 4;
7874 else
7875 known_errorHandler = 0;
7876 }
7877 switch (known_errorHandler) {
7878 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007879 raise_translate_exception(&exc, input, collstart,
7880 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007881 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007882 case 2: /* replace */
7883 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007884 for (coll = collstart; coll<collend; coll++)
7885 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00007886 /* fall through */
7887 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007888 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007889 break;
7890 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007891 /* generate replacement (temporarily (mis)uses i) */
7892 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007893 char buffer[2+29+1+1];
7894 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007895 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
7896 if (charmaptranslate_makespace(&output, &osize,
7897 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00007898 goto onError;
7899 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007900 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00007901 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007902 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007903 break;
7904 default:
7905 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007906 reason, input, &exc,
7907 collstart, collend, &newpos);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02007908 if (repunicode == NULL || _PyUnicode_READY_REPLACE(&repunicode))
Benjamin Peterson29060642009-01-31 22:14:21 +00007909 goto onError;
7910 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007911 repsize = PyUnicode_GET_LENGTH(repunicode);
7912 if (charmaptranslate_makespace(&output, &osize,
7913 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007914 Py_DECREF(repunicode);
7915 goto onError;
7916 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007917 for (uni2 = 0; repsize-->0; ++uni2)
7918 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
7919 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00007920 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007921 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007922 }
7923 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007924 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
7925 if (!res)
7926 goto onError;
7927 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007928 Py_XDECREF(exc);
7929 Py_XDECREF(errorHandler);
7930 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007931
Benjamin Peterson29060642009-01-31 22:14:21 +00007932 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007933 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007934 Py_XDECREF(exc);
7935 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007936 return NULL;
7937}
7938
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007939/* Deprecated. Use PyUnicode_Translate instead. */
7940PyObject *
7941PyUnicode_TranslateCharmap(const Py_UNICODE *p,
7942 Py_ssize_t size,
7943 PyObject *mapping,
7944 const char *errors)
7945{
7946 PyObject *unicode = PyUnicode_FromUnicode(p, size);
7947 if (!unicode)
7948 return NULL;
7949 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
7950}
7951
Alexander Belopolsky40018472011-02-26 01:02:56 +00007952PyObject *
7953PyUnicode_Translate(PyObject *str,
7954 PyObject *mapping,
7955 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007956{
7957 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007958
Guido van Rossumd57fd912000-03-10 22:53:23 +00007959 str = PyUnicode_FromObject(str);
7960 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007961 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007962 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007963 Py_DECREF(str);
7964 return result;
Tim Petersced69f82003-09-16 20:30:58 +00007965
Benjamin Peterson29060642009-01-31 22:14:21 +00007966 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007967 Py_XDECREF(str);
7968 return NULL;
7969}
Tim Petersced69f82003-09-16 20:30:58 +00007970
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007971static Py_UCS4
7972fix_decimal_and_space_to_ascii(PyUnicodeObject *self)
7973{
7974 /* No need to call PyUnicode_READY(self) because this function is only
7975 called as a callback from fixup() which does it already. */
7976 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
7977 const int kind = PyUnicode_KIND(self);
7978 void *data = PyUnicode_DATA(self);
7979 Py_UCS4 maxchar = 0, ch, fixed;
7980 Py_ssize_t i;
7981
7982 for (i = 0; i < len; ++i) {
7983 ch = PyUnicode_READ(kind, data, i);
7984 fixed = 0;
7985 if (ch > 127) {
7986 if (Py_UNICODE_ISSPACE(ch))
7987 fixed = ' ';
7988 else {
7989 const int decimal = Py_UNICODE_TODECIMAL(ch);
7990 if (decimal >= 0)
7991 fixed = '0' + decimal;
7992 }
7993 if (fixed != 0) {
7994 if (fixed > maxchar)
7995 maxchar = fixed;
7996 PyUnicode_WRITE(kind, data, i, fixed);
7997 }
7998 else if (ch > maxchar)
7999 maxchar = ch;
8000 }
8001 else if (ch > maxchar)
8002 maxchar = ch;
8003 }
8004
8005 return maxchar;
8006}
8007
8008PyObject *
8009_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8010{
8011 if (!PyUnicode_Check(unicode)) {
8012 PyErr_BadInternalCall();
8013 return NULL;
8014 }
8015 if (PyUnicode_READY(unicode) == -1)
8016 return NULL;
8017 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8018 /* If the string is already ASCII, just return the same string */
8019 Py_INCREF(unicode);
8020 return unicode;
8021 }
8022 return fixup((PyUnicodeObject *)unicode, fix_decimal_and_space_to_ascii);
8023}
8024
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008025PyObject *
8026PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8027 Py_ssize_t length)
8028{
8029 PyObject *result;
8030 Py_UNICODE *p; /* write pointer into result */
8031 Py_ssize_t i;
8032 /* Copy to a new string */
8033 result = (PyObject *)_PyUnicode_New(length);
8034 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
8035 if (result == NULL)
8036 return result;
8037 p = PyUnicode_AS_UNICODE(result);
8038 /* Iterate over code points */
8039 for (i = 0; i < length; i++) {
8040 Py_UNICODE ch =s[i];
8041 if (ch > 127) {
8042 int decimal = Py_UNICODE_TODECIMAL(ch);
8043 if (decimal >= 0)
8044 p[i] = '0' + decimal;
8045 }
8046 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008047 if (PyUnicode_READY((PyUnicodeObject*)result) == -1) {
8048 Py_DECREF(result);
8049 return NULL;
8050 }
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008051 return result;
8052}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008053/* --- Decimal Encoder ---------------------------------------------------- */
8054
Alexander Belopolsky40018472011-02-26 01:02:56 +00008055int
8056PyUnicode_EncodeDecimal(Py_UNICODE *s,
8057 Py_ssize_t length,
8058 char *output,
8059 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008060{
8061 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008062 PyObject *errorHandler = NULL;
8063 PyObject *exc = NULL;
8064 const char *encoding = "decimal";
8065 const char *reason = "invalid decimal Unicode string";
8066 /* the following variable is used for caching string comparisons
8067 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
8068 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008069
8070 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008071 PyErr_BadArgument();
8072 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008073 }
8074
8075 p = s;
8076 end = s + length;
8077 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008078 register Py_UNICODE ch = *p;
8079 int decimal;
8080 PyObject *repunicode;
8081 Py_ssize_t repsize;
8082 Py_ssize_t newpos;
8083 Py_UNICODE *uni2;
8084 Py_UNICODE *collstart;
8085 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00008086
Benjamin Peterson29060642009-01-31 22:14:21 +00008087 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008088 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00008089 ++p;
8090 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008091 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008092 decimal = Py_UNICODE_TODECIMAL(ch);
8093 if (decimal >= 0) {
8094 *output++ = '0' + decimal;
8095 ++p;
8096 continue;
8097 }
8098 if (0 < ch && ch < 256) {
8099 *output++ = (char)ch;
8100 ++p;
8101 continue;
8102 }
8103 /* All other characters are considered unencodable */
8104 collstart = p;
8105 collend = p+1;
8106 while (collend < end) {
8107 if ((0 < *collend && *collend < 256) ||
8108 !Py_UNICODE_ISSPACE(*collend) ||
8109 Py_UNICODE_TODECIMAL(*collend))
8110 break;
8111 }
8112 /* cache callback name lookup
8113 * (if not done yet, i.e. it's the first error) */
8114 if (known_errorHandler==-1) {
8115 if ((errors==NULL) || (!strcmp(errors, "strict")))
8116 known_errorHandler = 1;
8117 else if (!strcmp(errors, "replace"))
8118 known_errorHandler = 2;
8119 else if (!strcmp(errors, "ignore"))
8120 known_errorHandler = 3;
8121 else if (!strcmp(errors, "xmlcharrefreplace"))
8122 known_errorHandler = 4;
8123 else
8124 known_errorHandler = 0;
8125 }
8126 switch (known_errorHandler) {
8127 case 1: /* strict */
8128 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
8129 goto onError;
8130 case 2: /* replace */
8131 for (p = collstart; p < collend; ++p)
8132 *output++ = '?';
8133 /* fall through */
8134 case 3: /* ignore */
8135 p = collend;
8136 break;
8137 case 4: /* xmlcharrefreplace */
8138 /* generate replacement (temporarily (mis)uses p) */
8139 for (p = collstart; p < collend; ++p)
8140 output += sprintf(output, "&#%d;", (int)*p);
8141 p = collend;
8142 break;
8143 default:
8144 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
8145 encoding, reason, s, length, &exc,
8146 collstart-s, collend-s, &newpos);
8147 if (repunicode == NULL)
8148 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008149 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00008150 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008151 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
8152 Py_DECREF(repunicode);
8153 goto onError;
8154 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008155 /* generate replacement */
8156 repsize = PyUnicode_GET_SIZE(repunicode);
8157 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
8158 Py_UNICODE ch = *uni2;
8159 if (Py_UNICODE_ISSPACE(ch))
8160 *output++ = ' ';
8161 else {
8162 decimal = Py_UNICODE_TODECIMAL(ch);
8163 if (decimal >= 0)
8164 *output++ = '0' + decimal;
8165 else if (0 < ch && ch < 256)
8166 *output++ = (char)ch;
8167 else {
8168 Py_DECREF(repunicode);
8169 raise_encode_exception(&exc, encoding,
8170 s, length, collstart-s, collend-s, reason);
8171 goto onError;
8172 }
8173 }
8174 }
8175 p = s + newpos;
8176 Py_DECREF(repunicode);
8177 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00008178 }
8179 /* 0-terminate the output string */
8180 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008181 Py_XDECREF(exc);
8182 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008183 return 0;
8184
Benjamin Peterson29060642009-01-31 22:14:21 +00008185 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008186 Py_XDECREF(exc);
8187 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008188 return -1;
8189}
8190
Guido van Rossumd57fd912000-03-10 22:53:23 +00008191/* --- Helpers ------------------------------------------------------------ */
8192
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008193#include "stringlib/ucs1lib.h"
8194#include "stringlib/fastsearch.h"
8195#include "stringlib/partition.h"
8196#include "stringlib/split.h"
8197#include "stringlib/count.h"
8198#include "stringlib/find.h"
8199#include "stringlib/localeutil.h"
8200#include "stringlib/undef.h"
8201
8202#include "stringlib/ucs2lib.h"
8203#include "stringlib/fastsearch.h"
8204#include "stringlib/partition.h"
8205#include "stringlib/split.h"
8206#include "stringlib/count.h"
8207#include "stringlib/find.h"
8208#include "stringlib/localeutil.h"
8209#include "stringlib/undef.h"
8210
8211#include "stringlib/ucs4lib.h"
8212#include "stringlib/fastsearch.h"
8213#include "stringlib/partition.h"
8214#include "stringlib/split.h"
8215#include "stringlib/count.h"
8216#include "stringlib/find.h"
8217#include "stringlib/localeutil.h"
8218#include "stringlib/undef.h"
8219
8220static Py_ssize_t
8221any_find_slice(Py_ssize_t Py_LOCAL_CALLBACK(ucs1)(const Py_UCS1*, Py_ssize_t,
8222 const Py_UCS1*, Py_ssize_t,
8223 Py_ssize_t, Py_ssize_t),
8224 Py_ssize_t Py_LOCAL_CALLBACK(ucs2)(const Py_UCS2*, Py_ssize_t,
8225 const Py_UCS2*, Py_ssize_t,
8226 Py_ssize_t, Py_ssize_t),
8227 Py_ssize_t Py_LOCAL_CALLBACK(ucs4)(const Py_UCS4*, Py_ssize_t,
8228 const Py_UCS4*, Py_ssize_t,
8229 Py_ssize_t, Py_ssize_t),
8230 PyObject* s1, PyObject* s2,
8231 Py_ssize_t start,
8232 Py_ssize_t end)
8233{
8234 int kind1, kind2, kind;
8235 void *buf1, *buf2;
8236 Py_ssize_t len1, len2, result;
8237
8238 kind1 = PyUnicode_KIND(s1);
8239 kind2 = PyUnicode_KIND(s2);
8240 kind = kind1 > kind2 ? kind1 : kind2;
8241 buf1 = PyUnicode_DATA(s1);
8242 buf2 = PyUnicode_DATA(s2);
8243 if (kind1 != kind)
8244 buf1 = _PyUnicode_AsKind(s1, kind);
8245 if (!buf1)
8246 return -2;
8247 if (kind2 != kind)
8248 buf2 = _PyUnicode_AsKind(s2, kind);
8249 if (!buf2) {
8250 if (kind1 != kind) PyMem_Free(buf1);
8251 return -2;
8252 }
8253 len1 = PyUnicode_GET_LENGTH(s1);
8254 len2 = PyUnicode_GET_LENGTH(s2);
8255
8256 switch(kind) {
8257 case PyUnicode_1BYTE_KIND:
8258 result = ucs1(buf1, len1, buf2, len2, start, end);
8259 break;
8260 case PyUnicode_2BYTE_KIND:
8261 result = ucs2(buf1, len1, buf2, len2, start, end);
8262 break;
8263 case PyUnicode_4BYTE_KIND:
8264 result = ucs4(buf1, len1, buf2, len2, start, end);
8265 break;
8266 default:
8267 assert(0); result = -2;
8268 }
8269
8270 if (kind1 != kind)
8271 PyMem_Free(buf1);
8272 if (kind2 != kind)
8273 PyMem_Free(buf2);
8274
8275 return result;
8276}
8277
8278Py_ssize_t
8279_PyUnicode_InsertThousandsGrouping(int kind, void *data,
8280 Py_ssize_t n_buffer,
8281 void *digits, Py_ssize_t n_digits,
8282 Py_ssize_t min_width,
8283 const char *grouping,
8284 const char *thousands_sep)
8285{
8286 switch(kind) {
8287 case PyUnicode_1BYTE_KIND:
8288 return _PyUnicode_ucs1_InsertThousandsGrouping(
8289 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8290 min_width, grouping, thousands_sep);
8291 case PyUnicode_2BYTE_KIND:
8292 return _PyUnicode_ucs2_InsertThousandsGrouping(
8293 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
8294 min_width, grouping, thousands_sep);
8295 case PyUnicode_4BYTE_KIND:
8296 return _PyUnicode_ucs4_InsertThousandsGrouping(
8297 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
8298 min_width, grouping, thousands_sep);
8299 }
8300 assert(0);
8301 return -1;
8302}
8303
8304
Eric Smith8c663262007-08-25 02:26:07 +00008305#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00008306#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008307
Thomas Wouters477c8d52006-05-27 19:21:47 +00008308#include "stringlib/count.h"
8309#include "stringlib/find.h"
Eric Smith5807c412008-05-11 21:00:57 +00008310
/* Helper macro to fix up start/end slice values in place.

   `start` and `end` must be modifiable lvalues; `len` is the sequence
   length and is evaluated more than once, so it must be side-effect free.

     - end > len    -> end = len
     - end < 0      -> end += len, then clamped to 0
     - start < 0    -> start += len, then clamped to 0

   Note that `start` is NOT clamped to `len`; callers have to cope with
   start > end themselves.

   The body is wrapped in do { } while (0) so the macro behaves as a single
   statement (safe inside an unbraced if/else) and must be followed by a
   semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00008325
Alexander Belopolsky40018472011-02-26 01:02:56 +00008326Py_ssize_t
8327PyUnicode_Count(PyObject *str,
8328 PyObject *substr,
8329 Py_ssize_t start,
8330 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008331{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008332 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008333 PyUnicodeObject* str_obj;
8334 PyUnicodeObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008335 int kind1, kind2, kind;
8336 void *buf1 = NULL, *buf2 = NULL;
8337 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00008338
Thomas Wouters477c8d52006-05-27 19:21:47 +00008339 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008340 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008341 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008342 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
Victor Stinnere9a29352011-10-01 02:14:59 +02008343 if (!sub_obj || PyUnicode_READY(sub_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008344 Py_DECREF(str_obj);
8345 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008346 }
Tim Petersced69f82003-09-16 20:30:58 +00008347
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008348 kind1 = PyUnicode_KIND(str_obj);
8349 kind2 = PyUnicode_KIND(sub_obj);
8350 kind = kind1 > kind2 ? kind1 : kind2;
8351 buf1 = PyUnicode_DATA(str_obj);
8352 if (kind1 != kind)
8353 buf1 = _PyUnicode_AsKind((PyObject*)str_obj, kind);
8354 if (!buf1)
8355 goto onError;
8356 buf2 = PyUnicode_DATA(sub_obj);
8357 if (kind2 != kind)
8358 buf2 = _PyUnicode_AsKind((PyObject*)sub_obj, kind);
8359 if (!buf2)
8360 goto onError;
8361 len1 = PyUnicode_GET_LENGTH(str_obj);
8362 len2 = PyUnicode_GET_LENGTH(sub_obj);
8363
8364 ADJUST_INDICES(start, end, len1);
8365 switch(kind) {
8366 case PyUnicode_1BYTE_KIND:
8367 result = ucs1lib_count(
8368 ((Py_UCS1*)buf1) + start, end - start,
8369 buf2, len2, PY_SSIZE_T_MAX
8370 );
8371 break;
8372 case PyUnicode_2BYTE_KIND:
8373 result = ucs2lib_count(
8374 ((Py_UCS2*)buf1) + start, end - start,
8375 buf2, len2, PY_SSIZE_T_MAX
8376 );
8377 break;
8378 case PyUnicode_4BYTE_KIND:
8379 result = ucs4lib_count(
8380 ((Py_UCS4*)buf1) + start, end - start,
8381 buf2, len2, PY_SSIZE_T_MAX
8382 );
8383 break;
8384 default:
8385 assert(0); result = 0;
8386 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00008387
8388 Py_DECREF(sub_obj);
8389 Py_DECREF(str_obj);
8390
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008391 if (kind1 != kind)
8392 PyMem_Free(buf1);
8393 if (kind2 != kind)
8394 PyMem_Free(buf2);
8395
Guido van Rossumd57fd912000-03-10 22:53:23 +00008396 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008397 onError:
8398 Py_DECREF(sub_obj);
8399 Py_DECREF(str_obj);
8400 if (kind1 != kind && buf1)
8401 PyMem_Free(buf1);
8402 if (kind2 != kind && buf2)
8403 PyMem_Free(buf2);
8404 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008405}
8406
Alexander Belopolsky40018472011-02-26 01:02:56 +00008407Py_ssize_t
8408PyUnicode_Find(PyObject *str,
8409 PyObject *sub,
8410 Py_ssize_t start,
8411 Py_ssize_t end,
8412 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008413{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008414 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008415
Guido van Rossumd57fd912000-03-10 22:53:23 +00008416 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008417 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008418 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008419 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008420 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008421 Py_DECREF(str);
8422 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008423 }
Tim Petersced69f82003-09-16 20:30:58 +00008424
Thomas Wouters477c8d52006-05-27 19:21:47 +00008425 if (direction > 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008426 result = any_find_slice(
8427 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
8428 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008429 );
8430 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008431 result = any_find_slice(
8432 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
8433 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008434 );
8435
Guido van Rossumd57fd912000-03-10 22:53:23 +00008436 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008437 Py_DECREF(sub);
8438
Guido van Rossumd57fd912000-03-10 22:53:23 +00008439 return result;
8440}
8441
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008442Py_ssize_t
8443PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
8444 Py_ssize_t start, Py_ssize_t end,
8445 int direction)
8446{
8447 char *result;
8448 int kind;
8449 if (PyUnicode_READY(str) == -1)
8450 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02008451 if (start < 0 || end < 0) {
8452 PyErr_SetString(PyExc_IndexError, "string index out of range");
8453 return -2;
8454 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008455 if (end > PyUnicode_GET_LENGTH(str))
8456 end = PyUnicode_GET_LENGTH(str);
8457 kind = PyUnicode_KIND(str);
8458 result = findchar(PyUnicode_1BYTE_DATA(str)
8459 + PyUnicode_KIND_SIZE(kind, start),
8460 kind,
8461 end-start, ch, direction);
8462 if (!result)
8463 return -1;
8464 return (result-(char*)PyUnicode_DATA(str)) >> (kind-1);
8465}
8466
Alexander Belopolsky40018472011-02-26 01:02:56 +00008467static int
8468tailmatch(PyUnicodeObject *self,
8469 PyUnicodeObject *substring,
8470 Py_ssize_t start,
8471 Py_ssize_t end,
8472 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008473{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008474 int kind_self;
8475 int kind_sub;
8476 void *data_self;
8477 void *data_sub;
8478 Py_ssize_t offset;
8479 Py_ssize_t i;
8480 Py_ssize_t end_sub;
8481
8482 if (PyUnicode_READY(self) == -1 ||
8483 PyUnicode_READY(substring) == -1)
8484 return 0;
8485
8486 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008487 return 1;
8488
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008489 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
8490 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008491 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00008492 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008493
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008494 kind_self = PyUnicode_KIND(self);
8495 data_self = PyUnicode_DATA(self);
8496 kind_sub = PyUnicode_KIND(substring);
8497 data_sub = PyUnicode_DATA(substring);
8498 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
8499
8500 if (direction > 0)
8501 offset = end;
8502 else
8503 offset = start;
8504
8505 if (PyUnicode_READ(kind_self, data_self, offset) ==
8506 PyUnicode_READ(kind_sub, data_sub, 0) &&
8507 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
8508 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
8509 /* If both are of the same kind, memcmp is sufficient */
8510 if (kind_self == kind_sub) {
8511 return ! memcmp((char *)data_self +
8512 (offset * PyUnicode_CHARACTER_SIZE(substring)),
8513 data_sub,
8514 PyUnicode_GET_LENGTH(substring) *
8515 PyUnicode_CHARACTER_SIZE(substring));
8516 }
8517 /* otherwise we have to compare each character by first accesing it */
8518 else {
8519 /* We do not need to compare 0 and len(substring)-1 because
8520 the if statement above ensured already that they are equal
8521 when we end up here. */
8522 // TODO: honor direction and do a forward or backwards search
8523 for (i = 1; i < end_sub; ++i) {
8524 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
8525 PyUnicode_READ(kind_sub, data_sub, i))
8526 return 0;
8527 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008528 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008529 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008530 }
8531
8532 return 0;
8533}
8534
Alexander Belopolsky40018472011-02-26 01:02:56 +00008535Py_ssize_t
8536PyUnicode_Tailmatch(PyObject *str,
8537 PyObject *substr,
8538 Py_ssize_t start,
8539 Py_ssize_t end,
8540 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008541{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008542 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008543
Guido van Rossumd57fd912000-03-10 22:53:23 +00008544 str = PyUnicode_FromObject(str);
8545 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008546 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008547 substr = PyUnicode_FromObject(substr);
8548 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008549 Py_DECREF(str);
8550 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008551 }
Tim Petersced69f82003-09-16 20:30:58 +00008552
Guido van Rossumd57fd912000-03-10 22:53:23 +00008553 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00008554 (PyUnicodeObject *)substr,
8555 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008556 Py_DECREF(str);
8557 Py_DECREF(substr);
8558 return result;
8559}
8560
Guido van Rossumd57fd912000-03-10 22:53:23 +00008561/* Apply fixfct filter to the Unicode object self and return a
8562 reference to the modified object */
8563
Alexander Belopolsky40018472011-02-26 01:02:56 +00008564static PyObject *
8565fixup(PyUnicodeObject *self,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008566 Py_UCS4 (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008567{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008568 PyObject *u;
8569 Py_UCS4 maxchar_old, maxchar_new = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008570
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008571 if (PyUnicode_READY(self) == -1)
8572 return NULL;
8573 maxchar_old = PyUnicode_MAX_CHAR_VALUE(self);
8574 u = PyUnicode_New(PyUnicode_GET_LENGTH(self),
8575 maxchar_old);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008576 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008577 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008578
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008579 Py_MEMCPY(PyUnicode_1BYTE_DATA(u), PyUnicode_1BYTE_DATA(self),
8580 PyUnicode_GET_LENGTH(u) * PyUnicode_CHARACTER_SIZE(u));
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008581
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008582 /* fix functions return the new maximum character in a string,
8583 if the kind of the resulting unicode object does not change,
8584 everything is fine. Otherwise we need to change the string kind
8585 and re-run the fix function. */
8586 maxchar_new = fixfct((PyUnicodeObject*)u);
8587 if (maxchar_new == 0)
8588 /* do nothing, keep maxchar_new at 0 which means no changes. */;
8589 else if (maxchar_new <= 127)
8590 maxchar_new = 127;
8591 else if (maxchar_new <= 255)
8592 maxchar_new = 255;
8593 else if (maxchar_new <= 65535)
8594 maxchar_new = 65535;
8595 else
8596 maxchar_new = 1114111; /* 0x10ffff */
8597
8598 if (!maxchar_new && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008599 /* fixfct should return TRUE if it modified the buffer. If
8600 FALSE, return a reference to the original buffer instead
8601 (to save space, not time) */
8602 Py_INCREF(self);
8603 Py_DECREF(u);
8604 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008605 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008606 else if (maxchar_new == maxchar_old) {
8607 return u;
8608 }
8609 else {
8610 /* In case the maximum character changed, we need to
8611 convert the string to the new category. */
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008612 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008613 if (v == NULL) {
8614 Py_DECREF(u);
8615 return NULL;
8616 }
8617 if (maxchar_new > maxchar_old) {
8618 /* If the maxchar increased so that the kind changed, not all
8619 characters are representable anymore and we need to fix the
8620 string again. This only happens in very few cases. */
Victor Stinner157f83f2011-09-28 21:41:31 +02008621 if (PyUnicode_CopyCharacters(v, 0,
8622 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008623 PyUnicode_GET_LENGTH(self)) < 0)
8624 {
8625 Py_DECREF(u);
8626 return NULL;
8627 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008628 maxchar_old = fixfct((PyUnicodeObject*)v);
8629 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
8630 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008631 else {
Victor Stinner157f83f2011-09-28 21:41:31 +02008632 if (PyUnicode_CopyCharacters(v, 0,
8633 u, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008634 PyUnicode_GET_LENGTH(self)) < 0)
8635 {
8636 Py_DECREF(u);
8637 return NULL;
8638 }
8639 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008640
8641 Py_DECREF(u);
8642 return v;
8643 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008644}
8645
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008646static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008647fixupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008648{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008649 /* No need to call PyUnicode_READY(self) because this function is only
8650 called as a callback from fixup() which does it already. */
8651 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8652 const int kind = PyUnicode_KIND(self);
8653 void *data = PyUnicode_DATA(self);
8654 int touched = 0;
8655 Py_UCS4 maxchar = 0;
8656 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008657
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008658 for (i = 0; i < len; ++i) {
8659 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8660 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
8661 if (up != ch) {
8662 if (up > maxchar)
8663 maxchar = up;
8664 PyUnicode_WRITE(kind, data, i, up);
8665 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008666 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008667 else if (ch > maxchar)
8668 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008669 }
8670
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008671 if (touched)
8672 return maxchar;
8673 else
8674 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008675}
8676
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008677static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008678fixlower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008679{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008680 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8681 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8682 const int kind = PyUnicode_KIND(self);
8683 void *data = PyUnicode_DATA(self);
8684 int touched = 0;
8685 Py_UCS4 maxchar = 0;
8686 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008687
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008688 for(i = 0; i < len; ++i) {
8689 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8690 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8691 if (lo != ch) {
8692 if (lo > maxchar)
8693 maxchar = lo;
8694 PyUnicode_WRITE(kind, data, i, lo);
8695 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008696 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008697 else if (ch > maxchar)
8698 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008699 }
8700
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008701 if (touched)
8702 return maxchar;
8703 else
8704 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008705}
8706
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008707static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008708fixswapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008709{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008710 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8711 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8712 const int kind = PyUnicode_KIND(self);
8713 void *data = PyUnicode_DATA(self);
8714 int touched = 0;
8715 Py_UCS4 maxchar = 0;
8716 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008717
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008718 for(i = 0; i < len; ++i) {
8719 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8720 Py_UCS4 nu = 0;
8721
8722 if (Py_UNICODE_ISUPPER(ch))
8723 nu = Py_UNICODE_TOLOWER(ch);
8724 else if (Py_UNICODE_ISLOWER(ch))
8725 nu = Py_UNICODE_TOUPPER(ch);
8726
8727 if (nu != 0) {
8728 if (nu > maxchar)
8729 maxchar = nu;
8730 PyUnicode_WRITE(kind, data, i, nu);
8731 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008732 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008733 else if (ch > maxchar)
8734 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008735 }
8736
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008737 if (touched)
8738 return maxchar;
8739 else
8740 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008741}
8742
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008743static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008744fixcapitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008745{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008746 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8747 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8748 const int kind = PyUnicode_KIND(self);
8749 void *data = PyUnicode_DATA(self);
8750 int touched = 0;
8751 Py_UCS4 maxchar = 0;
8752 Py_ssize_t i = 0;
8753 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00008754
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008755 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008756 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008757
8758 ch = PyUnicode_READ(kind, data, i);
8759 if (!Py_UNICODE_ISUPPER(ch)) {
8760 maxchar = Py_UNICODE_TOUPPER(ch);
8761 PyUnicode_WRITE(kind, data, i, maxchar);
8762 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008763 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008764 ++i;
8765 for(; i < len; ++i) {
8766 ch = PyUnicode_READ(kind, data, i);
8767 if (!Py_UNICODE_ISLOWER(ch)) {
8768 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8769 if (lo > maxchar)
8770 maxchar = lo;
8771 PyUnicode_WRITE(kind, data, i, lo);
8772 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008773 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008774 else if (ch > maxchar)
8775 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008776 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008777
8778 if (touched)
8779 return maxchar;
8780 else
8781 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008782}
8783
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008784static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008785fixtitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008786{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008787 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8788 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8789 const int kind = PyUnicode_KIND(self);
8790 void *data = PyUnicode_DATA(self);
8791 Py_UCS4 maxchar = 0;
8792 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008793 int previous_is_cased;
8794
8795 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008796 if (len == 1) {
8797 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8798 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
8799 if (ti != ch) {
8800 PyUnicode_WRITE(kind, data, i, ti);
8801 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00008802 }
8803 else
8804 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008805 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008806 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008807 for(; i < len; ++i) {
8808 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8809 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00008810
Benjamin Peterson29060642009-01-31 22:14:21 +00008811 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008812 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00008813 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008814 nu = Py_UNICODE_TOTITLE(ch);
8815
8816 if (nu > maxchar)
8817 maxchar = nu;
8818 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00008819
Benjamin Peterson29060642009-01-31 22:14:21 +00008820 if (Py_UNICODE_ISLOWER(ch) ||
8821 Py_UNICODE_ISUPPER(ch) ||
8822 Py_UNICODE_ISTITLE(ch))
8823 previous_is_cased = 1;
8824 else
8825 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008826 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008827 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008828}
8829
Tim Peters8ce9f162004-08-27 01:49:32 +00008830PyObject *
8831PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008832{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008833 PyObject *sep = NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008834 Py_ssize_t seplen = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008835 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00008836 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008837 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
8838 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00008839 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008840 Py_ssize_t sz, i, res_offset;
8841 Py_UCS4 maxchar = 0;
8842 Py_UCS4 item_maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008843
Tim Peters05eba1f2004-08-27 21:32:02 +00008844 fseq = PySequence_Fast(seq, "");
8845 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008846 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00008847 }
8848
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008849 /* NOTE: the following code can't call back into Python code,
8850 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00008851 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008852
Tim Peters05eba1f2004-08-27 21:32:02 +00008853 seqlen = PySequence_Fast_GET_SIZE(fseq);
8854 /* If empty sequence, return u"". */
8855 if (seqlen == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008856 res = PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008857 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00008858 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008859 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00008860 /* If singleton sequence with an exact Unicode, return that. */
8861 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008862 item = items[0];
8863 if (PyUnicode_CheckExact(item)) {
8864 Py_INCREF(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008865 res = item;
Benjamin Peterson29060642009-01-31 22:14:21 +00008866 goto Done;
8867 }
Tim Peters8ce9f162004-08-27 01:49:32 +00008868 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008869 else {
8870 /* Set up sep and seplen */
8871 if (separator == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008872 /* fall back to a blank space separator */
8873 sep = PyUnicode_FromOrdinal(' ');
Victor Stinnere9a29352011-10-01 02:14:59 +02008874 if (!sep)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008875 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00008876 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008877 else {
8878 if (!PyUnicode_Check(separator)) {
8879 PyErr_Format(PyExc_TypeError,
8880 "separator: expected str instance,"
8881 " %.80s found",
8882 Py_TYPE(separator)->tp_name);
8883 goto onError;
8884 }
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02008885 if (PyUnicode_READY(separator))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008886 goto onError;
8887 sep = separator;
8888 seplen = PyUnicode_GET_LENGTH(separator);
8889 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
8890 /* inc refcount to keep this code path symetric with the
8891 above case of a blank separator */
8892 Py_INCREF(sep);
Tim Peters05eba1f2004-08-27 21:32:02 +00008893 }
8894 }
8895
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008896 /* There are at least two things to join, or else we have a subclass
8897 * of str in the sequence.
8898 * Do a pre-pass to figure out the total amount of space we'll
8899 * need (sz), and see whether all argument are strings.
8900 */
8901 sz = 0;
8902 for (i = 0; i < seqlen; i++) {
8903 const Py_ssize_t old_sz = sz;
8904 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00008905 if (!PyUnicode_Check(item)) {
8906 PyErr_Format(PyExc_TypeError,
8907 "sequence item %zd: expected str instance,"
8908 " %.80s found",
8909 i, Py_TYPE(item)->tp_name);
8910 goto onError;
8911 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008912 if (PyUnicode_READY(item) == -1)
8913 goto onError;
8914 sz += PyUnicode_GET_LENGTH(item);
8915 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
8916 if (item_maxchar > maxchar)
8917 maxchar = item_maxchar;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008918 if (i != 0)
8919 sz += seplen;
8920 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
8921 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008922 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008923 goto onError;
8924 }
8925 }
Tim Petersced69f82003-09-16 20:30:58 +00008926
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008927 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008928 if (res == NULL)
8929 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00008930
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008931 /* Catenate everything. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008932 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinner9ce5a832011-10-03 23:36:02 +02008933 Py_ssize_t itemlen, copied;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008934 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00008935 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02008936 if (i && seplen != 0) {
8937 copied = PyUnicode_CopyCharacters(res, res_offset,
8938 sep, 0, seplen);
8939 if (copied < 0)
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008940 goto onError;
Victor Stinner9ce5a832011-10-03 23:36:02 +02008941#ifdef Py_DEBUG
8942 res_offset += copied;
8943#else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008944 res_offset += seplen;
Victor Stinner9ce5a832011-10-03 23:36:02 +02008945#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00008946 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02008947 itemlen = PyUnicode_GET_LENGTH(item);
8948 if (itemlen != 0) {
8949 copied = PyUnicode_CopyCharacters(res, res_offset,
8950 item, 0, itemlen);
8951 if (copied < 0)
8952 goto onError;
8953#ifdef Py_DEBUG
8954 res_offset += copied;
8955#else
8956 res_offset += itemlen;
8957#endif
8958 }
Tim Peters05eba1f2004-08-27 21:32:02 +00008959 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008960 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00008961
Benjamin Peterson29060642009-01-31 22:14:21 +00008962 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00008963 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008964 Py_XDECREF(sep);
8965 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008966
Benjamin Peterson29060642009-01-31 22:14:21 +00008967 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00008968 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008969 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00008970 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008971 return NULL;
8972}
8973
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008974#define FILL(kind, data, value, start, length) \
8975 do { \
8976 Py_ssize_t i_ = 0; \
8977 assert(kind != PyUnicode_WCHAR_KIND); \
8978 switch ((kind)) { \
8979 case PyUnicode_1BYTE_KIND: { \
8980 unsigned char * to_ = (unsigned char *)((data)) + (start); \
8981 memset(to_, (unsigned char)value, length); \
8982 break; \
8983 } \
8984 case PyUnicode_2BYTE_KIND: { \
8985 Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
8986 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
8987 break; \
8988 } \
8989 default: { \
8990 Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
8991 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
8992 break; \
8993 } \
8994 } \
8995 } while (0)
8996
Alexander Belopolsky40018472011-02-26 01:02:56 +00008997static PyUnicodeObject *
8998pad(PyUnicodeObject *self,
8999 Py_ssize_t left,
9000 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009001 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009002{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009003 PyObject *u;
9004 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009005 int kind;
9006 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009007
9008 if (left < 0)
9009 left = 0;
9010 if (right < 0)
9011 right = 0;
9012
Tim Peters7a29bd52001-09-12 03:03:31 +00009013 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009014 Py_INCREF(self);
9015 return self;
9016 }
9017
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009018 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9019 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009020 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9021 return NULL;
9022 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009023 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9024 if (fill > maxchar)
9025 maxchar = fill;
9026 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009027 if (!u)
9028 return NULL;
9029
9030 kind = PyUnicode_KIND(u);
9031 data = PyUnicode_DATA(u);
9032 if (left)
9033 FILL(kind, data, fill, 0, left);
9034 if (right)
9035 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinner157f83f2011-09-28 21:41:31 +02009036 if (PyUnicode_CopyCharacters(u, left,
9037 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009038 _PyUnicode_LENGTH(self)) < 0)
9039 {
9040 Py_DECREF(u);
9041 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009042 }
9043
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009044 return (PyUnicodeObject*)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009045}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009046#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00009047
Alexander Belopolsky40018472011-02-26 01:02:56 +00009048PyObject *
9049PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009050{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009051 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009052
9053 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009054 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009055 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009056
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009057 switch(PyUnicode_KIND(string)) {
9058 case PyUnicode_1BYTE_KIND:
9059 list = ucs1lib_splitlines(
9060 (PyObject*) string, PyUnicode_1BYTE_DATA(string),
9061 PyUnicode_GET_LENGTH(string), keepends);
9062 break;
9063 case PyUnicode_2BYTE_KIND:
9064 list = ucs2lib_splitlines(
9065 (PyObject*) string, PyUnicode_2BYTE_DATA(string),
9066 PyUnicode_GET_LENGTH(string), keepends);
9067 break;
9068 case PyUnicode_4BYTE_KIND:
9069 list = ucs4lib_splitlines(
9070 (PyObject*) string, PyUnicode_4BYTE_DATA(string),
9071 PyUnicode_GET_LENGTH(string), keepends);
9072 break;
9073 default:
9074 assert(0);
9075 list = 0;
9076 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009077 Py_DECREF(string);
9078 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009079}
9080
Alexander Belopolsky40018472011-02-26 01:02:56 +00009081static PyObject *
9082split(PyUnicodeObject *self,
9083 PyUnicodeObject *substring,
9084 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009085{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009086 int kind1, kind2, kind;
9087 void *buf1, *buf2;
9088 Py_ssize_t len1, len2;
9089 PyObject* out;
9090
Guido van Rossumd57fd912000-03-10 22:53:23 +00009091 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009092 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009093
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009094 if (PyUnicode_READY(self) == -1)
9095 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009096
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009097 if (substring == NULL)
9098 switch(PyUnicode_KIND(self)) {
9099 case PyUnicode_1BYTE_KIND:
9100 return ucs1lib_split_whitespace(
9101 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9102 PyUnicode_GET_LENGTH(self), maxcount
9103 );
9104 case PyUnicode_2BYTE_KIND:
9105 return ucs2lib_split_whitespace(
9106 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
9107 PyUnicode_GET_LENGTH(self), maxcount
9108 );
9109 case PyUnicode_4BYTE_KIND:
9110 return ucs4lib_split_whitespace(
9111 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
9112 PyUnicode_GET_LENGTH(self), maxcount
9113 );
9114 default:
9115 assert(0);
9116 return NULL;
9117 }
9118
9119 if (PyUnicode_READY(substring) == -1)
9120 return NULL;
9121
9122 kind1 = PyUnicode_KIND(self);
9123 kind2 = PyUnicode_KIND(substring);
9124 kind = kind1 > kind2 ? kind1 : kind2;
9125 buf1 = PyUnicode_DATA(self);
9126 buf2 = PyUnicode_DATA(substring);
9127 if (kind1 != kind)
9128 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9129 if (!buf1)
9130 return NULL;
9131 if (kind2 != kind)
9132 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9133 if (!buf2) {
9134 if (kind1 != kind) PyMem_Free(buf1);
9135 return NULL;
9136 }
9137 len1 = PyUnicode_GET_LENGTH(self);
9138 len2 = PyUnicode_GET_LENGTH(substring);
9139
9140 switch(kind) {
9141 case PyUnicode_1BYTE_KIND:
9142 out = ucs1lib_split(
9143 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9144 break;
9145 case PyUnicode_2BYTE_KIND:
9146 out = ucs2lib_split(
9147 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9148 break;
9149 case PyUnicode_4BYTE_KIND:
9150 out = ucs4lib_split(
9151 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9152 break;
9153 default:
9154 out = NULL;
9155 }
9156 if (kind1 != kind)
9157 PyMem_Free(buf1);
9158 if (kind2 != kind)
9159 PyMem_Free(buf2);
9160 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009161}
9162
Alexander Belopolsky40018472011-02-26 01:02:56 +00009163static PyObject *
9164rsplit(PyUnicodeObject *self,
9165 PyUnicodeObject *substring,
9166 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009167{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009168 int kind1, kind2, kind;
9169 void *buf1, *buf2;
9170 Py_ssize_t len1, len2;
9171 PyObject* out;
9172
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009173 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009174 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009175
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009176 if (PyUnicode_READY(self) == -1)
9177 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009178
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009179 if (substring == NULL)
9180 switch(PyUnicode_KIND(self)) {
9181 case PyUnicode_1BYTE_KIND:
9182 return ucs1lib_rsplit_whitespace(
9183 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9184 PyUnicode_GET_LENGTH(self), maxcount
9185 );
9186 case PyUnicode_2BYTE_KIND:
9187 return ucs2lib_rsplit_whitespace(
9188 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
9189 PyUnicode_GET_LENGTH(self), maxcount
9190 );
9191 case PyUnicode_4BYTE_KIND:
9192 return ucs4lib_rsplit_whitespace(
9193 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
9194 PyUnicode_GET_LENGTH(self), maxcount
9195 );
9196 default:
9197 assert(0);
9198 return NULL;
9199 }
9200
9201 if (PyUnicode_READY(substring) == -1)
9202 return NULL;
9203
9204 kind1 = PyUnicode_KIND(self);
9205 kind2 = PyUnicode_KIND(substring);
9206 kind = kind1 > kind2 ? kind1 : kind2;
9207 buf1 = PyUnicode_DATA(self);
9208 buf2 = PyUnicode_DATA(substring);
9209 if (kind1 != kind)
9210 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9211 if (!buf1)
9212 return NULL;
9213 if (kind2 != kind)
9214 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9215 if (!buf2) {
9216 if (kind1 != kind) PyMem_Free(buf1);
9217 return NULL;
9218 }
9219 len1 = PyUnicode_GET_LENGTH(self);
9220 len2 = PyUnicode_GET_LENGTH(substring);
9221
9222 switch(kind) {
9223 case PyUnicode_1BYTE_KIND:
9224 out = ucs1lib_rsplit(
9225 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9226 break;
9227 case PyUnicode_2BYTE_KIND:
9228 out = ucs2lib_rsplit(
9229 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9230 break;
9231 case PyUnicode_4BYTE_KIND:
9232 out = ucs4lib_rsplit(
9233 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9234 break;
9235 default:
9236 out = NULL;
9237 }
9238 if (kind1 != kind)
9239 PyMem_Free(buf1);
9240 if (kind2 != kind)
9241 PyMem_Free(buf2);
9242 return out;
9243}
9244
9245static Py_ssize_t
9246anylib_find(int kind, void *buf1, Py_ssize_t len1,
9247 void *buf2, Py_ssize_t len2, Py_ssize_t offset)
9248{
9249 switch(kind) {
9250 case PyUnicode_1BYTE_KIND:
9251 return ucs1lib_find(buf1, len1, buf2, len2, offset);
9252 case PyUnicode_2BYTE_KIND:
9253 return ucs2lib_find(buf1, len1, buf2, len2, offset);
9254 case PyUnicode_4BYTE_KIND:
9255 return ucs4lib_find(buf1, len1, buf2, len2, offset);
9256 }
9257 assert(0);
9258 return -1;
9259}
9260
9261static Py_ssize_t
9262anylib_count(int kind, void* sbuf, Py_ssize_t slen,
9263 void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
9264{
9265 switch(kind) {
9266 case PyUnicode_1BYTE_KIND:
9267 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
9268 case PyUnicode_2BYTE_KIND:
9269 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
9270 case PyUnicode_4BYTE_KIND:
9271 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
9272 }
9273 assert(0);
9274 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009275}
9276
Alexander Belopolsky40018472011-02-26 01:02:56 +00009277static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009278replace(PyObject *self, PyObject *str1,
9279 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009280{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009281 PyObject *u;
9282 char *sbuf = PyUnicode_DATA(self);
9283 char *buf1 = PyUnicode_DATA(str1);
9284 char *buf2 = PyUnicode_DATA(str2);
9285 int srelease = 0, release1 = 0, release2 = 0;
9286 int skind = PyUnicode_KIND(self);
9287 int kind1 = PyUnicode_KIND(str1);
9288 int kind2 = PyUnicode_KIND(str2);
9289 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
9290 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
9291 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009292
9293 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009294 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009295 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009296 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009297
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009298 if (skind < kind1)
9299 /* substring too wide to be present */
9300 goto nothing;
9301
9302 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00009303 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009304 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009305 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009306 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009307 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009308 /* replace characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009309 Py_UCS4 u1, u2, maxchar;
9310 int mayshrink, rkind;
9311 u1 = PyUnicode_READ_CHAR(str1, 0);
9312 if (!findchar(sbuf, PyUnicode_KIND(self),
9313 slen, u1, 1))
Thomas Wouters477c8d52006-05-27 19:21:47 +00009314 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009315 u2 = PyUnicode_READ_CHAR(str2, 0);
9316 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9317 /* Replacing u1 with u2 may cause a maxchar reduction in the
9318 result string. */
9319 mayshrink = maxchar > 127;
9320 if (u2 > maxchar) {
9321 maxchar = u2;
9322 mayshrink = 0;
9323 }
9324 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009325 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009326 goto error;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009327 if (PyUnicode_CopyCharacters(u, 0,
9328 (PyObject*)self, 0, slen) < 0)
9329 {
9330 Py_DECREF(u);
9331 return NULL;
9332 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009333 rkind = PyUnicode_KIND(u);
9334 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
9335 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009336 if (--maxcount < 0)
9337 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009338 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009339 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009340 if (mayshrink) {
9341 PyObject *tmp = u;
9342 u = PyUnicode_FromKindAndData(rkind, PyUnicode_DATA(tmp),
9343 PyUnicode_GET_LENGTH(tmp));
9344 Py_DECREF(tmp);
9345 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009346 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009347 int rkind = skind;
9348 char *res;
9349 if (kind1 < rkind) {
9350 /* widen substring */
9351 buf1 = _PyUnicode_AsKind(str1, rkind);
9352 if (!buf1) goto error;
9353 release1 = 1;
9354 }
9355 i = anylib_find(rkind, sbuf, slen, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009356 if (i < 0)
9357 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009358 if (rkind > kind2) {
9359 /* widen replacement */
9360 buf2 = _PyUnicode_AsKind(str2, rkind);
9361 if (!buf2) goto error;
9362 release2 = 1;
9363 }
9364 else if (rkind < kind2) {
9365 /* widen self and buf1 */
9366 rkind = kind2;
9367 if (release1) PyMem_Free(buf1);
9368 sbuf = _PyUnicode_AsKind(self, rkind);
9369 if (!sbuf) goto error;
9370 srelease = 1;
9371 buf1 = _PyUnicode_AsKind(str1, rkind);
9372 if (!buf1) goto error;
9373 release1 = 1;
9374 }
9375 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, slen));
9376 if (!res) {
9377 PyErr_NoMemory();
9378 goto error;
9379 }
9380 memcpy(res, sbuf, PyUnicode_KIND_SIZE(rkind, slen));
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009381 /* change everything in-place, starting with this one */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009382 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
9383 buf2,
9384 PyUnicode_KIND_SIZE(rkind, len2));
9385 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009386
9387 while ( --maxcount > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009388 i = anylib_find(rkind, sbuf+PyUnicode_KIND_SIZE(rkind, i),
9389 slen-i,
9390 buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009391 if (i == -1)
9392 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009393 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
9394 buf2,
9395 PyUnicode_KIND_SIZE(rkind, len2));
9396 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009397 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009398
9399 u = PyUnicode_FromKindAndData(rkind, res, slen);
9400 PyMem_Free(res);
9401 if (!u) goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009402 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009403 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009404
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009405 Py_ssize_t n, i, j, ires;
9406 Py_ssize_t product, new_size;
9407 int rkind = skind;
9408 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009409
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009410 if (kind1 < rkind) {
9411 buf1 = _PyUnicode_AsKind(str1, rkind);
9412 if (!buf1) goto error;
9413 release1 = 1;
9414 }
9415 n = anylib_count(rkind, sbuf, slen, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009416 if (n == 0)
9417 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009418 if (kind2 < rkind) {
9419 buf2 = _PyUnicode_AsKind(str2, rkind);
9420 if (!buf2) goto error;
9421 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009422 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009423 else if (kind2 > rkind) {
9424 rkind = kind2;
9425 sbuf = _PyUnicode_AsKind(self, rkind);
9426 if (!sbuf) goto error;
9427 srelease = 1;
9428 if (release1) PyMem_Free(buf1);
9429 buf1 = _PyUnicode_AsKind(str1, rkind);
9430 if (!buf1) goto error;
9431 release1 = 1;
9432 }
9433 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
9434 PyUnicode_GET_LENGTH(str1))); */
9435 product = n * (len2-len1);
9436 if ((product / (len2-len1)) != n) {
9437 PyErr_SetString(PyExc_OverflowError,
9438 "replace string is too long");
9439 goto error;
9440 }
9441 new_size = slen + product;
9442 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
9443 PyErr_SetString(PyExc_OverflowError,
9444 "replace string is too long");
9445 goto error;
9446 }
9447 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, new_size));
9448 if (!res)
9449 goto error;
9450 ires = i = 0;
9451 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009452 while (n-- > 0) {
9453 /* look for next match */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009454 j = anylib_find(rkind,
9455 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9456 slen-i, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009457 if (j == -1)
9458 break;
9459 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009460 /* copy unchanged part [i:j] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009461 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9462 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9463 PyUnicode_KIND_SIZE(rkind, j-i));
9464 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009465 }
9466 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009467 if (len2 > 0) {
9468 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9469 buf2,
9470 PyUnicode_KIND_SIZE(rkind, len2));
9471 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009472 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009473 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009474 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009475 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009476 /* copy tail [i:] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009477 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9478 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9479 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009480 } else {
9481 /* interleave */
9482 while (n > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009483 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9484 buf2,
9485 PyUnicode_KIND_SIZE(rkind, len2));
9486 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009487 if (--n <= 0)
9488 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009489 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9490 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9491 PyUnicode_KIND_SIZE(rkind, 1));
9492 ires++;
9493 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009494 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009495 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9496 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9497 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009498 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009499 u = PyUnicode_FromKindAndData(rkind, res, new_size);
Martin v. Löwis0b1d3482011-10-01 16:35:40 +02009500 PyMem_Free(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009501 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009502 if (srelease)
9503 PyMem_FREE(sbuf);
9504 if (release1)
9505 PyMem_FREE(buf1);
9506 if (release2)
9507 PyMem_FREE(buf2);
9508 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009509
Benjamin Peterson29060642009-01-31 22:14:21 +00009510 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00009511 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009512 if (srelease)
9513 PyMem_FREE(sbuf);
9514 if (release1)
9515 PyMem_FREE(buf1);
9516 if (release2)
9517 PyMem_FREE(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009518 if (PyUnicode_CheckExact(self)) {
9519 Py_INCREF(self);
9520 return (PyObject *) self;
9521 }
Victor Stinner034f6cf2011-09-30 02:26:44 +02009522 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009523 error:
9524 if (srelease && sbuf)
9525 PyMem_FREE(sbuf);
9526 if (release1 && buf1)
9527 PyMem_FREE(buf1);
9528 if (release2 && buf2)
9529 PyMem_FREE(buf2);
9530 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009531}
9532
9533/* --- Unicode Object Methods --------------------------------------------- */
9534
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009535PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009536 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009537\n\
9538Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009539characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009540
9541static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009542unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009543{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009544 return fixup(self, fixtitle);
9545}
9546
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009547PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009548 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009549\n\
9550Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00009551have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009552
9553static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009554unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009555{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009556 return fixup(self, fixcapitalize);
9557}
9558
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled (compiled out by the surrounding #if 0).
   Splits self on whitespace, replaces each word in the list by its
   fixcapitalize'd version, then joins the words with the default separator
   of PyUnicode_Join().  Returns NULL if any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *capitalized =
            fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                  fixcapitalize);
        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        /* Drop the list's reference to the old word before the slot is
           overwritten with the new one. */
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
9596
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009597/* Argument converter. Coerces to a single unicode character */
9598
9599static int
9600convert_uc(PyObject *obj, void *addr)
9601{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009602 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009603 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009604
Benjamin Peterson14339b62009-01-31 16:36:08 +00009605 uniobj = PyUnicode_FromObject(obj);
9606 if (uniobj == NULL) {
9607 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009608 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009609 return 0;
9610 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009611 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009612 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009613 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009614 Py_DECREF(uniobj);
9615 return 0;
9616 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009617 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009618 Py_DECREF(uniobj);
9619 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009620}
9621
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009622PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009623 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009624\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009625Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009626done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009627
9628static PyObject *
9629unicode_center(PyUnicodeObject *self, PyObject *args)
9630{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009631 Py_ssize_t marg, left;
9632 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009633 Py_UCS4 fillchar = ' ';
9634
Victor Stinnere9a29352011-10-01 02:14:59 +02009635 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009636 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009637
Victor Stinnere9a29352011-10-01 02:14:59 +02009638 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009639 return NULL;
9640
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009641 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009642 Py_INCREF(self);
9643 return (PyObject*) self;
9644 }
9645
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009646 marg = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009647 left = marg / 2 + (marg & width & 1);
9648
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009649 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009650}
9651
Marc-André Lemburge5034372000-08-08 08:04:29 +00009652#if 0
9653
9654/* This code should go into some future Unicode collation support
9655 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00009656 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00009657
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009658/* speedy UTF-16 code point order comparison */
9659/* gleaned from: */
9660/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
9661
/* Adjustment table indexed by (code unit >> 11), i.e. one entry per block
   of 2048 code units.  Only entries 27..31 are non-zero. */
static short utf16Fixup[32] =
{
    /*  0.. 3 */ 0, 0, 0, 0,
    /*  4.. 7 */ 0, 0, 0, 0,
    /*  8..11 */ 0, 0, 0, 0,
    /* 12..15 */ 0, 0, 0, 0,
    /* 16..19 */ 0, 0, 0, 0,
    /* 20..23 */ 0, 0, 0, 0,
    /* 24..27 */ 0, 0, 0, 0x2000,
    /* 28..31 */ -0x800, -0x800, -0x800, -0x800
};
9669
Guido van Rossumd57fd912000-03-10 22:53:23 +00009670static int
9671unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9672{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009673 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009674
Guido van Rossumd57fd912000-03-10 22:53:23 +00009675 Py_UNICODE *s1 = str1->str;
9676 Py_UNICODE *s2 = str2->str;
9677
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009678 len1 = str1->_base._base.length;
9679 len2 = str2->_base._base.length;
Tim Petersced69f82003-09-16 20:30:58 +00009680
Guido van Rossumd57fd912000-03-10 22:53:23 +00009681 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00009682 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009683
9684 c1 = *s1++;
9685 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00009686
Benjamin Peterson29060642009-01-31 22:14:21 +00009687 if (c1 > (1<<11) * 26)
9688 c1 += utf16Fixup[c1>>11];
9689 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009690 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009691 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00009692
9693 if (c1 != c2)
9694 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00009695
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009696 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009697 }
9698
9699 return (len1 < len2) ? -1 : (len1 != len2);
9700}
9701
Marc-André Lemburge5034372000-08-08 08:04:29 +00009702#else
9703
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009704/* This function assumes that str1 and str2 are readied by the caller. */
9705
Marc-André Lemburge5034372000-08-08 08:04:29 +00009706static int
9707unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9708{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009709 int kind1, kind2;
9710 void *data1, *data2;
9711 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009712
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009713 kind1 = PyUnicode_KIND(str1);
9714 kind2 = PyUnicode_KIND(str2);
9715 data1 = PyUnicode_DATA(str1);
9716 data2 = PyUnicode_DATA(str2);
9717 len1 = PyUnicode_GET_LENGTH(str1);
9718 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +00009719
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009720 for (i = 0; i < len1 && i < len2; ++i) {
9721 Py_UCS4 c1, c2;
9722 c1 = PyUnicode_READ(kind1, data1, i);
9723 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +00009724
9725 if (c1 != c2)
9726 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009727 }
9728
9729 return (len1 < len2) ? -1 : (len1 != len2);
9730}
9731
9732#endif
9733
Alexander Belopolsky40018472011-02-26 01:02:56 +00009734int
9735PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009736{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009737 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9738 if (PyUnicode_READY(left) == -1 ||
9739 PyUnicode_READY(right) == -1)
9740 return -1;
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009741 return unicode_compare((PyUnicodeObject *)left,
9742 (PyUnicodeObject *)right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009743 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009744 PyErr_Format(PyExc_TypeError,
9745 "Can't compare %.100s and %.100s",
9746 left->ob_type->tp_name,
9747 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009748 return -1;
9749}
9750
Martin v. Löwis5b222132007-06-10 09:51:05 +00009751int
9752PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
9753{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009754 Py_ssize_t i;
9755 int kind;
9756 void *data;
9757 Py_UCS4 chr;
9758
Victor Stinner910337b2011-10-03 03:20:16 +02009759 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009760 if (PyUnicode_READY(uni) == -1)
9761 return -1;
9762 kind = PyUnicode_KIND(uni);
9763 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +00009764 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009765 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
9766 if (chr != str[i])
9767 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00009768 /* This check keeps Python strings that end in '\0' from comparing equal
9769 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009770 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +00009771 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009772 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00009773 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009774 return 0;
9775}
9776
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009777
/* Map a C truth value to Py_True / Py_False (no reference is taken). */
#define TEST_COND(cond) ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009780
Alexander Belopolsky40018472011-02-26 01:02:56 +00009781PyObject *
9782PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009783{
9784 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009785
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009786 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9787 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009788 if (PyUnicode_READY(left) == -1 ||
9789 PyUnicode_READY(right) == -1)
9790 return NULL;
9791 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
9792 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009793 if (op == Py_EQ) {
9794 Py_INCREF(Py_False);
9795 return Py_False;
9796 }
9797 if (op == Py_NE) {
9798 Py_INCREF(Py_True);
9799 return Py_True;
9800 }
9801 }
9802 if (left == right)
9803 result = 0;
9804 else
9805 result = unicode_compare((PyUnicodeObject *)left,
9806 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009807
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009808 /* Convert the return value to a Boolean */
9809 switch (op) {
9810 case Py_EQ:
9811 v = TEST_COND(result == 0);
9812 break;
9813 case Py_NE:
9814 v = TEST_COND(result != 0);
9815 break;
9816 case Py_LE:
9817 v = TEST_COND(result <= 0);
9818 break;
9819 case Py_GE:
9820 v = TEST_COND(result >= 0);
9821 break;
9822 case Py_LT:
9823 v = TEST_COND(result == -1);
9824 break;
9825 case Py_GT:
9826 v = TEST_COND(result == 1);
9827 break;
9828 default:
9829 PyErr_BadArgument();
9830 return NULL;
9831 }
9832 Py_INCREF(v);
9833 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009834 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00009835
Brian Curtindfc80e32011-08-10 20:28:54 -05009836 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009837}
9838
Alexander Belopolsky40018472011-02-26 01:02:56 +00009839int
9840PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00009841{
Thomas Wouters477c8d52006-05-27 19:21:47 +00009842 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009843 int kind1, kind2, kind;
9844 void *buf1, *buf2;
9845 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009846 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009847
9848 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00009849 sub = PyUnicode_FromObject(element);
9850 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009851 PyErr_Format(PyExc_TypeError,
9852 "'in <string>' requires string as left operand, not %s",
9853 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009854 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009855 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009856 if (PyUnicode_READY(sub) == -1)
9857 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009858
Thomas Wouters477c8d52006-05-27 19:21:47 +00009859 str = PyUnicode_FromObject(container);
Victor Stinnere9a29352011-10-01 02:14:59 +02009860 if (!str || PyUnicode_READY(str) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009861 Py_DECREF(sub);
9862 return -1;
9863 }
9864
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009865 kind1 = PyUnicode_KIND(str);
9866 kind2 = PyUnicode_KIND(sub);
9867 kind = kind1 > kind2 ? kind1 : kind2;
9868 buf1 = PyUnicode_DATA(str);
9869 buf2 = PyUnicode_DATA(sub);
9870 if (kind1 != kind)
9871 buf1 = _PyUnicode_AsKind((PyObject*)str, kind);
9872 if (!buf1) {
9873 Py_DECREF(sub);
9874 return -1;
9875 }
9876 if (kind2 != kind)
9877 buf2 = _PyUnicode_AsKind((PyObject*)sub, kind);
9878 if (!buf2) {
9879 Py_DECREF(sub);
9880 if (kind1 != kind) PyMem_Free(buf1);
9881 return -1;
9882 }
9883 len1 = PyUnicode_GET_LENGTH(str);
9884 len2 = PyUnicode_GET_LENGTH(sub);
9885
9886 switch(kind) {
9887 case PyUnicode_1BYTE_KIND:
9888 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
9889 break;
9890 case PyUnicode_2BYTE_KIND:
9891 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
9892 break;
9893 case PyUnicode_4BYTE_KIND:
9894 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
9895 break;
9896 default:
9897 result = -1;
9898 assert(0);
9899 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009900
9901 Py_DECREF(str);
9902 Py_DECREF(sub);
9903
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009904 if (kind1 != kind)
9905 PyMem_Free(buf1);
9906 if (kind2 != kind)
9907 PyMem_Free(buf2);
9908
Guido van Rossum403d68b2000-03-13 15:55:09 +00009909 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009910}
9911
Guido van Rossumd57fd912000-03-10 22:53:23 +00009912/* Concat to string or Unicode object giving a new Unicode object. */
9913
Alexander Belopolsky40018472011-02-26 01:02:56 +00009914PyObject *
9915PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009916{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009917 PyObject *u = NULL, *v = NULL, *w;
9918 Py_UCS4 maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009919
9920 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009921 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009922 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009923 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009924 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009925 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009926 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009927
9928 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +02009929 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009930 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009931 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009932 }
Victor Stinnera464fc12011-10-02 20:39:30 +02009933 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009934 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009935 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009936 }
9937
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009938 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinnerff9e50f2011-09-28 22:17:19 +02009939 maxchar = Py_MAX(maxchar, PyUnicode_MAX_CHAR_VALUE(v));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009940
Guido van Rossumd57fd912000-03-10 22:53:23 +00009941 /* Concat the two Unicode strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009942 w = PyUnicode_New(
9943 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
9944 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009945 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009946 goto onError;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009947 if (PyUnicode_CopyCharacters(w, 0, u, 0, PyUnicode_GET_LENGTH(u)) < 0)
9948 goto onError;
Victor Stinner157f83f2011-09-28 21:41:31 +02009949 if (PyUnicode_CopyCharacters(w, PyUnicode_GET_LENGTH(u),
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009950 v, 0,
9951 PyUnicode_GET_LENGTH(v)) < 0)
9952 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009953 Py_DECREF(u);
9954 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009955 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009956
Benjamin Peterson29060642009-01-31 22:14:21 +00009957 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009958 Py_XDECREF(u);
9959 Py_XDECREF(v);
9960 return NULL;
9961}
9962
Walter Dörwald1ab83302007-05-18 17:15:44 +00009963void
Victor Stinner23e56682011-10-03 03:54:37 +02009964PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +00009965{
Victor Stinner23e56682011-10-03 03:54:37 +02009966 PyObject *left, *res;
9967
9968 if (p_left == NULL) {
9969 if (!PyErr_Occurred())
9970 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +00009971 return;
9972 }
Victor Stinner23e56682011-10-03 03:54:37 +02009973 left = *p_left;
9974 if (right == NULL || !PyUnicode_Check(left)) {
9975 if (!PyErr_Occurred())
9976 PyErr_BadInternalCall();
9977 goto error;
9978 }
9979
9980 if (PyUnicode_CheckExact(left) && left != unicode_empty
9981 && PyUnicode_CheckExact(right) && right != unicode_empty
9982 && unicode_resizable(left)
9983 && (_PyUnicode_KIND(right) <= _PyUnicode_KIND(left)
9984 || _PyUnicode_WSTR(left) != NULL))
9985 {
Victor Stinnerb8038952011-10-03 23:27:56 +02009986 Py_ssize_t left_len, right_len, new_len;
9987#ifdef Py_DEBUG
9988 Py_ssize_t copied;
9989#endif
Victor Stinner23e56682011-10-03 03:54:37 +02009990
Victor Stinner23e56682011-10-03 03:54:37 +02009991 if (PyUnicode_READY(left))
9992 goto error;
9993 if (PyUnicode_READY(right))
9994 goto error;
9995
9996 /* FIXME: support ascii+latin1, PyASCIIObject => PyCompactUnicodeObject */
9997 if (PyUnicode_MAX_CHAR_VALUE(right) <= PyUnicode_MAX_CHAR_VALUE(left))
9998 {
Victor Stinnerb8038952011-10-03 23:27:56 +02009999 left_len = PyUnicode_GET_LENGTH(left);
10000 right_len = PyUnicode_GET_LENGTH(right);
10001 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner23e56682011-10-03 03:54:37 +020010002 PyErr_SetString(PyExc_OverflowError,
10003 "strings are too large to concat");
10004 goto error;
10005 }
Victor Stinnerb8038952011-10-03 23:27:56 +020010006 new_len = left_len + right_len;
Victor Stinner23e56682011-10-03 03:54:37 +020010007
10008 /* Now we own the last reference to 'left', so we can resize it
10009 * in-place.
10010 */
10011 if (unicode_resize(&left, new_len) != 0) {
10012 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10013 * deallocated so it cannot be put back into
10014 * 'variable'. The MemoryError is raised when there
10015 * is no value in 'variable', which might (very
10016 * remotely) be a cause of incompatibilities.
10017 */
10018 goto error;
10019 }
10020 /* copy 'right' into the newly allocated area of 'left' */
Victor Stinnerb8038952011-10-03 23:27:56 +020010021#ifdef Py_DEBUG
10022 copied = PyUnicode_CopyCharacters(left, left_len,
Victor Stinner23e56682011-10-03 03:54:37 +020010023 right, 0,
Victor Stinnerb8038952011-10-03 23:27:56 +020010024 right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020010025 assert(0 <= copied);
Victor Stinnerb8038952011-10-03 23:27:56 +020010026#else
10027 PyUnicode_CopyCharacters(left, left_len, right, 0, right_len);
10028#endif
Victor Stinner23e56682011-10-03 03:54:37 +020010029 *p_left = left;
10030 return;
10031 }
10032 }
10033
10034 res = PyUnicode_Concat(left, right);
10035 if (res == NULL)
10036 goto error;
10037 Py_DECREF(left);
10038 *p_left = res;
10039 return;
10040
10041error:
10042 Py_DECREF(*p_left);
10043 *p_left = NULL;
Walter Dörwald1ab83302007-05-18 17:15:44 +000010044}
10045
10046void
10047PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10048{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010049 PyUnicode_Append(pleft, right);
10050 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010051}
10052
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010053PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010054 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010055\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010056Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010057string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010058interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010059
10060static PyObject *
10061unicode_count(PyUnicodeObject *self, PyObject *args)
10062{
10063 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010064 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010065 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010066 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010067 int kind1, kind2, kind;
10068 void *buf1, *buf2;
10069 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010070
Jesus Ceaac451502011-04-20 17:09:23 +020010071 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10072 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010073 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010074
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010075 kind1 = PyUnicode_KIND(self);
10076 kind2 = PyUnicode_KIND(substring);
10077 kind = kind1 > kind2 ? kind1 : kind2;
10078 buf1 = PyUnicode_DATA(self);
10079 buf2 = PyUnicode_DATA(substring);
10080 if (kind1 != kind)
10081 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
10082 if (!buf1) {
10083 Py_DECREF(substring);
10084 return NULL;
10085 }
10086 if (kind2 != kind)
10087 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
10088 if (!buf2) {
10089 Py_DECREF(substring);
10090 if (kind1 != kind) PyMem_Free(buf1);
10091 return NULL;
10092 }
10093 len1 = PyUnicode_GET_LENGTH(self);
10094 len2 = PyUnicode_GET_LENGTH(substring);
10095
10096 ADJUST_INDICES(start, end, len1);
10097 switch(kind) {
10098 case PyUnicode_1BYTE_KIND:
10099 iresult = ucs1lib_count(
10100 ((Py_UCS1*)buf1) + start, end - start,
10101 buf2, len2, PY_SSIZE_T_MAX
10102 );
10103 break;
10104 case PyUnicode_2BYTE_KIND:
10105 iresult = ucs2lib_count(
10106 ((Py_UCS2*)buf1) + start, end - start,
10107 buf2, len2, PY_SSIZE_T_MAX
10108 );
10109 break;
10110 case PyUnicode_4BYTE_KIND:
10111 iresult = ucs4lib_count(
10112 ((Py_UCS4*)buf1) + start, end - start,
10113 buf2, len2, PY_SSIZE_T_MAX
10114 );
10115 break;
10116 default:
10117 assert(0); iresult = 0;
10118 }
10119
10120 result = PyLong_FromSsize_t(iresult);
10121
10122 if (kind1 != kind)
10123 PyMem_Free(buf1);
10124 if (kind2 != kind)
10125 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010126
10127 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010128
Guido van Rossumd57fd912000-03-10 22:53:23 +000010129 return result;
10130}
10131
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010132PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010133 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010134\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010135Encode S using the codec registered for encoding. Default encoding\n\
10136is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010137handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010138a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10139'xmlcharrefreplace' as well as any other name registered with\n\
10140codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010141
10142static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +000010143unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010144{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010145 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010146 char *encoding = NULL;
10147 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010148
Benjamin Peterson308d6372009-09-18 21:42:35 +000010149 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10150 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010151 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +000010152 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010153}
10154
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010155PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010156 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010157\n\
10158Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010159If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010160
10161static PyObject*
10162unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
10163{
10164 Py_UNICODE *e;
10165 Py_UNICODE *p;
10166 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010167 Py_UNICODE *qe;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010168 Py_ssize_t i, j, incr, wstr_length;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010169 PyUnicodeObject *u;
10170 int tabsize = 8;
10171
10172 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010173 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010174
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010175 if (PyUnicode_AsUnicodeAndSize((PyObject *)self, &wstr_length) == NULL)
10176 return NULL;
10177
Thomas Wouters7e474022000-07-16 12:04:32 +000010178 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010179 i = 0; /* chars up to and including most recent \n or \r */
10180 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010181 e = _PyUnicode_WSTR(self) + wstr_length; /* end of input */
10182 for (p = _PyUnicode_WSTR(self); p < e; p++)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010183 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010184 if (tabsize > 0) {
10185 incr = tabsize - (j % tabsize); /* cannot overflow */
10186 if (j > PY_SSIZE_T_MAX - incr)
10187 goto overflow1;
10188 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010189 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010190 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010191 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000010192 if (j > PY_SSIZE_T_MAX - 1)
10193 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010194 j++;
10195 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010196 if (i > PY_SSIZE_T_MAX - j)
10197 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010198 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010199 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010200 }
10201 }
10202
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010203 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +000010204 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +000010205
Guido van Rossumd57fd912000-03-10 22:53:23 +000010206 /* Second pass: create output string and fill it */
10207 u = _PyUnicode_New(i + j);
10208 if (!u)
10209 return NULL;
10210
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010211 j = 0; /* same as in first pass */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010212 q = _PyUnicode_WSTR(u); /* next output char */
10213 qe = _PyUnicode_WSTR(u) + PyUnicode_GET_SIZE(u); /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +000010214
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010215 for (p = _PyUnicode_WSTR(self); p < e; p++)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010216 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010217 if (tabsize > 0) {
10218 i = tabsize - (j % tabsize);
10219 j += i;
10220 while (i--) {
10221 if (q >= qe)
10222 goto overflow2;
10223 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010224 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010225 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010226 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010227 else {
10228 if (q >= qe)
10229 goto overflow2;
10230 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010231 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010232 if (*p == '\n' || *p == '\r')
10233 j = 0;
10234 }
10235
Victor Stinner1b4f9ce2011-10-03 13:28:14 +020010236 if (_PyUnicode_READY_REPLACE(&u)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010237 Py_DECREF(u);
10238 return NULL;
10239 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010240 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010241
10242 overflow2:
10243 Py_DECREF(u);
10244 overflow1:
10245 PyErr_SetString(PyExc_OverflowError, "new string is too long");
10246 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010247}
10248
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010249PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010250 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010251\n\
10252Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010253such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010254arguments start and end are interpreted as in slice notation.\n\
10255\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010256Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010257
10258static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010259unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010260{
Jesus Ceaac451502011-04-20 17:09:23 +020010261 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010262 Py_ssize_t start;
10263 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010264 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010265
Jesus Ceaac451502011-04-20 17:09:23 +020010266 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
10267 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010268 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010269
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010270 if (PyUnicode_READY(self) == -1)
10271 return NULL;
10272 if (PyUnicode_READY(substring) == -1)
10273 return NULL;
10274
10275 result = any_find_slice(
10276 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
10277 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010278 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010279
10280 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010281
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010282 if (result == -2)
10283 return NULL;
10284
Christian Heimes217cfd12007-12-02 14:31:20 +000010285 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010286}
10287
10288static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010289unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010290{
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010291 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
10292 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010293 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010294 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010295}
10296
Guido van Rossumc2504932007-09-18 19:42:40 +000010297/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010010298 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000010299static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +000010300unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010301{
Guido van Rossumc2504932007-09-18 19:42:40 +000010302 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010010303 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010304
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010305 if (_PyUnicode_HASH(self) != -1)
10306 return _PyUnicode_HASH(self);
10307 if (PyUnicode_READY(self) == -1)
10308 return -1;
10309 len = PyUnicode_GET_LENGTH(self);
10310
10311 /* The hash function as a macro, gets expanded three times below. */
10312#define HASH(P) \
10313 x = (Py_uhash_t)*P << 7; \
10314 while (--len >= 0) \
10315 x = (1000003*x) ^ (Py_uhash_t)*P++;
10316
10317 switch (PyUnicode_KIND(self)) {
10318 case PyUnicode_1BYTE_KIND: {
10319 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
10320 HASH(c);
10321 break;
10322 }
10323 case PyUnicode_2BYTE_KIND: {
10324 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
10325 HASH(s);
10326 break;
10327 }
10328 default: {
10329 Py_UCS4 *l;
10330 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
10331 "Impossible switch case in unicode_hash");
10332 l = PyUnicode_4BYTE_DATA(self);
10333 HASH(l);
10334 break;
10335 }
10336 }
10337 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
10338
Guido van Rossumc2504932007-09-18 19:42:40 +000010339 if (x == -1)
10340 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010341 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010342 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010343}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010344#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000010345
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010346PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010347 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010348\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010349Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010350
10351static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010352unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010353{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010354 Py_ssize_t result;
Jesus Ceaac451502011-04-20 17:09:23 +020010355 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010356 Py_ssize_t start;
10357 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010358
Jesus Ceaac451502011-04-20 17:09:23 +020010359 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
10360 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010361 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010362
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010363 if (PyUnicode_READY(self) == -1)
10364 return NULL;
10365 if (PyUnicode_READY(substring) == -1)
10366 return NULL;
10367
10368 result = any_find_slice(
10369 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
10370 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010371 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010372
10373 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010374
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010375 if (result == -2)
10376 return NULL;
10377
Guido van Rossumd57fd912000-03-10 22:53:23 +000010378 if (result < 0) {
10379 PyErr_SetString(PyExc_ValueError, "substring not found");
10380 return NULL;
10381 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010382
Christian Heimes217cfd12007-12-02 14:31:20 +000010383 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010384}
10385
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010386PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010387 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010388\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010389Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010390at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010391
10392static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010393unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010394{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010395 Py_ssize_t i, length;
10396 int kind;
10397 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010398 int cased;
10399
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010400 if (PyUnicode_READY(self) == -1)
10401 return NULL;
10402 length = PyUnicode_GET_LENGTH(self);
10403 kind = PyUnicode_KIND(self);
10404 data = PyUnicode_DATA(self);
10405
Guido van Rossumd57fd912000-03-10 22:53:23 +000010406 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010407 if (length == 1)
10408 return PyBool_FromLong(
10409 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010410
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010411 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010412 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010413 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010414
Guido van Rossumd57fd912000-03-10 22:53:23 +000010415 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010416 for (i = 0; i < length; i++) {
10417 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010418
Benjamin Peterson29060642009-01-31 22:14:21 +000010419 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
10420 return PyBool_FromLong(0);
10421 else if (!cased && Py_UNICODE_ISLOWER(ch))
10422 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010423 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010424 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010425}
10426
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010427PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010428 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010429\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010430Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010431at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010432
10433static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010434unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010435{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010436 Py_ssize_t i, length;
10437 int kind;
10438 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010439 int cased;
10440
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010441 if (PyUnicode_READY(self) == -1)
10442 return NULL;
10443 length = PyUnicode_GET_LENGTH(self);
10444 kind = PyUnicode_KIND(self);
10445 data = PyUnicode_DATA(self);
10446
Guido van Rossumd57fd912000-03-10 22:53:23 +000010447 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010448 if (length == 1)
10449 return PyBool_FromLong(
10450 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010451
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010452 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010453 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010454 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010455
Guido van Rossumd57fd912000-03-10 22:53:23 +000010456 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010457 for (i = 0; i < length; i++) {
10458 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010459
Benjamin Peterson29060642009-01-31 22:14:21 +000010460 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
10461 return PyBool_FromLong(0);
10462 else if (!cased && Py_UNICODE_ISUPPER(ch))
10463 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010464 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010465 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010466}
10467
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010468PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010469 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010470\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010471Return True if S is a titlecased string and there is at least one\n\
10472character in S, i.e. upper- and titlecase characters may only\n\
10473follow uncased characters and lowercase characters only cased ones.\n\
10474Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010475
10476static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010477unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010478{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010479 Py_ssize_t i, length;
10480 int kind;
10481 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010482 int cased, previous_is_cased;
10483
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010484 if (PyUnicode_READY(self) == -1)
10485 return NULL;
10486 length = PyUnicode_GET_LENGTH(self);
10487 kind = PyUnicode_KIND(self);
10488 data = PyUnicode_DATA(self);
10489
Guido van Rossumd57fd912000-03-10 22:53:23 +000010490 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010491 if (length == 1) {
10492 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10493 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
10494 (Py_UNICODE_ISUPPER(ch) != 0));
10495 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010496
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010497 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010498 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010499 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010500
Guido van Rossumd57fd912000-03-10 22:53:23 +000010501 cased = 0;
10502 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010503 for (i = 0; i < length; i++) {
10504 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010505
Benjamin Peterson29060642009-01-31 22:14:21 +000010506 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
10507 if (previous_is_cased)
10508 return PyBool_FromLong(0);
10509 previous_is_cased = 1;
10510 cased = 1;
10511 }
10512 else if (Py_UNICODE_ISLOWER(ch)) {
10513 if (!previous_is_cased)
10514 return PyBool_FromLong(0);
10515 previous_is_cased = 1;
10516 cased = 1;
10517 }
10518 else
10519 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010520 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010521 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010522}
10523
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010524PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010525 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010526\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010527Return True if all characters in S are whitespace\n\
10528and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010529
10530static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010531unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010532{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010533 Py_ssize_t i, length;
10534 int kind;
10535 void *data;
10536
10537 if (PyUnicode_READY(self) == -1)
10538 return NULL;
10539 length = PyUnicode_GET_LENGTH(self);
10540 kind = PyUnicode_KIND(self);
10541 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010542
Guido van Rossumd57fd912000-03-10 22:53:23 +000010543 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010544 if (length == 1)
10545 return PyBool_FromLong(
10546 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010547
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010548 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010549 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010550 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010551
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010552 for (i = 0; i < length; i++) {
10553 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010554 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010555 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010556 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010557 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010558}
10559
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010560PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010561 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010562\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010563Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010564and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010565
10566static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010567unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010568{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010569 Py_ssize_t i, length;
10570 int kind;
10571 void *data;
10572
10573 if (PyUnicode_READY(self) == -1)
10574 return NULL;
10575 length = PyUnicode_GET_LENGTH(self);
10576 kind = PyUnicode_KIND(self);
10577 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010578
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010579 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010580 if (length == 1)
10581 return PyBool_FromLong(
10582 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010583
10584 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010585 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010586 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010587
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010588 for (i = 0; i < length; i++) {
10589 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010590 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010591 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010592 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010593}
10594
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010595PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010596 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010597\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010598Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010599and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010600
10601static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010602unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010603{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010604 int kind;
10605 void *data;
10606 Py_ssize_t len, i;
10607
10608 if (PyUnicode_READY(self) == -1)
10609 return NULL;
10610
10611 kind = PyUnicode_KIND(self);
10612 data = PyUnicode_DATA(self);
10613 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010614
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010615 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010616 if (len == 1) {
10617 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10618 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
10619 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010620
10621 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010622 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010623 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010624
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010625 for (i = 0; i < len; i++) {
10626 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010627 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010628 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010629 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010630 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010631}
10632
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010633PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010634 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010635\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010636Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010637False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010638
10639static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010640unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010641{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010642 Py_ssize_t i, length;
10643 int kind;
10644 void *data;
10645
10646 if (PyUnicode_READY(self) == -1)
10647 return NULL;
10648 length = PyUnicode_GET_LENGTH(self);
10649 kind = PyUnicode_KIND(self);
10650 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010651
Guido van Rossumd57fd912000-03-10 22:53:23 +000010652 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010653 if (length == 1)
10654 return PyBool_FromLong(
10655 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010656
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010657 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010658 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010659 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010660
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010661 for (i = 0; i < length; i++) {
10662 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010663 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010664 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010665 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010666}
10667
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010668PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010669 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010670\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010671Return True if all characters in S are digits\n\
10672and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010673
10674static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010675unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010676{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010677 Py_ssize_t i, length;
10678 int kind;
10679 void *data;
10680
10681 if (PyUnicode_READY(self) == -1)
10682 return NULL;
10683 length = PyUnicode_GET_LENGTH(self);
10684 kind = PyUnicode_KIND(self);
10685 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010686
Guido van Rossumd57fd912000-03-10 22:53:23 +000010687 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010688 if (length == 1) {
10689 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10690 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
10691 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010692
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010693 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010694 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010695 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010696
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010697 for (i = 0; i < length; i++) {
10698 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010699 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010700 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010701 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010702}
10703
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010704PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010705 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010706\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010707Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010708False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010709
10710static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010711unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010712{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010713 Py_ssize_t i, length;
10714 int kind;
10715 void *data;
10716
10717 if (PyUnicode_READY(self) == -1)
10718 return NULL;
10719 length = PyUnicode_GET_LENGTH(self);
10720 kind = PyUnicode_KIND(self);
10721 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010722
Guido van Rossumd57fd912000-03-10 22:53:23 +000010723 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010724 if (length == 1)
10725 return PyBool_FromLong(
10726 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010727
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010728 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010729 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010730 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010731
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010732 for (i = 0; i < length; i++) {
10733 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010734 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010735 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010736 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010737}
10738
Martin v. Löwis47383402007-08-15 07:32:56 +000010739int
10740PyUnicode_IsIdentifier(PyObject *self)
10741{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010742 int kind;
10743 void *data;
10744 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010745 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000010746
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010747 if (PyUnicode_READY(self) == -1) {
10748 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000010749 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010750 }
10751
10752 /* Special case for empty strings */
10753 if (PyUnicode_GET_LENGTH(self) == 0)
10754 return 0;
10755 kind = PyUnicode_KIND(self);
10756 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000010757
10758 /* PEP 3131 says that the first character must be in
10759 XID_Start and subsequent characters in XID_Continue,
10760 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000010761 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000010762 letters, digits, underscore). However, given the current
10763 definition of XID_Start and XID_Continue, it is sufficient
10764 to check just for these, except that _ must be allowed
10765 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010766 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050010767 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000010768 return 0;
10769
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040010770 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010771 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010772 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000010773 return 1;
10774}
10775
10776PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010777 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000010778\n\
10779Return True if S is a valid identifier according\n\
10780to the language definition.");
10781
10782static PyObject*
10783unicode_isidentifier(PyObject *self)
10784{
10785 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
10786}
10787
Georg Brandl559e5d72008-06-11 18:37:52 +000010788PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010789 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000010790\n\
10791Return True if all characters in S are considered\n\
10792printable in repr() or S is empty, False otherwise.");
10793
10794static PyObject*
10795unicode_isprintable(PyObject *self)
10796{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010797 Py_ssize_t i, length;
10798 int kind;
10799 void *data;
10800
10801 if (PyUnicode_READY(self) == -1)
10802 return NULL;
10803 length = PyUnicode_GET_LENGTH(self);
10804 kind = PyUnicode_KIND(self);
10805 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000010806
10807 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010808 if (length == 1)
10809 return PyBool_FromLong(
10810 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000010811
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010812 for (i = 0; i < length; i++) {
10813 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000010814 Py_RETURN_FALSE;
10815 }
10816 }
10817 Py_RETURN_TRUE;
10818}
10819
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010820PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000010821 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010822\n\
10823Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000010824iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010825
10826static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010827unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010828{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010829 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010830}
10831
Martin v. Löwis18e16552006-02-15 17:27:45 +000010832static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +000010833unicode_length(PyUnicodeObject *self)
10834{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010835 if (PyUnicode_READY(self) == -1)
10836 return -1;
10837 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010838}
10839
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010840PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010841 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010842\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000010843Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010844done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010845
10846static PyObject *
10847unicode_ljust(PyUnicodeObject *self, PyObject *args)
10848{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010849 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010850 Py_UCS4 fillchar = ' ';
10851
10852 if (PyUnicode_READY(self) == -1)
10853 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010854
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010855 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010856 return NULL;
10857
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010858 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010859 Py_INCREF(self);
10860 return (PyObject*) self;
10861 }
10862
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010863 return (PyObject*) pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010864}
10865
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010866PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010867 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010868\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010869Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010870
10871static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010872unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010873{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010874 return fixup(self, fixlower);
10875}
10876
/* Selectors for the strip family.  Their values double as indexes into
   stripformat[] below, so the two must stay in the same order. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Per-selector format strings (presumably handed to the argument
   parser by the strip methods), indexed by the selectors above. */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for selector i: the format string with its 3-character
   "|O:" prefix skipped. */
#define STRIPNAME(i) (stripformat[(i)] + 3)
10885
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010886/* externally visible for str.strip(unicode) */
10887PyObject *
10888_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
10889{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010890 void *data;
10891 int kind;
10892 Py_ssize_t i, j, len;
10893 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010894
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010895 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
10896 return NULL;
10897
10898 kind = PyUnicode_KIND(self);
10899 data = PyUnicode_DATA(self);
10900 len = PyUnicode_GET_LENGTH(self);
10901 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
10902 PyUnicode_DATA(sepobj),
10903 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010904
Benjamin Peterson14339b62009-01-31 16:36:08 +000010905 i = 0;
10906 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010907 while (i < len &&
10908 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010909 i++;
10910 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010911 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010912
Benjamin Peterson14339b62009-01-31 16:36:08 +000010913 j = len;
10914 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010915 do {
10916 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010917 } while (j >= i &&
10918 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000010919 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010920 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010921
Victor Stinner12bab6d2011-10-01 01:53:49 +020010922 return PyUnicode_Substring((PyObject*)self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010923}
10924
10925PyObject*
10926PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
10927{
10928 unsigned char *data;
10929 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020010930 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010931
Victor Stinnerde636f32011-10-01 03:55:54 +020010932 if (PyUnicode_READY(self) == -1)
10933 return NULL;
10934
10935 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
10936
Victor Stinner12bab6d2011-10-01 01:53:49 +020010937 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010938 {
Victor Stinner12bab6d2011-10-01 01:53:49 +020010939 if (PyUnicode_CheckExact(self)) {
10940 Py_INCREF(self);
10941 return self;
10942 }
10943 else
10944 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010945 }
10946
Victor Stinner12bab6d2011-10-01 01:53:49 +020010947 length = end - start;
10948 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010949 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010950
Victor Stinnerde636f32011-10-01 03:55:54 +020010951 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020010952 PyErr_SetString(PyExc_IndexError, "string index out of range");
10953 return NULL;
10954 }
10955
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010956 kind = PyUnicode_KIND(self);
10957 data = PyUnicode_1BYTE_DATA(self);
Victor Stinner034f6cf2011-09-30 02:26:44 +020010958 return PyUnicode_FromKindAndData(kind,
10959 data + PyUnicode_KIND_SIZE(kind, start),
Victor Stinner12bab6d2011-10-01 01:53:49 +020010960 length);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010961}
Guido van Rossumd57fd912000-03-10 22:53:23 +000010962
10963static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010964do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010965{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010966 int kind;
10967 void *data;
10968 Py_ssize_t len, i, j;
10969
10970 if (PyUnicode_READY(self) == -1)
10971 return NULL;
10972
10973 kind = PyUnicode_KIND(self);
10974 data = PyUnicode_DATA(self);
10975 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010976
Benjamin Peterson14339b62009-01-31 16:36:08 +000010977 i = 0;
10978 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010979 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010980 i++;
10981 }
10982 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010983
Benjamin Peterson14339b62009-01-31 16:36:08 +000010984 j = len;
10985 if (striptype != LEFTSTRIP) {
10986 do {
10987 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010988 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000010989 j++;
10990 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010991
Victor Stinner12bab6d2011-10-01 01:53:49 +020010992 return PyUnicode_Substring((PyObject*)self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010993}
10994
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010995
10996static PyObject *
10997do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
10998{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010999 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011000
Benjamin Peterson14339b62009-01-31 16:36:08 +000011001 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11002 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011003
Benjamin Peterson14339b62009-01-31 16:36:08 +000011004 if (sep != NULL && sep != Py_None) {
11005 if (PyUnicode_Check(sep))
11006 return _PyUnicode_XStrip(self, striptype, sep);
11007 else {
11008 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011009 "%s arg must be None or str",
11010 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011011 return NULL;
11012 }
11013 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011014
Benjamin Peterson14339b62009-01-31 16:36:08 +000011015 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011016}
11017
11018
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011019PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011020 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011021\n\
11022Return a copy of the string S with leading and trailing\n\
11023whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011024If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011025
11026static PyObject *
11027unicode_strip(PyUnicodeObject *self, PyObject *args)
11028{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011029 if (PyTuple_GET_SIZE(args) == 0)
11030 return do_strip(self, BOTHSTRIP); /* Common case */
11031 else
11032 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011033}
11034
11035
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011036PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011037 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011038\n\
11039Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011040If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011041
11042static PyObject *
11043unicode_lstrip(PyUnicodeObject *self, PyObject *args)
11044{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011045 if (PyTuple_GET_SIZE(args) == 0)
11046 return do_strip(self, LEFTSTRIP); /* Common case */
11047 else
11048 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011049}
11050
11051
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011052PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011053 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011054\n\
11055Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011056If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011057
11058static PyObject *
11059unicode_rstrip(PyUnicodeObject *self, PyObject *args)
11060{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011061 if (PyTuple_GET_SIZE(args) == 0)
11062 return do_strip(self, RIGHTSTRIP); /* Common case */
11063 else
11064 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011065}
11066
11067
Guido van Rossumd57fd912000-03-10 22:53:23 +000011068static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +000011069unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011070{
11071 PyUnicodeObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011072 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011073
Georg Brandl222de0f2009-04-12 12:01:50 +000011074 if (len < 1) {
11075 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020011076 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000011077 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011078
Tim Peters7a29bd52001-09-12 03:03:31 +000011079 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011080 /* no repeat, return original string */
11081 Py_INCREF(str);
11082 return (PyObject*) str;
11083 }
Tim Peters8f422462000-09-09 06:13:41 +000011084
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011085 if (PyUnicode_READY(str) == -1)
11086 return NULL;
11087
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011088 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011089 PyErr_SetString(PyExc_OverflowError,
11090 "repeated string is too long");
11091 return NULL;
11092 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011093 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011094
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011095 u = (PyUnicodeObject *)PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011096 if (!u)
11097 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011098 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011099
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011100 if (PyUnicode_GET_LENGTH(str) == 1) {
11101 const int kind = PyUnicode_KIND(str);
11102 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
11103 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011104 if (kind == PyUnicode_1BYTE_KIND)
11105 memset(to, (unsigned char)fill_char, len);
11106 else {
11107 for (n = 0; n < len; ++n)
11108 PyUnicode_WRITE(kind, to, n, fill_char);
11109 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011110 }
11111 else {
11112 /* number of characters copied this far */
11113 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
11114 const Py_ssize_t char_size = PyUnicode_CHARACTER_SIZE(str);
11115 char *to = (char *) PyUnicode_DATA(u);
11116 Py_MEMCPY(to, PyUnicode_DATA(str),
11117 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011118 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011119 n = (done <= nchars-done) ? done : nchars-done;
11120 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011121 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011122 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011123 }
11124
11125 return (PyObject*) u;
11126}
11127
Alexander Belopolsky40018472011-02-26 01:02:56 +000011128PyObject *
11129PyUnicode_Replace(PyObject *obj,
11130 PyObject *subobj,
11131 PyObject *replobj,
11132 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011133{
11134 PyObject *self;
11135 PyObject *str1;
11136 PyObject *str2;
11137 PyObject *result;
11138
11139 self = PyUnicode_FromObject(obj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011140 if (self == NULL || PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011141 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011142 str1 = PyUnicode_FromObject(subobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011143 if (str1 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011144 Py_DECREF(self);
11145 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011146 }
11147 str2 = PyUnicode_FromObject(replobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011148 if (str2 == NULL || PyUnicode_READY(str2)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011149 Py_DECREF(self);
11150 Py_DECREF(str1);
11151 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011152 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011153 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011154 Py_DECREF(self);
11155 Py_DECREF(str1);
11156 Py_DECREF(str2);
11157 return result;
11158}
11159
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011160PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011161 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011162\n\
11163Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011164old replaced by new. If the optional argument count is\n\
11165given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011166
11167static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011168unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011169{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011170 PyObject *str1;
11171 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011172 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011173 PyObject *result;
11174
Martin v. Löwis18e16552006-02-15 17:27:45 +000011175 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011176 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011177 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011178 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011179 str1 = PyUnicode_FromObject(str1);
11180 if (str1 == NULL || PyUnicode_READY(str1) == -1)
11181 return NULL;
11182 str2 = PyUnicode_FromObject(str2);
Victor Stinnere9a29352011-10-01 02:14:59 +020011183 if (str2 == NULL || PyUnicode_READY(str2) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011184 Py_DECREF(str1);
11185 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011186 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011187
11188 result = replace(self, str1, str2, maxcount);
11189
11190 Py_DECREF(str1);
11191 Py_DECREF(str2);
11192 return result;
11193}
11194
Alexander Belopolsky40018472011-02-26 01:02:56 +000011195static PyObject *
11196unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011197{
Walter Dörwald79e913e2007-05-12 11:08:06 +000011198 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011199 Py_ssize_t isize;
11200 Py_ssize_t osize, squote, dquote, i, o;
11201 Py_UCS4 max, quote;
11202 int ikind, okind;
11203 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000011204
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011205 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000011206 return NULL;
11207
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011208 isize = PyUnicode_GET_LENGTH(unicode);
11209 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011210
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011211 /* Compute length of output, quote characters, and
11212 maximum character */
11213 osize = 2; /* quotes */
11214 max = 127;
11215 squote = dquote = 0;
11216 ikind = PyUnicode_KIND(unicode);
11217 for (i = 0; i < isize; i++) {
11218 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11219 switch (ch) {
11220 case '\'': squote++; osize++; break;
11221 case '"': dquote++; osize++; break;
11222 case '\\': case '\t': case '\r': case '\n':
11223 osize += 2; break;
11224 default:
11225 /* Fast-path ASCII */
11226 if (ch < ' ' || ch == 0x7f)
11227 osize += 4; /* \xHH */
11228 else if (ch < 0x7f)
11229 osize++;
11230 else if (Py_UNICODE_ISPRINTABLE(ch)) {
11231 osize++;
11232 max = ch > max ? ch : max;
11233 }
11234 else if (ch < 0x100)
11235 osize += 4; /* \xHH */
11236 else if (ch < 0x10000)
11237 osize += 6; /* \uHHHH */
11238 else
11239 osize += 10; /* \uHHHHHHHH */
11240 }
11241 }
11242
11243 quote = '\'';
11244 if (squote) {
11245 if (dquote)
11246 /* Both squote and dquote present. Use squote,
11247 and escape them */
11248 osize += squote;
11249 else
11250 quote = '"';
11251 }
11252
11253 repr = PyUnicode_New(osize, max);
11254 if (repr == NULL)
11255 return NULL;
11256 okind = PyUnicode_KIND(repr);
11257 odata = PyUnicode_DATA(repr);
11258
11259 PyUnicode_WRITE(okind, odata, 0, quote);
11260 PyUnicode_WRITE(okind, odata, osize-1, quote);
11261
11262 for (i = 0, o = 1; i < isize; i++) {
11263 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011264
11265 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011266 if ((ch == quote) || (ch == '\\')) {
11267 PyUnicode_WRITE(okind, odata, o++, '\\');
11268 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011269 continue;
11270 }
11271
Benjamin Peterson29060642009-01-31 22:14:21 +000011272 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011273 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011274 PyUnicode_WRITE(okind, odata, o++, '\\');
11275 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011276 }
11277 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011278 PyUnicode_WRITE(okind, odata, o++, '\\');
11279 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011280 }
11281 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011282 PyUnicode_WRITE(okind, odata, o++, '\\');
11283 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011284 }
11285
11286 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011287 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011288 PyUnicode_WRITE(okind, odata, o++, '\\');
11289 PyUnicode_WRITE(okind, odata, o++, 'x');
11290 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
11291 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011292 }
11293
Georg Brandl559e5d72008-06-11 18:37:52 +000011294 /* Copy ASCII characters as-is */
11295 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011296 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011297 }
11298
Benjamin Peterson29060642009-01-31 22:14:21 +000011299 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000011300 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011301 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000011302 (categories Z* and C* except ASCII space)
11303 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011304 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011305 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011306 if (ch <= 0xff) {
11307 PyUnicode_WRITE(okind, odata, o++, '\\');
11308 PyUnicode_WRITE(okind, odata, o++, 'x');
11309 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
11310 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011311 }
11312 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011313 else if (ch >= 0x10000) {
11314 PyUnicode_WRITE(okind, odata, o++, '\\');
11315 PyUnicode_WRITE(okind, odata, o++, 'U');
11316 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 28) & 0xF]);
11317 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 24) & 0xF]);
11318 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 20) & 0xF]);
11319 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 16) & 0xF]);
11320 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
11321 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
11322 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
11323 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011324 }
11325 /* Map 16-bit characters to '\uxxxx' */
11326 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011327 PyUnicode_WRITE(okind, odata, o++, '\\');
11328 PyUnicode_WRITE(okind, odata, o++, 'u');
11329 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
11330 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
11331 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
11332 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011333 }
11334 }
11335 /* Copy characters as-is */
11336 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011337 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011338 }
11339 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000011340 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011341 /* Closing quote already added at the beginning */
Walter Dörwald79e913e2007-05-12 11:08:06 +000011342 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011343}
11344
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011345PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011346 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011347\n\
11348Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011349such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011350arguments start and end are interpreted as in slice notation.\n\
11351\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011352Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011353
11354static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011355unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011356{
Jesus Ceaac451502011-04-20 17:09:23 +020011357 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011358 Py_ssize_t start;
11359 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011360 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011361
Jesus Ceaac451502011-04-20 17:09:23 +020011362 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
11363 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011364 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011365
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011366 if (PyUnicode_READY(self) == -1)
11367 return NULL;
11368 if (PyUnicode_READY(substring) == -1)
11369 return NULL;
11370
11371 result = any_find_slice(
11372 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
11373 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011374 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011375
11376 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011377
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011378 if (result == -2)
11379 return NULL;
11380
Christian Heimes217cfd12007-12-02 14:31:20 +000011381 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011382}
11383
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011384PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011385 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011386\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011387Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011388
11389static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011390unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011391{
Jesus Ceaac451502011-04-20 17:09:23 +020011392 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011393 Py_ssize_t start;
11394 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011395 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011396
Jesus Ceaac451502011-04-20 17:09:23 +020011397 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
11398 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011399 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011400
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011401 if (PyUnicode_READY(self) == -1)
11402 return NULL;
11403 if (PyUnicode_READY(substring) == -1)
11404 return NULL;
11405
11406 result = any_find_slice(
11407 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
11408 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011409 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011410
11411 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011412
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011413 if (result == -2)
11414 return NULL;
11415
Guido van Rossumd57fd912000-03-10 22:53:23 +000011416 if (result < 0) {
11417 PyErr_SetString(PyExc_ValueError, "substring not found");
11418 return NULL;
11419 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011420
Christian Heimes217cfd12007-12-02 14:31:20 +000011421 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011422}
11423
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011424PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011425 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011426\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011427Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011428done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011429
11430static PyObject *
11431unicode_rjust(PyUnicodeObject *self, PyObject *args)
11432{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011433 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011434 Py_UCS4 fillchar = ' ';
11435
Victor Stinnere9a29352011-10-01 02:14:59 +020011436 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011437 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011438
Victor Stinnere9a29352011-10-01 02:14:59 +020011439 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011440 return NULL;
11441
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011442 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011443 Py_INCREF(self);
11444 return (PyObject*) self;
11445 }
11446
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011447 return (PyObject*) pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011448}
11449
Alexander Belopolsky40018472011-02-26 01:02:56 +000011450PyObject *
11451PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011452{
11453 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000011454
Guido van Rossumd57fd912000-03-10 22:53:23 +000011455 s = PyUnicode_FromObject(s);
11456 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011457 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011458 if (sep != NULL) {
11459 sep = PyUnicode_FromObject(sep);
11460 if (sep == NULL) {
11461 Py_DECREF(s);
11462 return NULL;
11463 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011464 }
11465
11466 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
11467
11468 Py_DECREF(s);
11469 Py_XDECREF(sep);
11470 return result;
11471}
11472
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011473PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011474 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011475\n\
11476Return a list of the words in S, using sep as the\n\
11477delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000011478splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000011479whitespace string is a separator and empty strings are\n\
11480removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011481
11482static PyObject*
11483unicode_split(PyUnicodeObject *self, PyObject *args)
11484{
11485 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011486 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011487
Martin v. Löwis18e16552006-02-15 17:27:45 +000011488 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011489 return NULL;
11490
11491 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011492 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011493 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000011494 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011495 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011496 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011497}
11498
Thomas Wouters477c8d52006-05-27 19:21:47 +000011499PyObject *
11500PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
11501{
11502 PyObject* str_obj;
11503 PyObject* sep_obj;
11504 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011505 int kind1, kind2, kind;
11506 void *buf1 = NULL, *buf2 = NULL;
11507 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011508
11509 str_obj = PyUnicode_FromObject(str_in);
Victor Stinnere9a29352011-10-01 02:14:59 +020011510 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011511 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011512 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011513 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000011514 Py_DECREF(str_obj);
11515 return NULL;
11516 }
11517
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011518 kind1 = PyUnicode_KIND(str_in);
11519 kind2 = PyUnicode_KIND(sep_obj);
11520 kind = kind1 > kind2 ? kind1 : kind2;
11521 buf1 = PyUnicode_DATA(str_in);
11522 if (kind1 != kind)
11523 buf1 = _PyUnicode_AsKind(str_in, kind);
11524 if (!buf1)
11525 goto onError;
11526 buf2 = PyUnicode_DATA(sep_obj);
11527 if (kind2 != kind)
11528 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11529 if (!buf2)
11530 goto onError;
11531 len1 = PyUnicode_GET_LENGTH(str_obj);
11532 len2 = PyUnicode_GET_LENGTH(sep_obj);
11533
11534 switch(PyUnicode_KIND(str_in)) {
11535 case PyUnicode_1BYTE_KIND:
11536 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11537 break;
11538 case PyUnicode_2BYTE_KIND:
11539 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11540 break;
11541 case PyUnicode_4BYTE_KIND:
11542 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11543 break;
11544 default:
11545 assert(0);
11546 out = 0;
11547 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011548
11549 Py_DECREF(sep_obj);
11550 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011551 if (kind1 != kind)
11552 PyMem_Free(buf1);
11553 if (kind2 != kind)
11554 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011555
11556 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011557 onError:
11558 Py_DECREF(sep_obj);
11559 Py_DECREF(str_obj);
11560 if (kind1 != kind && buf1)
11561 PyMem_Free(buf1);
11562 if (kind2 != kind && buf2)
11563 PyMem_Free(buf2);
11564 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011565}
11566
11567
11568PyObject *
11569PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
11570{
11571 PyObject* str_obj;
11572 PyObject* sep_obj;
11573 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011574 int kind1, kind2, kind;
11575 void *buf1 = NULL, *buf2 = NULL;
11576 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011577
11578 str_obj = PyUnicode_FromObject(str_in);
11579 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000011580 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011581 sep_obj = PyUnicode_FromObject(sep_in);
11582 if (!sep_obj) {
11583 Py_DECREF(str_obj);
11584 return NULL;
11585 }
11586
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011587 kind1 = PyUnicode_KIND(str_in);
11588 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020011589 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011590 buf1 = PyUnicode_DATA(str_in);
11591 if (kind1 != kind)
11592 buf1 = _PyUnicode_AsKind(str_in, kind);
11593 if (!buf1)
11594 goto onError;
11595 buf2 = PyUnicode_DATA(sep_obj);
11596 if (kind2 != kind)
11597 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11598 if (!buf2)
11599 goto onError;
11600 len1 = PyUnicode_GET_LENGTH(str_obj);
11601 len2 = PyUnicode_GET_LENGTH(sep_obj);
11602
11603 switch(PyUnicode_KIND(str_in)) {
11604 case PyUnicode_1BYTE_KIND:
11605 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11606 break;
11607 case PyUnicode_2BYTE_KIND:
11608 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11609 break;
11610 case PyUnicode_4BYTE_KIND:
11611 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11612 break;
11613 default:
11614 assert(0);
11615 out = 0;
11616 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011617
11618 Py_DECREF(sep_obj);
11619 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011620 if (kind1 != kind)
11621 PyMem_Free(buf1);
11622 if (kind2 != kind)
11623 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011624
11625 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011626 onError:
11627 Py_DECREF(sep_obj);
11628 Py_DECREF(str_obj);
11629 if (kind1 != kind && buf1)
11630 PyMem_Free(buf1);
11631 if (kind2 != kind && buf2)
11632 PyMem_Free(buf2);
11633 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011634}
11635
11636PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011637 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011638\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011639Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011640the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011641found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011642
11643static PyObject*
11644unicode_partition(PyUnicodeObject *self, PyObject *separator)
11645{
11646 return PyUnicode_Partition((PyObject *)self, separator);
11647}
11648
11649PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000011650 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011651\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011652Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011653the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011654separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011655
11656static PyObject*
11657unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
11658{
11659 return PyUnicode_RPartition((PyObject *)self, separator);
11660}
11661
Alexander Belopolsky40018472011-02-26 01:02:56 +000011662PyObject *
11663PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011664{
11665 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011666
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011667 s = PyUnicode_FromObject(s);
11668 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011669 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011670 if (sep != NULL) {
11671 sep = PyUnicode_FromObject(sep);
11672 if (sep == NULL) {
11673 Py_DECREF(s);
11674 return NULL;
11675 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011676 }
11677
11678 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
11679
11680 Py_DECREF(s);
11681 Py_XDECREF(sep);
11682 return result;
11683}
11684
11685PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011686 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011687\n\
11688Return a list of the words in S, using sep as the\n\
11689delimiter string, starting at the end of the string and\n\
11690working to the front. If maxsplit is given, at most maxsplit\n\
11691splits are done. If sep is not specified, any whitespace string\n\
11692is a separator.");
11693
11694static PyObject*
11695unicode_rsplit(PyUnicodeObject *self, PyObject *args)
11696{
11697 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011698 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011699
Martin v. Löwis18e16552006-02-15 17:27:45 +000011700 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011701 return NULL;
11702
11703 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011704 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011705 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000011706 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011707 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011708 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011709}
11710
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011711PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011712 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011713\n\
11714Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000011715Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011716is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011717
11718static PyObject*
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011719unicode_splitlines(PyUnicodeObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011720{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011721 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000011722 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011723
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011724 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
11725 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011726 return NULL;
11727
Guido van Rossum86662912000-04-11 15:38:46 +000011728 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011729}
11730
11731static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000011732PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011733{
Walter Dörwald346737f2007-05-31 10:44:43 +000011734 if (PyUnicode_CheckExact(self)) {
11735 Py_INCREF(self);
11736 return self;
11737 } else
11738 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinner034f6cf2011-09-30 02:26:44 +020011739 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011740}
11741
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011742PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011743 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011744\n\
11745Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011746and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011747
11748static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011749unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011750{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011751 return fixup(self, fixswapcase);
11752}
11753
Georg Brandlceee0772007-11-27 23:48:05 +000011754PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011755 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011756\n\
11757Return a translation table usable for str.translate().\n\
11758If there is only one argument, it must be a dictionary mapping Unicode\n\
11759ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011760Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011761If there are two arguments, they must be strings of equal length, and\n\
11762in the resulting dictionary, each character in x will be mapped to the\n\
11763character at the same position in y. If there is a third argument, it\n\
11764must be a string, whose characters will be mapped to None in the result.");
11765
11766static PyObject*
11767unicode_maketrans(PyUnicodeObject *null, PyObject *args)
11768{
11769 PyObject *x, *y = NULL, *z = NULL;
11770 PyObject *new = NULL, *key, *value;
11771 Py_ssize_t i = 0;
11772 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011773
Georg Brandlceee0772007-11-27 23:48:05 +000011774 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
11775 return NULL;
11776 new = PyDict_New();
11777 if (!new)
11778 return NULL;
11779 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011780 int x_kind, y_kind, z_kind;
11781 void *x_data, *y_data, *z_data;
11782
Georg Brandlceee0772007-11-27 23:48:05 +000011783 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000011784 if (!PyUnicode_Check(x)) {
11785 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
11786 "be a string if there is a second argument");
11787 goto err;
11788 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011789 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011790 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
11791 "arguments must have equal length");
11792 goto err;
11793 }
11794 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011795 x_kind = PyUnicode_KIND(x);
11796 y_kind = PyUnicode_KIND(y);
11797 x_data = PyUnicode_DATA(x);
11798 y_data = PyUnicode_DATA(y);
11799 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
11800 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
11801 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011802 if (!key || !value)
11803 goto err;
11804 res = PyDict_SetItem(new, key, value);
11805 Py_DECREF(key);
11806 Py_DECREF(value);
11807 if (res < 0)
11808 goto err;
11809 }
11810 /* create entries for deleting chars in z */
11811 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011812 z_kind = PyUnicode_KIND(z);
11813 z_data = PyUnicode_DATA(z);
Georg Brandlceee0772007-11-27 23:48:05 +000011814 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011815 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011816 if (!key)
11817 goto err;
11818 res = PyDict_SetItem(new, key, Py_None);
11819 Py_DECREF(key);
11820 if (res < 0)
11821 goto err;
11822 }
11823 }
11824 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011825 int kind;
11826 void *data;
11827
Georg Brandlceee0772007-11-27 23:48:05 +000011828 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000011829 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011830 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
11831 "to maketrans it must be a dict");
11832 goto err;
11833 }
11834 /* copy entries into the new dict, converting string keys to int keys */
11835 while (PyDict_Next(x, &i, &key, &value)) {
11836 if (PyUnicode_Check(key)) {
11837 /* convert string keys to integer keys */
11838 PyObject *newkey;
11839 if (PyUnicode_GET_SIZE(key) != 1) {
11840 PyErr_SetString(PyExc_ValueError, "string keys in translate "
11841 "table must be of length 1");
11842 goto err;
11843 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011844 kind = PyUnicode_KIND(key);
11845 data = PyUnicode_DATA(key);
11846 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000011847 if (!newkey)
11848 goto err;
11849 res = PyDict_SetItem(new, newkey, value);
11850 Py_DECREF(newkey);
11851 if (res < 0)
11852 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000011853 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011854 /* just keep integer keys */
11855 if (PyDict_SetItem(new, key, value) < 0)
11856 goto err;
11857 } else {
11858 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
11859 "be strings or integers");
11860 goto err;
11861 }
11862 }
11863 }
11864 return new;
11865 err:
11866 Py_DECREF(new);
11867 return NULL;
11868}
11869
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011870PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011871 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011872\n\
11873Return a copy of the string S, where all characters have been mapped\n\
11874through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011875Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000011876Unmapped characters are left untouched. Characters mapped to None\n\
11877are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011878
11879static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011880unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011881{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011882 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011883}
11884
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011885PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011886 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011887\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011888Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011889
11890static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011891unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011892{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011893 return fixup(self, fixupper);
11894}
11895
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011896PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011897 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011898\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000011899Pad a numeric string S with zeros on the left, to fill a field\n\
11900of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011901
11902static PyObject *
11903unicode_zfill(PyUnicodeObject *self, PyObject *args)
11904{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011905 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011906 PyUnicodeObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011907 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011908 int kind;
11909 void *data;
11910 Py_UCS4 chr;
11911
11912 if (PyUnicode_READY(self) == -1)
11913 return NULL;
11914
Martin v. Löwis18e16552006-02-15 17:27:45 +000011915 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011916 return NULL;
11917
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011918 if (PyUnicode_GET_LENGTH(self) >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +000011919 if (PyUnicode_CheckExact(self)) {
11920 Py_INCREF(self);
11921 return (PyObject*) self;
11922 }
11923 else
Victor Stinner2219e0a2011-10-01 01:16:59 +020011924 return PyUnicode_Copy((PyObject*)self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011925 }
11926
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011927 fill = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011928
11929 u = pad(self, fill, 0, '0');
11930
Walter Dörwald068325e2002-04-15 13:36:47 +000011931 if (u == NULL)
11932 return NULL;
11933
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011934 kind = PyUnicode_KIND(u);
11935 data = PyUnicode_DATA(u);
11936 chr = PyUnicode_READ(kind, data, fill);
11937
11938 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011939 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011940 PyUnicode_WRITE(kind, data, 0, chr);
11941 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000011942 }
11943
11944 return (PyObject*) u;
11945}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011946
#if 0
/* Compiled out.  Wrapper that forwards self to
   PyUnicode_TransformDecimalAndSpaceToASCII() and returns its result. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
11954
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011955PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011956 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011957\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000011958Return True if S starts with the specified prefix, False otherwise.\n\
11959With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011960With optional end, stop comparing S at that position.\n\
11961prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011962
11963static PyObject *
11964unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000011965 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011966{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011967 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011968 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011969 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011970 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011971 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011972
Jesus Ceaac451502011-04-20 17:09:23 +020011973 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011974 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011975 if (PyTuple_Check(subobj)) {
11976 Py_ssize_t i;
11977 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
11978 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000011979 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011980 if (substring == NULL)
11981 return NULL;
11982 result = tailmatch(self, substring, start, end, -1);
11983 Py_DECREF(substring);
11984 if (result) {
11985 Py_RETURN_TRUE;
11986 }
11987 }
11988 /* nothing matched */
11989 Py_RETURN_FALSE;
11990 }
11991 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030011992 if (substring == NULL) {
11993 if (PyErr_ExceptionMatches(PyExc_TypeError))
11994 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
11995 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000011996 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030011997 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011998 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011999 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012000 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012001}
12002
12003
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012004PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012005 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012006\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012007Return True if S ends with the specified suffix, False otherwise.\n\
12008With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012009With optional end, stop comparing S at that position.\n\
12010suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012011
12012static PyObject *
12013unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012014 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012015{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012016 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012017 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012018 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012019 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012020 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012021
Jesus Ceaac451502011-04-20 17:09:23 +020012022 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012023 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012024 if (PyTuple_Check(subobj)) {
12025 Py_ssize_t i;
12026 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12027 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012028 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012029 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012030 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012031 result = tailmatch(self, substring, start, end, +1);
12032 Py_DECREF(substring);
12033 if (result) {
12034 Py_RETURN_TRUE;
12035 }
12036 }
12037 Py_RETURN_FALSE;
12038 }
12039 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012040 if (substring == NULL) {
12041 if (PyErr_ExceptionMatches(PyExc_TypeError))
12042 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12043 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012044 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012045 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012046 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012047 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012048 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012049}
12050
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012051#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000012052
12053PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012054 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012055\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012056Return a formatted version of S, using substitutions from args and kwargs.\n\
12057The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000012058
Eric Smith27bbca62010-11-04 17:06:58 +000012059PyDoc_STRVAR(format_map__doc__,
12060 "S.format_map(mapping) -> str\n\
12061\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012062Return a formatted version of S, using substitutions from mapping.\n\
12063The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000012064
Eric Smith4a7d76d2008-05-30 18:10:19 +000012065static PyObject *
12066unicode__format__(PyObject* self, PyObject* args)
12067{
12068 PyObject *format_spec;
12069
12070 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12071 return NULL;
12072
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012073 return _PyUnicode_FormatAdvanced(self, format_spec, 0,
12074 PyUnicode_GET_LENGTH(format_spec));
Eric Smith4a7d76d2008-05-30 18:10:19 +000012075}
12076
Eric Smith8c663262007-08-25 02:26:07 +000012077PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012078 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012079\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012080Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000012081
12082static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012083unicode__sizeof__(PyUnicodeObject *v)
12084{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012085 Py_ssize_t size;
12086
12087 /* If it's a compact object, account for base structure +
12088 character data. */
12089 if (PyUnicode_IS_COMPACT_ASCII(v))
12090 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12091 else if (PyUnicode_IS_COMPACT(v))
12092 size = sizeof(PyCompactUnicodeObject) +
12093 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_CHARACTER_SIZE(v);
12094 else {
12095 /* If it is a two-block object, account for base object, and
12096 for character block if present. */
12097 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020012098 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012099 size += (PyUnicode_GET_LENGTH(v) + 1) *
12100 PyUnicode_CHARACTER_SIZE(v);
12101 }
12102 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020012103 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020012104 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012105 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020012106 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020012107 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012108
12109 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012110}
12111
12112PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012113 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012114
12115static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020012116unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012117{
Victor Stinner034f6cf2011-09-30 02:26:44 +020012118 PyObject *copy = PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012119 if (!copy)
12120 return NULL;
12121 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012122}
12123
Guido van Rossumd57fd912000-03-10 22:53:23 +000012124static PyMethodDef unicode_methods[] = {
12125
12126 /* Order is according to common usage: often used methods should
12127 appear first, since lookup is done sequentially. */
12128
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000012129 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012130 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
12131 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012132 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012133 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
12134 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
12135 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
12136 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
12137 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
12138 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
12139 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012140 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012141 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
12142 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
12143 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012144 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012145 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
12146 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
12147 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012148 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012149 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012150 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012151 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012152 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
12153 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
12154 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
12155 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
12156 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
12157 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
12158 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
12159 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
12160 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
12161 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
12162 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
12163 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
12164 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
12165 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000012166 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000012167 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012168 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000012169 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000012170 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000012171 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000012172 {"maketrans", (PyCFunction) unicode_maketrans,
12173 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012174 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000012175#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012176 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012177#endif
12178
12179#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012180 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012181 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012182#endif
12183
Benjamin Peterson14339b62009-01-31 16:36:08 +000012184 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012185 {NULL, NULL}
12186};
12187
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012188static PyObject *
12189unicode_mod(PyObject *v, PyObject *w)
12190{
Brian Curtindfc80e32011-08-10 20:28:54 -050012191 if (!PyUnicode_Check(v))
12192 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000012193 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012194}
12195
12196static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012197 0, /*nb_add*/
12198 0, /*nb_subtract*/
12199 0, /*nb_multiply*/
12200 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012201};
12202
Guido van Rossumd57fd912000-03-10 22:53:23 +000012203static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012204 (lenfunc) unicode_length, /* sq_length */
12205 PyUnicode_Concat, /* sq_concat */
12206 (ssizeargfunc) unicode_repeat, /* sq_repeat */
12207 (ssizeargfunc) unicode_getitem, /* sq_item */
12208 0, /* sq_slice */
12209 0, /* sq_ass_item */
12210 0, /* sq_ass_slice */
12211 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012212};
12213
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012214static PyObject*
12215unicode_subscript(PyUnicodeObject* self, PyObject* item)
12216{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012217 if (PyUnicode_READY(self) == -1)
12218 return NULL;
12219
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000012220 if (PyIndex_Check(item)) {
12221 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012222 if (i == -1 && PyErr_Occurred())
12223 return NULL;
12224 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012225 i += PyUnicode_GET_LENGTH(self);
Victor Stinner2fe5ced2011-10-02 00:25:40 +020012226 return unicode_getitem((PyObject*)self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012227 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000012228 Py_ssize_t start, stop, step, slicelength, cur, i;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012229 const Py_UNICODE* source_buf;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012230 Py_UNICODE* result_buf;
12231 PyObject* result;
12232
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012233 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000012234 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012235 return NULL;
12236 }
12237
12238 if (slicelength <= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012239 return PyUnicode_New(0, 0);
12240 } else if (start == 0 && step == 1 &&
12241 slicelength == PyUnicode_GET_LENGTH(self) &&
Thomas Woutersed03b412007-08-28 21:37:11 +000012242 PyUnicode_CheckExact(self)) {
12243 Py_INCREF(self);
12244 return (PyObject *)self;
12245 } else if (step == 1) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012246 return PyUnicode_Substring((PyObject*)self,
12247 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012248 } else {
12249 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +000012250 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
12251 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012252
Benjamin Peterson29060642009-01-31 22:14:21 +000012253 if (result_buf == NULL)
12254 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012255
12256 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
12257 result_buf[i] = source_buf[cur];
12258 }
Tim Petersced69f82003-09-16 20:30:58 +000012259
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012260 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +000012261 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012262 return result;
12263 }
12264 } else {
12265 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
12266 return NULL;
12267 }
12268}
12269
12270static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012271 (lenfunc)unicode_length, /* mp_length */
12272 (binaryfunc)unicode_subscript, /* mp_subscript */
12273 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012274};
12275
Guido van Rossumd57fd912000-03-10 22:53:23 +000012276
Guido van Rossumd57fd912000-03-10 22:53:23 +000012277/* Helpers for PyUnicode_Format() */
12278
12279static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000012280getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012281{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012282 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012283 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012284 (*p_argidx)++;
12285 if (arglen < 0)
12286 return args;
12287 else
12288 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012289 }
12290 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012291 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012292 return NULL;
12293}
12294
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012295/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012296
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012297static PyObject *
12298formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012299{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012300 char *p;
12301 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012302 double x;
Tim Petersced69f82003-09-16 20:30:58 +000012303
Guido van Rossumd57fd912000-03-10 22:53:23 +000012304 x = PyFloat_AsDouble(v);
12305 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012306 return NULL;
12307
Guido van Rossumd57fd912000-03-10 22:53:23 +000012308 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012309 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000012310
Eric Smith0923d1d2009-04-16 20:16:10 +000012311 p = PyOS_double_to_string(x, type, prec,
12312 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012313 if (p == NULL)
12314 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012315 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000012316 PyMem_Free(p);
12317 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012318}
12319
Tim Peters38fd5b62000-09-21 05:43:11 +000012320static PyObject*
12321formatlong(PyObject *val, int flags, int prec, int type)
12322{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012323 char *buf;
12324 int len;
12325 PyObject *str; /* temporary string object. */
12326 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000012327
Benjamin Peterson14339b62009-01-31 16:36:08 +000012328 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
12329 if (!str)
12330 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012331 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012332 Py_DECREF(str);
12333 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000012334}
12335
Guido van Rossumd57fd912000-03-10 22:53:23 +000012336static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012337formatchar(Py_UCS4 *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012338 size_t buflen,
12339 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012340{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000012341 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012342 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012343 if (PyUnicode_GET_LENGTH(v) == 1) {
12344 buf[0] = PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000012345 buf[1] = '\0';
12346 return 1;
12347 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012348 goto onError;
12349 }
12350 else {
12351 /* Integer input truncated to a character */
12352 long x;
12353 x = PyLong_AsLong(v);
12354 if (x == -1 && PyErr_Occurred())
12355 goto onError;
12356
12357 if (x < 0 || x > 0x10ffff) {
12358 PyErr_SetString(PyExc_OverflowError,
12359 "%c arg not in range(0x110000)");
12360 return -1;
12361 }
12362
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012363 buf[0] = (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012364 buf[1] = '\0';
12365 return 1;
12366 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000012367
Benjamin Peterson29060642009-01-31 22:14:21 +000012368 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012369 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012370 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012371 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012372}
12373
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the number of elements in the scratch buffer that
   PyUnicode_Format() uses for the '%%' and '%c' conversions.  The whole
   expansion is parenthesised so the macro behaves as a single operand in
   any expression (e.g. "sizeof FORMATBUFLEN").
*/
#define FORMATBUFLEN ((size_t)10)
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012378
Alexander Belopolsky40018472011-02-26 01:02:56 +000012379PyObject *
12380PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012381{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012382 void *fmt;
12383 int fmtkind;
12384 PyObject *result;
12385 Py_UCS4 *res, *res0;
12386 Py_UCS4 max;
12387 int kind;
12388 Py_ssize_t fmtcnt, fmtpos, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012389 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012390 PyObject *dict = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012391 PyUnicodeObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +000012392
Guido van Rossumd57fd912000-03-10 22:53:23 +000012393 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012394 PyErr_BadInternalCall();
12395 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012396 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012397 uformat = (PyUnicodeObject*)PyUnicode_FromObject(format);
12398 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012399 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012400 fmt = PyUnicode_DATA(uformat);
12401 fmtkind = PyUnicode_KIND(uformat);
12402 fmtcnt = PyUnicode_GET_LENGTH(uformat);
12403 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012404
12405 reslen = rescnt = fmtcnt + 100;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012406 res = res0 = PyMem_Malloc(reslen * sizeof(Py_UCS4));
12407 if (res0 == NULL) {
12408 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012409 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012410 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012411
12412 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012413 arglen = PyTuple_Size(args);
12414 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012415 }
12416 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012417 arglen = -1;
12418 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012419 }
Christian Heimes90aa7642007-12-19 02:45:37 +000012420 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000012421 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000012422 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012423
12424 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012425 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012426 if (--rescnt < 0) {
12427 rescnt = fmtcnt + 100;
12428 reslen += rescnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012429 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12430 if (res0 == NULL){
12431 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012432 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012433 }
12434 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000012435 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012436 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012437 *res++ = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012438 }
12439 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012440 /* Got a format specifier */
12441 int flags = 0;
12442 Py_ssize_t width = -1;
12443 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012444 Py_UCS4 c = '\0';
12445 Py_UCS4 fill;
Benjamin Peterson29060642009-01-31 22:14:21 +000012446 int isnumok;
12447 PyObject *v = NULL;
12448 PyObject *temp = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012449 void *pbuf;
12450 Py_ssize_t pindex;
Benjamin Peterson29060642009-01-31 22:14:21 +000012451 Py_UNICODE sign;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012452 Py_ssize_t len, len1;
12453 Py_UCS4 formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012454
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012455 fmtpos++;
12456 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
12457 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000012458 Py_ssize_t keylen;
12459 PyObject *key;
12460 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000012461
Benjamin Peterson29060642009-01-31 22:14:21 +000012462 if (dict == NULL) {
12463 PyErr_SetString(PyExc_TypeError,
12464 "format requires a mapping");
12465 goto onError;
12466 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012467 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012468 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012469 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012470 /* Skip over balanced parentheses */
12471 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012472 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000012473 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012474 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000012475 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012476 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000012477 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012478 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000012479 if (fmtcnt < 0 || pcount > 0) {
12480 PyErr_SetString(PyExc_ValueError,
12481 "incomplete format key");
12482 goto onError;
12483 }
Victor Stinner12bab6d2011-10-01 01:53:49 +020012484 key = PyUnicode_Substring((PyObject*)uformat,
12485 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000012486 if (key == NULL)
12487 goto onError;
12488 if (args_owned) {
12489 Py_DECREF(args);
12490 args_owned = 0;
12491 }
12492 args = PyObject_GetItem(dict, key);
12493 Py_DECREF(key);
12494 if (args == NULL) {
12495 goto onError;
12496 }
12497 args_owned = 1;
12498 arglen = -1;
12499 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012500 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012501 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012502 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012503 case '-': flags |= F_LJUST; continue;
12504 case '+': flags |= F_SIGN; continue;
12505 case ' ': flags |= F_BLANK; continue;
12506 case '#': flags |= F_ALT; continue;
12507 case '0': flags |= F_ZERO; continue;
12508 }
12509 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012510 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012511 if (c == '*') {
12512 v = getnextarg(args, arglen, &argidx);
12513 if (v == NULL)
12514 goto onError;
12515 if (!PyLong_Check(v)) {
12516 PyErr_SetString(PyExc_TypeError,
12517 "* wants int");
12518 goto onError;
12519 }
12520 width = PyLong_AsLong(v);
12521 if (width == -1 && PyErr_Occurred())
12522 goto onError;
12523 if (width < 0) {
12524 flags |= F_LJUST;
12525 width = -width;
12526 }
12527 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012528 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012529 }
12530 else if (c >= '0' && c <= '9') {
12531 width = c - '0';
12532 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012533 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012534 if (c < '0' || c > '9')
12535 break;
12536 if ((width*10) / 10 != width) {
12537 PyErr_SetString(PyExc_ValueError,
12538 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000012539 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000012540 }
12541 width = width*10 + (c - '0');
12542 }
12543 }
12544 if (c == '.') {
12545 prec = 0;
12546 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012547 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012548 if (c == '*') {
12549 v = getnextarg(args, arglen, &argidx);
12550 if (v == NULL)
12551 goto onError;
12552 if (!PyLong_Check(v)) {
12553 PyErr_SetString(PyExc_TypeError,
12554 "* wants int");
12555 goto onError;
12556 }
12557 prec = PyLong_AsLong(v);
12558 if (prec == -1 && PyErr_Occurred())
12559 goto onError;
12560 if (prec < 0)
12561 prec = 0;
12562 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012563 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012564 }
12565 else if (c >= '0' && c <= '9') {
12566 prec = c - '0';
12567 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012568 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012569 if (c < '0' || c > '9')
12570 break;
12571 if ((prec*10) / 10 != prec) {
12572 PyErr_SetString(PyExc_ValueError,
12573 "prec too big");
12574 goto onError;
12575 }
12576 prec = prec*10 + (c - '0');
12577 }
12578 }
12579 } /* prec */
12580 if (fmtcnt >= 0) {
12581 if (c == 'h' || c == 'l' || c == 'L') {
12582 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012583 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012584 }
12585 }
12586 if (fmtcnt < 0) {
12587 PyErr_SetString(PyExc_ValueError,
12588 "incomplete format");
12589 goto onError;
12590 }
12591 if (c != '%') {
12592 v = getnextarg(args, arglen, &argidx);
12593 if (v == NULL)
12594 goto onError;
12595 }
12596 sign = 0;
12597 fill = ' ';
12598 switch (c) {
12599
12600 case '%':
12601 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012602 kind = PyUnicode_4BYTE_KIND;
Benjamin Peterson29060642009-01-31 22:14:21 +000012603 /* presume that buffer length is at least 1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012604 PyUnicode_WRITE(kind, pbuf, 0, '%');
Benjamin Peterson29060642009-01-31 22:14:21 +000012605 len = 1;
12606 break;
12607
12608 case 's':
12609 case 'r':
12610 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000012611 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012612 temp = v;
12613 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012614 }
12615 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012616 if (c == 's')
12617 temp = PyObject_Str(v);
12618 else if (c == 'r')
12619 temp = PyObject_Repr(v);
12620 else
12621 temp = PyObject_ASCII(v);
12622 if (temp == NULL)
12623 goto onError;
12624 if (PyUnicode_Check(temp))
12625 /* nothing to do */;
12626 else {
12627 Py_DECREF(temp);
12628 PyErr_SetString(PyExc_TypeError,
12629 "%s argument has non-string str()");
12630 goto onError;
12631 }
12632 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012633 if (PyUnicode_READY(temp) == -1) {
12634 Py_CLEAR(temp);
12635 goto onError;
12636 }
12637 pbuf = PyUnicode_DATA(temp);
12638 kind = PyUnicode_KIND(temp);
12639 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012640 if (prec >= 0 && len > prec)
12641 len = prec;
12642 break;
12643
12644 case 'i':
12645 case 'd':
12646 case 'u':
12647 case 'o':
12648 case 'x':
12649 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000012650 isnumok = 0;
12651 if (PyNumber_Check(v)) {
12652 PyObject *iobj=NULL;
12653
12654 if (PyLong_Check(v)) {
12655 iobj = v;
12656 Py_INCREF(iobj);
12657 }
12658 else {
12659 iobj = PyNumber_Long(v);
12660 }
12661 if (iobj!=NULL) {
12662 if (PyLong_Check(iobj)) {
12663 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070012664 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000012665 Py_DECREF(iobj);
12666 if (!temp)
12667 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012668 if (PyUnicode_READY(temp) == -1) {
12669 Py_CLEAR(temp);
12670 goto onError;
12671 }
12672 pbuf = PyUnicode_DATA(temp);
12673 kind = PyUnicode_KIND(temp);
12674 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012675 sign = 1;
12676 }
12677 else {
12678 Py_DECREF(iobj);
12679 }
12680 }
12681 }
12682 if (!isnumok) {
12683 PyErr_Format(PyExc_TypeError,
12684 "%%%c format: a number is required, "
12685 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
12686 goto onError;
12687 }
12688 if (flags & F_ZERO)
12689 fill = '0';
12690 break;
12691
12692 case 'e':
12693 case 'E':
12694 case 'f':
12695 case 'F':
12696 case 'g':
12697 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012698 temp = formatfloat(v, flags, prec, c);
12699 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000012700 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012701 if (PyUnicode_READY(temp) == -1) {
12702 Py_CLEAR(temp);
12703 goto onError;
12704 }
12705 pbuf = PyUnicode_DATA(temp);
12706 kind = PyUnicode_KIND(temp);
12707 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012708 sign = 1;
12709 if (flags & F_ZERO)
12710 fill = '0';
12711 break;
12712
12713 case 'c':
12714 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012715 kind = PyUnicode_4BYTE_KIND;
Victor Stinnerb9dcffb2011-09-29 00:39:24 +020012716 len = formatchar(pbuf, Py_ARRAY_LENGTH(formatbuf), v);
Benjamin Peterson29060642009-01-31 22:14:21 +000012717 if (len < 0)
12718 goto onError;
12719 break;
12720
12721 default:
12722 PyErr_Format(PyExc_ValueError,
12723 "unsupported format character '%c' (0x%x) "
12724 "at index %zd",
12725 (31<=c && c<=126) ? (char)c : '?',
12726 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012727 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000012728 goto onError;
12729 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012730 /* pbuf is initialized here. */
12731 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000012732 if (sign) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012733 if (PyUnicode_READ(kind, pbuf, pindex) == '-' ||
12734 PyUnicode_READ(kind, pbuf, pindex) == '+') {
12735 sign = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012736 len--;
12737 }
12738 else if (flags & F_SIGN)
12739 sign = '+';
12740 else if (flags & F_BLANK)
12741 sign = ' ';
12742 else
12743 sign = 0;
12744 }
12745 if (width < len)
12746 width = len;
12747 if (rescnt - (sign != 0) < width) {
12748 reslen -= rescnt;
12749 rescnt = width + fmtcnt + 100;
12750 reslen += rescnt;
12751 if (reslen < 0) {
12752 Py_XDECREF(temp);
12753 PyErr_NoMemory();
12754 goto onError;
12755 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012756 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12757 if (res0 == 0) {
12758 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012759 Py_XDECREF(temp);
12760 goto onError;
12761 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012762 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000012763 }
12764 if (sign) {
12765 if (fill != ' ')
12766 *res++ = sign;
12767 rescnt--;
12768 if (width > len)
12769 width--;
12770 }
12771 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012772 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12773 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000012774 if (fill != ' ') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012775 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12776 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012777 }
12778 rescnt -= 2;
12779 width -= 2;
12780 if (width < 0)
12781 width = 0;
12782 len -= 2;
12783 }
12784 if (width > len && !(flags & F_LJUST)) {
12785 do {
12786 --rescnt;
12787 *res++ = fill;
12788 } while (--width > len);
12789 }
12790 if (fill == ' ') {
12791 if (sign)
12792 *res++ = sign;
12793 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012794 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12795 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
12796 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12797 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012798 }
12799 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012800 /* Copy all characters, preserving len */
12801 len1 = len;
12802 while (len1--) {
12803 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12804 rescnt--;
12805 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012806 while (--width >= len) {
12807 --rescnt;
12808 *res++ = ' ';
12809 }
12810 if (dict && (argidx < arglen) && c != '%') {
12811 PyErr_SetString(PyExc_TypeError,
12812 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +000012813 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012814 goto onError;
12815 }
12816 Py_XDECREF(temp);
12817 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012818 } /* until end */
12819 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012820 PyErr_SetString(PyExc_TypeError,
12821 "not all arguments converted during string formatting");
12822 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012823 }
12824
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012825
12826 for (max=0, res = res0; res < res0+reslen-rescnt; res++)
12827 if (*res > max)
12828 max = *res;
12829 result = PyUnicode_New(reslen - rescnt, max);
12830 if (!result)
Benjamin Peterson29060642009-01-31 22:14:21 +000012831 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012832 kind = PyUnicode_KIND(result);
12833 for (res = res0; res < res0+reslen-rescnt; res++)
12834 PyUnicode_WRITE(kind, PyUnicode_DATA(result), res-res0, *res);
12835 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012836 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012837 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012838 }
12839 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012840 return (PyObject *)result;
12841
Benjamin Peterson29060642009-01-31 22:14:21 +000012842 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012843 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012844 Py_DECREF(uformat);
12845 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012846 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012847 }
12848 return NULL;
12849}
12850
Jeremy Hylton938ace62002-07-17 16:30:39 +000012851static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000012852unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
12853
Tim Peters6d6c1a32001-08-02 04:15:00 +000012854static PyObject *
12855unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12856{
Benjamin Peterson29060642009-01-31 22:14:21 +000012857 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012858 static char *kwlist[] = {"object", "encoding", "errors", 0};
12859 char *encoding = NULL;
12860 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000012861
Benjamin Peterson14339b62009-01-31 16:36:08 +000012862 if (type != &PyUnicode_Type)
12863 return unicode_subtype_new(type, args, kwds);
12864 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000012865 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012866 return NULL;
12867 if (x == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012868 return (PyObject *)PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012869 if (encoding == NULL && errors == NULL)
12870 return PyObject_Str(x);
12871 else
Benjamin Peterson29060642009-01-31 22:14:21 +000012872 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000012873}
12874
Guido van Rossume023fe02001-08-30 03:12:59 +000012875static PyObject *
12876unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12877{
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012878 PyUnicodeObject *unicode, *self;
12879 Py_ssize_t length, char_size;
12880 int share_wstr, share_utf8;
12881 unsigned int kind;
12882 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000012883
Benjamin Peterson14339b62009-01-31 16:36:08 +000012884 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012885
12886 unicode = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
12887 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012888 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020012889 assert(_PyUnicode_CHECK(unicode));
Victor Stinner1b4f9ce2011-10-03 13:28:14 +020012890 if (_PyUnicode_READY_REPLACE(&unicode))
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012891 return NULL;
12892
12893 self = (PyUnicodeObject *) type->tp_alloc(type, 0);
12894 if (self == NULL) {
12895 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012896 return NULL;
12897 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012898 kind = PyUnicode_KIND(unicode);
12899 length = PyUnicode_GET_LENGTH(unicode);
12900
12901 _PyUnicode_LENGTH(self) = length;
12902 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
12903 _PyUnicode_STATE(self).interned = 0;
12904 _PyUnicode_STATE(self).kind = kind;
12905 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020012906 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012907 _PyUnicode_STATE(self).ready = 1;
12908 _PyUnicode_WSTR(self) = NULL;
12909 _PyUnicode_UTF8_LENGTH(self) = 0;
12910 _PyUnicode_UTF8(self) = NULL;
12911 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020012912 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012913
12914 share_utf8 = 0;
12915 share_wstr = 0;
12916 if (kind == PyUnicode_1BYTE_KIND) {
12917 char_size = 1;
12918 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
12919 share_utf8 = 1;
12920 }
12921 else if (kind == PyUnicode_2BYTE_KIND) {
12922 char_size = 2;
12923 if (sizeof(wchar_t) == 2)
12924 share_wstr = 1;
12925 }
12926 else {
12927 assert(kind == PyUnicode_4BYTE_KIND);
12928 char_size = 4;
12929 if (sizeof(wchar_t) == 4)
12930 share_wstr = 1;
12931 }
12932
12933 /* Ensure we won't overflow the length. */
12934 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
12935 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012936 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012937 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012938 data = PyObject_MALLOC((length + 1) * char_size);
12939 if (data == NULL) {
12940 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012941 goto onError;
12942 }
12943
Victor Stinnerc3c74152011-10-02 20:39:55 +020012944 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012945 if (share_utf8) {
12946 _PyUnicode_UTF8_LENGTH(self) = length;
12947 _PyUnicode_UTF8(self) = data;
12948 }
12949 if (share_wstr) {
12950 _PyUnicode_WSTR_LENGTH(self) = length;
12951 _PyUnicode_WSTR(self) = (wchar_t *)data;
12952 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012953
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012954 Py_MEMCPY(data, PyUnicode_DATA(unicode),
12955 PyUnicode_KIND_SIZE(kind, length + 1));
12956 Py_DECREF(unicode);
12957 return (PyObject *)self;
12958
12959onError:
12960 Py_DECREF(unicode);
12961 Py_DECREF(self);
12962 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000012963}
12964
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012965PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000012966 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000012967\n\
Collin Winterd474ce82007-08-07 19:42:11 +000012968Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000012969encoding defaults to the current default string encoding.\n\
12970errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000012971
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012972static PyObject *unicode_iter(PyObject *seq);
12973
Guido van Rossumd57fd912000-03-10 22:53:23 +000012974PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000012975 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012976 "str", /* tp_name */
12977 sizeof(PyUnicodeObject), /* tp_size */
12978 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012979 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012980 (destructor)unicode_dealloc, /* tp_dealloc */
12981 0, /* tp_print */
12982 0, /* tp_getattr */
12983 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000012984 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012985 unicode_repr, /* tp_repr */
12986 &unicode_as_number, /* tp_as_number */
12987 &unicode_as_sequence, /* tp_as_sequence */
12988 &unicode_as_mapping, /* tp_as_mapping */
12989 (hashfunc) unicode_hash, /* tp_hash*/
12990 0, /* tp_call*/
12991 (reprfunc) unicode_str, /* tp_str */
12992 PyObject_GenericGetAttr, /* tp_getattro */
12993 0, /* tp_setattro */
12994 0, /* tp_as_buffer */
12995 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000012996 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012997 unicode_doc, /* tp_doc */
12998 0, /* tp_traverse */
12999 0, /* tp_clear */
13000 PyUnicode_RichCompare, /* tp_richcompare */
13001 0, /* tp_weaklistoffset */
13002 unicode_iter, /* tp_iter */
13003 0, /* tp_iternext */
13004 unicode_methods, /* tp_methods */
13005 0, /* tp_members */
13006 0, /* tp_getset */
13007 &PyBaseObject_Type, /* tp_base */
13008 0, /* tp_dict */
13009 0, /* tp_descr_get */
13010 0, /* tp_descr_set */
13011 0, /* tp_dictoffset */
13012 0, /* tp_init */
13013 0, /* tp_alloc */
13014 unicode_new, /* tp_new */
13015 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013016};
13017
13018/* Initialize the Unicode implementation */
13019
Thomas Wouters78890102000-07-22 19:25:51 +000013020void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013021{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013022 int i;
13023
Thomas Wouters477c8d52006-05-27 19:21:47 +000013024 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013025 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000013026 0x000A, /* LINE FEED */
13027 0x000D, /* CARRIAGE RETURN */
13028 0x001C, /* FILE SEPARATOR */
13029 0x001D, /* GROUP SEPARATOR */
13030 0x001E, /* RECORD SEPARATOR */
13031 0x0085, /* NEXT LINE */
13032 0x2028, /* LINE SEPARATOR */
13033 0x2029, /* PARAGRAPH SEPARATOR */
13034 };
13035
Fred Drakee4315f52000-05-09 19:53:39 +000013036 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020013037 unicode_empty = PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013038 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013039 Py_FatalError("Can't create empty string");
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013040
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013041 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000013042 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000013043 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013044 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000013045
13046 /* initialize the linebreak bloom filter */
13047 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013048 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020013049 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013050
13051 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013052}
13053
13054/* Finalize the Unicode implementation */
13055
/* Nothing is released by this implementation, so the reported count is
   always zero. */
int
PyUnicode_ClearFreeList(void)
{
    const int released = 0;

    return released;
}
13061
Guido van Rossumd57fd912000-03-10 22:53:23 +000013062void
Thomas Wouters78890102000-07-22 19:25:51 +000013063_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013064{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013065 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013066
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000013067 Py_XDECREF(unicode_empty);
13068 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000013069
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013070 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013071 if (unicode_latin1[i]) {
13072 Py_DECREF(unicode_latin1[i]);
13073 unicode_latin1[i] = NULL;
13074 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013075 }
Christian Heimesa156e092008-02-16 07:38:31 +000013076 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000013077}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000013078
Walter Dörwald16807132007-05-25 13:52:07 +000013079void
13080PyUnicode_InternInPlace(PyObject **p)
13081{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013082 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
13083 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020013084#ifdef Py_DEBUG
13085 assert(s != NULL);
13086 assert(_PyUnicode_CHECK(s));
13087#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000013088 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020013089 return;
13090#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000013091 /* If it's a subclass, we don't really know what putting
13092 it in the interned dict might do. */
13093 if (!PyUnicode_CheckExact(s))
13094 return;
13095 if (PyUnicode_CHECK_INTERNED(s))
13096 return;
Victor Stinner1b4f9ce2011-10-03 13:28:14 +020013097 if (_PyUnicode_READY_REPLACE(p)) {
Victor Stinner4fae54c2011-10-03 02:01:52 +020013098 assert(0 && "PyUnicode_READY fail in PyUnicode_InternInPlace");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013099 return;
13100 }
Victor Stinner1b4f9ce2011-10-03 13:28:14 +020013101 s = (PyUnicodeObject *)(*p);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013102 if (interned == NULL) {
13103 interned = PyDict_New();
13104 if (interned == NULL) {
13105 PyErr_Clear(); /* Don't leave an exception */
13106 return;
13107 }
13108 }
13109 /* It might be that the GetItem call fails even
13110 though the key is present in the dictionary,
13111 namely when this happens during a stack overflow. */
13112 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +000013113 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013114 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000013115
Benjamin Peterson29060642009-01-31 22:14:21 +000013116 if (t) {
13117 Py_INCREF(t);
13118 Py_DECREF(*p);
13119 *p = t;
13120 return;
13121 }
Walter Dörwald16807132007-05-25 13:52:07 +000013122
Benjamin Peterson14339b62009-01-31 16:36:08 +000013123 PyThreadState_GET()->recursion_critical = 1;
13124 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
13125 PyErr_Clear();
13126 PyThreadState_GET()->recursion_critical = 0;
13127 return;
13128 }
13129 PyThreadState_GET()->recursion_critical = 0;
13130 /* The two references in interned are not counted by refcnt.
13131 The deallocator will take care of this */
13132 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013133 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000013134}
13135
13136void
13137PyUnicode_InternImmortal(PyObject **p)
13138{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013139 PyUnicodeObject *u = (PyUnicodeObject *)*p;
13140
Benjamin Peterson14339b62009-01-31 16:36:08 +000013141 PyUnicode_InternInPlace(p);
13142 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013143 _PyUnicode_STATE(u).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013144 Py_INCREF(*p);
13145 }
Walter Dörwald16807132007-05-25 13:52:07 +000013146}
13147
13148PyObject *
13149PyUnicode_InternFromString(const char *cp)
13150{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013151 PyObject *s = PyUnicode_FromString(cp);
13152 if (s == NULL)
13153 return NULL;
13154 PyUnicode_InternInPlace(&s);
13155 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000013156}
13157
Alexander Belopolsky40018472011-02-26 01:02:56 +000013158void
13159_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000013160{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013161 PyObject *keys;
13162 PyUnicodeObject *s;
13163 Py_ssize_t i, n;
13164 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000013165
Benjamin Peterson14339b62009-01-31 16:36:08 +000013166 if (interned == NULL || !PyDict_Check(interned))
13167 return;
13168 keys = PyDict_Keys(interned);
13169 if (keys == NULL || !PyList_Check(keys)) {
13170 PyErr_Clear();
13171 return;
13172 }
Walter Dörwald16807132007-05-25 13:52:07 +000013173
Benjamin Peterson14339b62009-01-31 16:36:08 +000013174 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
13175 detector, interned unicode strings are not forcibly deallocated;
13176 rather, we give them their stolen references back, and then clear
13177 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000013178
Benjamin Peterson14339b62009-01-31 16:36:08 +000013179 n = PyList_GET_SIZE(keys);
13180 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000013181 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013182 for (i = 0; i < n; i++) {
13183 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013184 if (PyUnicode_READY(s) == -1)
13185 fprintf(stderr, "could not ready string\n");
13186 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013187 case SSTATE_NOT_INTERNED:
13188 /* XXX Shouldn't happen */
13189 break;
13190 case SSTATE_INTERNED_IMMORTAL:
13191 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013192 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013193 break;
13194 case SSTATE_INTERNED_MORTAL:
13195 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013196 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013197 break;
13198 default:
13199 Py_FatalError("Inconsistent interned string state.");
13200 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013201 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013202 }
13203 fprintf(stderr, "total size of all interned strings: "
13204 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
13205 "mortal/immortal\n", mortal_size, immortal_size);
13206 Py_DECREF(keys);
13207 PyDict_Clear(interned);
13208 Py_DECREF(interned);
13209 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000013210}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013211
13212
13213/********************* Unicode Iterator **************************/
13214
13215typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013216 PyObject_HEAD
13217 Py_ssize_t it_index;
13218 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013219} unicodeiterobject;
13220
13221static void
13222unicodeiter_dealloc(unicodeiterobject *it)
13223{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013224 _PyObject_GC_UNTRACK(it);
13225 Py_XDECREF(it->it_seq);
13226 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013227}
13228
13229static int
13230unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
13231{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013232 Py_VISIT(it->it_seq);
13233 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013234}
13235
13236static PyObject *
13237unicodeiter_next(unicodeiterobject *it)
13238{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013239 PyUnicodeObject *seq;
13240 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013241
Benjamin Peterson14339b62009-01-31 16:36:08 +000013242 assert(it != NULL);
13243 seq = it->it_seq;
13244 if (seq == NULL)
13245 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013246 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013247
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013248 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
13249 int kind = PyUnicode_KIND(seq);
13250 void *data = PyUnicode_DATA(seq);
13251 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
13252 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013253 if (item != NULL)
13254 ++it->it_index;
13255 return item;
13256 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013257
Benjamin Peterson14339b62009-01-31 16:36:08 +000013258 Py_DECREF(seq);
13259 it->it_seq = NULL;
13260 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013261}
13262
13263static PyObject *
13264unicodeiter_len(unicodeiterobject *it)
13265{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013266 Py_ssize_t len = 0;
13267 if (it->it_seq)
13268 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
13269 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013270}
13271
13272PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
13273
13274static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013275 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000013276 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000013277 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013278};
13279
13280PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013281 PyVarObject_HEAD_INIT(&PyType_Type, 0)
13282 "str_iterator", /* tp_name */
13283 sizeof(unicodeiterobject), /* tp_basicsize */
13284 0, /* tp_itemsize */
13285 /* methods */
13286 (destructor)unicodeiter_dealloc, /* tp_dealloc */
13287 0, /* tp_print */
13288 0, /* tp_getattr */
13289 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000013290 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013291 0, /* tp_repr */
13292 0, /* tp_as_number */
13293 0, /* tp_as_sequence */
13294 0, /* tp_as_mapping */
13295 0, /* tp_hash */
13296 0, /* tp_call */
13297 0, /* tp_str */
13298 PyObject_GenericGetAttr, /* tp_getattro */
13299 0, /* tp_setattro */
13300 0, /* tp_as_buffer */
13301 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
13302 0, /* tp_doc */
13303 (traverseproc)unicodeiter_traverse, /* tp_traverse */
13304 0, /* tp_clear */
13305 0, /* tp_richcompare */
13306 0, /* tp_weaklistoffset */
13307 PyObject_SelfIter, /* tp_iter */
13308 (iternextfunc)unicodeiter_next, /* tp_iternext */
13309 unicodeiter_methods, /* tp_methods */
13310 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013311};
13312
13313static PyObject *
13314unicode_iter(PyObject *seq)
13315{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013316 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013317
Benjamin Peterson14339b62009-01-31 16:36:08 +000013318 if (!PyUnicode_Check(seq)) {
13319 PyErr_BadInternalCall();
13320 return NULL;
13321 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013322 if (PyUnicode_READY(seq) == -1)
13323 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013324 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
13325 if (it == NULL)
13326 return NULL;
13327 it->it_index = 0;
13328 Py_INCREF(seq);
13329 it->it_seq = (PyUnicodeObject *)seq;
13330 _PyObject_GC_TRACK(it);
13331 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013332}
13333
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013334#define UNIOP(x) Py_UNICODE_##x
13335#define UNIOP_t Py_UNICODE
13336#include "uniops.h"
13337#undef UNIOP
13338#undef UNIOP_t
13339#define UNIOP(x) Py_UCS4_##x
13340#define UNIOP_t Py_UCS4
13341#include "uniops.h"
13342#undef UNIOP
13343#undef UNIOP_t
Victor Stinner331ea922010-08-10 16:37:20 +000013344
Victor Stinner71133ff2010-09-01 23:43:53 +000013345Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000013346PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000013347{
13348 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
13349 Py_UNICODE *copy;
13350 Py_ssize_t size;
13351
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013352 if (!PyUnicode_Check(unicode)) {
13353 PyErr_BadArgument();
13354 return NULL;
13355 }
Victor Stinner71133ff2010-09-01 23:43:53 +000013356 /* Ensure we won't overflow the size. */
13357 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
13358 PyErr_NoMemory();
13359 return NULL;
13360 }
13361 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
13362 size *= sizeof(Py_UNICODE);
13363 copy = PyMem_Malloc(size);
13364 if (copy == NULL) {
13365 PyErr_NoMemory();
13366 return NULL;
13367 }
13368 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
13369 return copy;
13370}
Martin v. Löwis5b222132007-06-10 09:51:05 +000013371
Georg Brandl66c221e2010-10-14 07:04:07 +000013372/* A _string module, to export formatter_parser and formatter_field_name_split
13373 to the string.Formatter class implemented in Python. */
13374
13375static PyMethodDef _string_methods[] = {
13376 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
13377 METH_O, PyDoc_STR("split the argument as a field name")},
13378 {"formatter_parser", (PyCFunction) formatter_parser,
13379 METH_O, PyDoc_STR("parse the argument as a format string")},
13380 {NULL, NULL}
13381};
13382
13383static struct PyModuleDef _string_module = {
13384 PyModuleDef_HEAD_INIT,
13385 "_string",
13386 PyDoc_STR("string helper module"),
13387 0,
13388 _string_methods,
13389 NULL,
13390 NULL,
13391 NULL,
13392 NULL
13393};
13394
13395PyMODINIT_FUNC
13396PyInit__string(void)
13397{
13398 return PyModule_Create(&_string_module);
13399}
13400
13401
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013402#ifdef __cplusplus
13403}
13404#endif