blob: cd67f60906be32b6dd688c050005f06c452b9f31 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000044
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000045#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000046#include <windows.h>
47#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000048
/* Maximum number of Unicode objects kept on the free list. */

#define PyUnicode_MAXFREELIST       1024

/* Size limit for the free list "stay alive" optimization.

   Unicode memory of objects on the free list is kept allocated when the
   object's size is below this limit, which reduces malloc() overhead for
   small Unicode objects.

   At worst this will result in PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature!  If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT       9
Guido van Rossumd57fd912000-03-10 22:53:23 +000071
/* Endianness switches: little endian unless WORDS_BIGENDIAN is defined.
   Exactly one of the two BYTEORDER_IS_* macros ends up defined. */

#ifndef WORDS_BIGENDIAN
# define BYTEORDER_IS_LITTLE_ENDIAN
#else
# define BYTEORDER_IS_BIG_ENDIAN
#endif
79
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000080/* --- Globals ------------------------------------------------------------
81
82 The globals are initialized by the _PyUnicode_Init() API and should
83 not be used before calling that API.
84
85*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000086
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000087
88#ifdef __cplusplus
89extern "C" {
90#endif
91
/* Object check used inside the assertions of the macros below: debug
   builds run the full consistency checker, other builds only test the
   object type. */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020097
/* --- Raw field accessors: plain casts, no type or readiness checks.
   All of them expand to lvalues. --- */

#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)

/* --- Checked accessors: assert on the object before reading. --- */

#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)

/* UTF-8 pointer of a ready string.  For a compact ASCII object this is the
   address immediately following the PyASCIIObject header; otherwise it is
   the utf8 field. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     !PyUnicode_IS_COMPACT_ASCII(op) ?                  \
         _PyUnicode_UTF8(op) :                          \
         ((char*)((PyASCIIObject*)(op) + 1)))

/* UTF-8 length of a ready string.  For a compact ASCII object this is the
   length field of the PyASCIIObject header; otherwise it is the
   utf8_length field. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     !PyUnicode_IS_COMPACT_ASCII(op) ?                  \
         _PyUnicode_UTF8_LENGTH(op) :                   \
         ((PyASCIIObject*)(op))->length)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200132
/* File-local replacement for PyUnicode_READY: evaluates to 0 when the
   string is already ready, otherwise to the result of _PyUnicode_Ready().
   The object is checked with _PyUnicode_CHECK first. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (!PyUnicode_IS_READY(op) ?                         \
      _PyUnicode_Ready((PyObject *)(op)) :              \
      0))
Victor Stinner910337b2011-10-03 03:20:16 +0200139
/* Like PyUnicode_READY, but takes a pointer to the object reference:
   evaluates to 0 when *p_obj is already ready, otherwise to the result of
   _PyUnicode_ReadyReplace(p_obj).

   The argument is parenthesized before being dereferenced so that an
   expression argument (e.g. "items + 1") dereferences the intended slot. */
#define _PyUnicode_READY_REPLACE(p_obj)                 \
    (assert(_PyUnicode_CHECK(*(p_obj))),                \
     (PyUnicode_IS_READY(*(p_obj)) ?                    \
      0 : _PyUnicode_ReadyReplace((PyObject **)(p_obj))))
144
/* True if the utf8 pointer is an alias of the character data.  Must not be
   used on compact ASCII objects (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer is an alias of the character data.  The macro
   only refers to its own argument, so it works whatever the caller's
   variable is called. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
152
/* True if the Unicode object owns a UTF-8 memory block of its own: the
   object is not compact ASCII, the utf8 pointer is set, and it is not an
   alias of the character data. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && !(_PyUnicode_UTF8(op) == PyUnicode_DATA(op))))

/* True if the Unicode object owns a wstr memory block of its own: the wstr
   pointer is set and the object is either not ready or its wstr pointer is
   not an alias of the character data.  PyUnicode_DATA() is only evaluated
   for ready objects. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) &&                            \
      !(PyUnicode_IS_READY(op) &&                       \
        _PyUnicode_WSTR(op) == PyUnicode_DATA(op))))
168
/* Generic helper macro that converts a run of characters from one integer
   type to another, one element at a time.

   from_type and to_type must be valid type names.  begin and end delimit
   the source characters (end is exclusive) and should be of type
   "from_type *".  to points to the destination buffer; it is cast to
   "to_type *" and receives one converted character per source character.
   Note that "end" is re-evaluated on every iteration. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        const from_type *iter_; to_type *to_;           \
        iter_ = (begin);                                \
        to_ = (to_type *)(to);                          \
        while (iter_ < (end)) {                         \
            *to_ = (to_type)*iter_;                     \
            ++iter_;                                    \
            ++to_;                                      \
        }                                               \
    } while (0)

/* The Unicode string has been modified: reset the cached hash to -1 */
#define _PyUnicode_DIRTY(op) do { _PyUnicode_HASH(op) = -1; } while (0)
186
Walter Dörwald16807132007-05-25 13:52:07 +0000187/* This dictionary holds all interned unicode strings. Note that references
188 to strings in this dictionary are *not* counted in the string's ob_refcnt.
189 When the interned string reaches a refcnt of 0 the string deallocation
190 function will delete the reference from this dictionary.
191
192 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000193 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000194*/
195static PyObject *interned;
196
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000197/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200198static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000199
200/* Single character Unicode strings in the Latin-1 range are being
201 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200202static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000203
/* Fast detection of the most frequent whitespace characters.  The table
   has one entry per code point 0x00..0x7F; an entry of 1 marks
   whitespace. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F
     *   0x0009: CHARACTER TABULATION
     *   0x000A: LINE FEED
     *   0x000B: LINE TABULATION
     *   0x000C: FORM FEED
     *   0x000D: CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F
     *   0x001C: FILE SEPARATOR
     *   0x001D: GROUP SEPARATOR
     *   0x001E: RECORD SEPARATOR
     *   0x001F: UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27
     *   0x0020: SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
234
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200235/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200236static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200237static PyObject* get_latin1_char(unsigned char ch);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200238
Alexander Belopolsky40018472011-02-26 01:02:56 +0000239static PyObject *
240unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000241 PyObject **errorHandler,const char *encoding, const char *reason,
242 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
243 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
244
Alexander Belopolsky40018472011-02-26 01:02:56 +0000245static void
246raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300247 const char *encoding,
248 const Py_UNICODE *unicode, Py_ssize_t size,
249 Py_ssize_t startpos, Py_ssize_t endpos,
250 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000251
/* Fast detection of line breaks in the ASCII range.  The table has one
   entry per code point 0x00..0x7F; an entry of 1 marks a line break.
   It is a read-only lookup table, hence const (like
   _Py_ascii_whitespace). */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F
     *   0x000A: LINE FEED
     *   0x000B: LINE TABULATION
     *   0x000C: FORM FEED
     *   0x000D: CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F
     *   0x001C: FILE SEPARATOR
     *   0x001D: GROUP SEPARATOR
     *   0x001E: RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
279
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300280/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
281 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000282Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000283PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000284{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000285#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000286 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000287#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000288 /* This is actually an illegal character, so it should
289 not be passed to unichr. */
290 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000291#endif
292}
293
#ifdef Py_DEBUG
/* Debug-build invariant checker for the internal layout of a Unicode
   object.  Every check is an assert(); the function itself always
   returns 1 so that it can be used inside an assert() expression. */
static int
_PyUnicode_CheckConsistency(void *op)
{
    PyASCIIObject *ascii = (PyASCIIObject *)op;
    PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
    unsigned int kind;
    void *data;

    assert(PyUnicode_Check(op));
    kind = ascii->state.kind;

    /* Compact ASCII: only the kind and the ready flag are checked. */
    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
        return 1;
    }

    if (ascii->state.compact != 1) {
        /* Non-compact object: the data pointer lives in the object. */
        PyUnicodeObject *unicode = (PyUnicodeObject *)op;

        data = unicode->data.any;
        if (kind != PyUnicode_WCHAR_KIND) {
            /* ready non-compact string */
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.compact == 0);
            assert(ascii->state.ready == 1);
            assert(data != NULL);
            if (ascii->state.ascii) {
                assert (compact->utf8 == data);
                assert (compact->utf8_length == ascii->length);
            }
            else
                assert (compact->utf8 != data);
        }
        else {
            /* wstr-only string that has not been made ready yet */
            assert(ascii->state.compact == 0);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 0);
            assert(ascii->wstr != NULL);
            assert(data == NULL);
            assert(compact->utf8 == NULL);
            assert(ascii->state.interned == SSTATE_NOT_INTERNED);
        }
    }
    else {
        /* Compact non-ASCII object: the data follows the compact header. */
        data = compact + 1;
        assert(kind == PyUnicode_1BYTE_KIND
               || kind == PyUnicode_2BYTE_KIND
               || kind == PyUnicode_4BYTE_KIND);
        assert(ascii->state.ascii == 0);
        assert(ascii->state.ready == 1);
        assert (compact->utf8 != data);
    }

    /* wstr must alias the data exactly when the kind matches the width of
       wchar_t. */
    if (kind != PyUnicode_WCHAR_KIND) {
#if SIZEOF_WCHAR_T == 2
        if (kind == PyUnicode_2BYTE_KIND)
#else
        if (kind == PyUnicode_4BYTE_KIND)
#endif
        {
            assert(ascii->wstr == data);
            assert(compact->wstr_length == ascii->length);
        }
        else {
            assert(ascii->wstr != data);
        }
    }

    /* A missing buffer must come with a zero length. */
    if (compact->utf8 == NULL)
        assert(compact->utf8_length == 0);
    if (ascii->wstr == NULL)
        assert(compact->wstr_length == 0);
    return 1;
}
#else
/* Release builds: no checking, always returns 1. */
static int
_PyUnicode_CheckConsistency(void *op)
{
    return 1;
}
#endif
379
Thomas Wouters477c8d52006-05-27 19:21:47 +0000380/* --- Bloom Filters ----------------------------------------------------- */
381
/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the low
   log2(BLOOM_WIDTH) bits of each Unicode character as the bit index. */
385
386/* the linebreak mask is set up by Unicode_Init below */
387
Antoine Pitrouf068f942010-01-13 14:19:12 +0000388#if LONG_BIT >= 128
389#define BLOOM_WIDTH 128
390#elif LONG_BIT >= 64
391#define BLOOM_WIDTH 64
392#elif LONG_BIT >= 32
393#define BLOOM_WIDTH 32
394#else
395#error "LONG_BIT is smaller than 32"
396#endif
397
Thomas Wouters477c8d52006-05-27 19:21:47 +0000398#define BLOOM_MASK unsigned long
399
400static BLOOM_MASK bloom_linebreak;
401
/* BLOOM_ADD sets, and BLOOM tests, the bit selected by the low bits of ch
   (ch & (BLOOM_WIDTH - 1)).  The mask argument is parenthesized so that an
   expression such as "a | b" can be passed safely.  BLOOM_ADD requires an
   lvalue mask and evaluates to the updated mask; BLOOM evaluates to a
   non-zero value when the bit is set. */
#define BLOOM_ADD(mask, ch) ((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))
#define BLOOM(mask, ch)     ((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1))))

/* Line break test: characters below 128 are looked up in ascii_linebreak;
   for all others the bloom_linebreak mask is consulted first and
   Py_UNICODE_ISLINEBREAK() is only called when the mask bit is set. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000408
Alexander Belopolsky40018472011-02-26 01:02:56 +0000409Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200410make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000411{
412 /* calculate simple bloom-style bitmask for a given unicode string */
413
Antoine Pitrouf068f942010-01-13 14:19:12 +0000414 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000415 Py_ssize_t i;
416
417 mask = 0;
418 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200419 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000420
421 return mask;
422}
423
/* Membership test: the bloom mask is consulted first, and the string is
   only searched (forward, over its whole length) when the mask bit for
   chr is set. */
#define BLOOM_MEMBER(mask, chr, str)                                    \
    (BLOOM(mask, chr)                                                   \
     && !(PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) < 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000427
Guido van Rossumd57fd912000-03-10 22:53:23 +0000428/* --- Unicode Object ----------------------------------------------------- */
429
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200430static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200431fixup(PyUnicodeObject *self, Py_UCS4 (*fixfct)(PyUnicodeObject *s));
432
433Py_LOCAL_INLINE(char *) findchar(void *s, int kind,
434 Py_ssize_t size, Py_UCS4 ch,
435 int direction)
436{
437 /* like wcschr, but doesn't stop at NULL characters */
438 Py_ssize_t i;
439 if (direction == 1) {
440 for(i = 0; i < size; i++)
441 if (PyUnicode_READ(kind, s, i) == ch)
442 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
443 }
444 else {
445 for(i = size-1; i >= 0; i--)
446 if (PyUnicode_READ(kind, s, i) == ch)
447 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
448 }
449 return NULL;
450}
451
Victor Stinnerfe226c02011-10-03 03:52:20 +0200452static PyObject*
453resize_compact(PyObject *unicode, Py_ssize_t length)
454{
455 Py_ssize_t char_size;
456 Py_ssize_t struct_size;
457 Py_ssize_t new_size;
458 int share_wstr;
459
460 assert(PyUnicode_IS_READY(unicode));
461 char_size = PyUnicode_CHARACTER_SIZE(unicode);
462 if (PyUnicode_IS_COMPACT_ASCII(unicode))
463 struct_size = sizeof(PyASCIIObject);
464 else
465 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200466 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200467
468 _Py_DEC_REFTOTAL;
469 _Py_ForgetReference(unicode);
470
471 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
472 PyErr_NoMemory();
473 return NULL;
474 }
475 new_size = (struct_size + (length + 1) * char_size);
476
477 unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
478 if (unicode == NULL) {
479 PyObject_Del(unicode);
480 PyErr_NoMemory();
481 return NULL;
482 }
483 _Py_NewReference(unicode);
484 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200485 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200486 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200487 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
488 _PyUnicode_WSTR_LENGTH(unicode) = length;
489 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200490 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
491 length, 0);
492 return unicode;
493}
494
Alexander Belopolsky40018472011-02-26 01:02:56 +0000495static int
Victor Stinner95663112011-10-04 01:03:50 +0200496resize_inplace(PyUnicodeObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000497{
Victor Stinner95663112011-10-04 01:03:50 +0200498 wchar_t *wstr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200499 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200500 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000501
Victor Stinner95663112011-10-04 01:03:50 +0200502 _PyUnicode_DIRTY(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200503
504 if (PyUnicode_IS_READY(unicode)) {
505 Py_ssize_t char_size;
506 Py_ssize_t new_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200507 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200508 void *data;
509
510 data = _PyUnicode_DATA_ANY(unicode);
511 assert(data != NULL);
512 char_size = PyUnicode_CHARACTER_SIZE(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200513 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
514 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinner95663112011-10-04 01:03:50 +0200515 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
516 {
517 PyObject_DEL(_PyUnicode_UTF8(unicode));
518 _PyUnicode_UTF8(unicode) = NULL;
519 _PyUnicode_UTF8_LENGTH(unicode) = 0;
520 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200521
522 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
523 PyErr_NoMemory();
524 return -1;
525 }
526 new_size = (length + 1) * char_size;
527
528 data = (PyObject *)PyObject_REALLOC(data, new_size);
529 if (data == NULL) {
530 PyErr_NoMemory();
531 return -1;
532 }
533 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200534 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200535 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200536 _PyUnicode_WSTR_LENGTH(unicode) = length;
537 }
538 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200539 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200540 _PyUnicode_UTF8_LENGTH(unicode) = length;
541 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200542 _PyUnicode_LENGTH(unicode) = length;
543 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinner95663112011-10-04 01:03:50 +0200544 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400545 _PyUnicode_CheckConsistency(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200546 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200547 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200548 }
Victor Stinner95663112011-10-04 01:03:50 +0200549 assert(_PyUnicode_WSTR(unicode) != NULL);
550
551 /* check for integer overflow */
552 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
553 PyErr_NoMemory();
554 return -1;
555 }
556 wstr = _PyUnicode_WSTR(unicode);
557 wstr = PyObject_REALLOC(wstr, sizeof(wchar_t) * (length + 1));
558 if (!wstr) {
559 PyErr_NoMemory();
560 return -1;
561 }
562 _PyUnicode_WSTR(unicode) = wstr;
563 _PyUnicode_WSTR(unicode)[length] = 0;
564 _PyUnicode_WSTR_LENGTH(unicode) = length;
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400565 _PyUnicode_CheckConsistency(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000566 return 0;
567}
568
Victor Stinnerfe226c02011-10-03 03:52:20 +0200569static PyObject*
570resize_copy(PyObject *unicode, Py_ssize_t length)
571{
572 Py_ssize_t copy_length;
573 if (PyUnicode_IS_COMPACT(unicode)) {
574 PyObject *copy;
575 assert(PyUnicode_IS_READY(unicode));
576
577 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
578 if (copy == NULL)
579 return NULL;
580
581 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
582 if (PyUnicode_CopyCharacters(copy, 0,
583 unicode, 0,
584 copy_length) < 0)
585 {
586 Py_DECREF(copy);
587 return NULL;
588 }
589 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200590 }
591 else {
Victor Stinner2fd82272011-10-03 04:06:05 +0200592 PyUnicodeObject *w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200593 assert(_PyUnicode_WSTR(unicode) != NULL);
594 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinner2fd82272011-10-03 04:06:05 +0200595 w = _PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200596 if (w == NULL)
597 return NULL;
598 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
599 copy_length = Py_MIN(copy_length, length);
600 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
601 copy_length);
602 return (PyObject*)w;
603 }
604}
605
/* We allocate one extra Py_UNICODE element to make sure the string is
   U+0000 terminated; some code (e.g. new_identifier) relies on that.

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
614
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200615#ifdef Py_DEBUG
616int unicode_old_new_calls = 0;
617#endif
618
Alexander Belopolsky40018472011-02-26 01:02:56 +0000619static PyUnicodeObject *
620_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000621{
622 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200623 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000624
Thomas Wouters477c8d52006-05-27 19:21:47 +0000625 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000626 if (length == 0 && unicode_empty != NULL) {
627 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200628 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000629 }
630
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000631 /* Ensure we won't overflow the size. */
632 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
633 return (PyUnicodeObject *)PyErr_NoMemory();
634 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200635 if (length < 0) {
636 PyErr_SetString(PyExc_SystemError,
637 "Negative size passed to _PyUnicode_New");
638 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000639 }
640
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200641#ifdef Py_DEBUG
642 ++unicode_old_new_calls;
643#endif
644
645 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
646 if (unicode == NULL)
647 return NULL;
648 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
649 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
650 if (!_PyUnicode_WSTR(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000651 PyErr_NoMemory();
652 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000653 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200654
Jeremy Hyltond8082792003-09-16 19:41:39 +0000655 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000656 * the caller fails before initializing str -- unicode_resize()
657 * reads str[0], and the Keep-Alive optimization can keep memory
658 * allocated for str alive across a call to unicode_dealloc(unicode).
659 * We don't want unicode_resize to read uninitialized memory in
660 * that case.
661 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200662 _PyUnicode_WSTR(unicode)[0] = 0;
663 _PyUnicode_WSTR(unicode)[length] = 0;
664 _PyUnicode_WSTR_LENGTH(unicode) = length;
665 _PyUnicode_HASH(unicode) = -1;
666 _PyUnicode_STATE(unicode).interned = 0;
667 _PyUnicode_STATE(unicode).kind = 0;
668 _PyUnicode_STATE(unicode).compact = 0;
669 _PyUnicode_STATE(unicode).ready = 0;
670 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200671 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200672 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200673 _PyUnicode_UTF8(unicode) = NULL;
674 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000675 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000676
Benjamin Peterson29060642009-01-31 22:14:21 +0000677 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000678 /* XXX UNREF/NEWREF interface should be more symmetrical */
679 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000680 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000681 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000682 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000683}
684
Victor Stinnerf42dc442011-10-02 23:33:16 +0200685static const char*
686unicode_kind_name(PyObject *unicode)
687{
Victor Stinner42dfd712011-10-03 14:41:45 +0200688 /* don't check consistency: unicode_kind_name() is called from
689 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200690 if (!PyUnicode_IS_COMPACT(unicode))
691 {
692 if (!PyUnicode_IS_READY(unicode))
693 return "wstr";
694 switch(PyUnicode_KIND(unicode))
695 {
696 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200697 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200698 return "legacy ascii";
699 else
700 return "legacy latin1";
701 case PyUnicode_2BYTE_KIND:
702 return "legacy UCS2";
703 case PyUnicode_4BYTE_KIND:
704 return "legacy UCS4";
705 default:
706 return "<legacy invalid kind>";
707 }
708 }
709 assert(PyUnicode_IS_READY(unicode));
710 switch(PyUnicode_KIND(unicode))
711 {
712 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200713 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200714 return "ascii";
715 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200716 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200717 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200718 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200719 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200720 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200721 default:
722 return "<invalid compact kind>";
723 }
724}
725
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200726#ifdef Py_DEBUG
727int unicode_new_new_calls = 0;
728
729/* Functions wrapping macros for use in debugger */
/* Debugger helper: callable wrapper around the PyUnicode_UTF8() macro. */
char *
_PyUnicode_utf8(void *unicode)
{
    return PyUnicode_UTF8(unicode);
}
733
/* Debugger helper: callable wrapper around the _PyUnicode_COMPACT_DATA()
   macro. */
void *
_PyUnicode_compact_data(void *unicode)
{
    return _PyUnicode_COMPACT_DATA(unicode);
}
737void *_PyUnicode_data(void *unicode){
738 printf("obj %p\n", unicode);
739 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
740 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
741 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
742 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
743 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
744 return PyUnicode_DATA(unicode);
745}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200746
747void
748_PyUnicode_Dump(PyObject *op)
749{
750 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200751 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
752 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
753 void *data;
754 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
755 if (ascii->state.compact)
756 data = (compact + 1);
757 else
758 data = unicode->data.any;
759 if (ascii->wstr == data)
760 printf("shared ");
761 printf("wstr=%p", ascii->wstr);
Victor Stinnera3b334d2011-10-03 13:53:37 +0200762 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +0200763 printf(" (%zu), ", compact->wstr_length);
764 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
765 printf("shared ");
766 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200767 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200768 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200769}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200770#endif
771
772PyObject *
773PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
774{
775 PyObject *obj;
776 PyCompactUnicodeObject *unicode;
777 void *data;
778 int kind_state;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200779 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200780 Py_ssize_t char_size;
781 Py_ssize_t struct_size;
782
783 /* Optimization for empty strings */
784 if (size == 0 && unicode_empty != NULL) {
785 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200786 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200787 }
788
789#ifdef Py_DEBUG
790 ++unicode_new_new_calls;
791#endif
792
Victor Stinner9e9d6892011-10-04 01:02:02 +0200793 is_ascii = 0;
794 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200795 struct_size = sizeof(PyCompactUnicodeObject);
796 if (maxchar < 128) {
797 kind_state = PyUnicode_1BYTE_KIND;
798 char_size = 1;
799 is_ascii = 1;
800 struct_size = sizeof(PyASCIIObject);
801 }
802 else if (maxchar < 256) {
803 kind_state = PyUnicode_1BYTE_KIND;
804 char_size = 1;
805 }
806 else if (maxchar < 65536) {
807 kind_state = PyUnicode_2BYTE_KIND;
808 char_size = 2;
809 if (sizeof(wchar_t) == 2)
810 is_sharing = 1;
811 }
812 else {
813 kind_state = PyUnicode_4BYTE_KIND;
814 char_size = 4;
815 if (sizeof(wchar_t) == 4)
816 is_sharing = 1;
817 }
818
819 /* Ensure we won't overflow the size. */
820 if (size < 0) {
821 PyErr_SetString(PyExc_SystemError,
822 "Negative size passed to PyUnicode_New");
823 return NULL;
824 }
825 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
826 return PyErr_NoMemory();
827
828 /* Duplicated allocation code from _PyObject_New() instead of a call to
829 * PyObject_New() so we are able to allocate space for the object and
830 * it's data buffer.
831 */
832 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
833 if (obj == NULL)
834 return PyErr_NoMemory();
835 obj = PyObject_INIT(obj, &PyUnicode_Type);
836 if (obj == NULL)
837 return NULL;
838
839 unicode = (PyCompactUnicodeObject *)obj;
840 if (is_ascii)
841 data = ((PyASCIIObject*)obj) + 1;
842 else
843 data = unicode + 1;
844 _PyUnicode_LENGTH(unicode) = size;
845 _PyUnicode_HASH(unicode) = -1;
846 _PyUnicode_STATE(unicode).interned = 0;
847 _PyUnicode_STATE(unicode).kind = kind_state;
848 _PyUnicode_STATE(unicode).compact = 1;
849 _PyUnicode_STATE(unicode).ready = 1;
850 _PyUnicode_STATE(unicode).ascii = is_ascii;
851 if (is_ascii) {
852 ((char*)data)[size] = 0;
853 _PyUnicode_WSTR(unicode) = NULL;
854 }
855 else if (kind_state == PyUnicode_1BYTE_KIND) {
856 ((char*)data)[size] = 0;
857 _PyUnicode_WSTR(unicode) = NULL;
858 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200859 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200860 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200861 }
862 else {
863 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200864 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200865 if (kind_state == PyUnicode_2BYTE_KIND)
866 ((Py_UCS2*)data)[size] = 0;
867 else /* kind_state == PyUnicode_4BYTE_KIND */
868 ((Py_UCS4*)data)[size] = 0;
869 if (is_sharing) {
870 _PyUnicode_WSTR_LENGTH(unicode) = size;
871 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
872 }
873 else {
874 _PyUnicode_WSTR_LENGTH(unicode) = 0;
875 _PyUnicode_WSTR(unicode) = NULL;
876 }
877 }
878 return obj;
879}
880
#if SIZEOF_WCHAR_T == 2
/* Helper function to convert a 16-bit wchar_t representation to UCS4: a
   high surrogate immediately followed by a low surrogate is decoded into a
   single code point, every other unit (lone surrogates included) is copied
   unchanged.  The other conversions are implemented as macros for
   efficiency.

   The output is written to the 4-byte data of `unicode`, which must already
   be a 4-byte-kind string whose length equals the number of code points
   produced. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyUnicodeObject *unicode)
{
    const wchar_t *iter = begin;
    Py_UCS4 *ucs4_out;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    ucs4_out = PyUnicode_4BYTE_DATA(unicode);

    while (iter < end) {
        assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
                           _PyUnicode_GET_LENGTH(unicode)));
        if (iter[0] >= 0xD800 && iter[0] <= 0xDBFF
            && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
        {
            /* Valid surrogate pair: combine both halves. */
            Py_UCS4 ch = (((iter[0] & 0x3FF)<<10) | (iter[1] & 0x3FF)) + 0x10000;
            *ucs4_out++ = ch;
            iter += 2;
        }
        else {
            *ucs4_out++ = *iter++;
        }
    }
    /* The output buffer must have been filled exactly. */
    assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
                        _PyUnicode_GET_LENGTH(unicode)));
}
#endif
919
Victor Stinnercd9950f2011-10-02 00:34:53 +0200920static int
921_PyUnicode_Dirty(PyObject *unicode)
922{
Victor Stinner910337b2011-10-03 03:20:16 +0200923 assert(_PyUnicode_CHECK(unicode));
Victor Stinnercd9950f2011-10-02 00:34:53 +0200924 if (Py_REFCNT(unicode) != 1) {
Victor Stinner01698042011-10-04 00:04:26 +0200925 PyErr_SetString(PyExc_SystemError,
Victor Stinnercd9950f2011-10-02 00:34:53 +0200926 "Cannot modify a string having more than 1 reference");
927 return -1;
928 }
929 _PyUnicode_DIRTY(unicode);
930 return 0;
931}
932
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200933Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200934PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
935 PyObject *from, Py_ssize_t from_start,
936 Py_ssize_t how_many)
937{
Victor Stinnera0702ab2011-09-29 14:14:38 +0200938 unsigned int from_kind, to_kind;
939 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200940
Victor Stinnerb1536152011-09-30 02:26:10 +0200941 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
942 PyErr_BadInternalCall();
943 return -1;
944 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200945
946 if (PyUnicode_READY(from))
947 return -1;
948 if (PyUnicode_READY(to))
949 return -1;
950
Victor Stinnerff9e50f2011-09-28 22:17:19 +0200951 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200952 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
Victor Stinner01698042011-10-04 00:04:26 +0200953 PyErr_Format(PyExc_SystemError,
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200954 "Cannot write %zi characters at %zi "
955 "in a string of %zi characters",
956 how_many, to_start, PyUnicode_GET_LENGTH(to));
957 return -1;
958 }
Victor Stinnerf5ca1a22011-09-28 23:54:59 +0200959 if (how_many == 0)
960 return 0;
961
Victor Stinnercd9950f2011-10-02 00:34:53 +0200962 if (_PyUnicode_Dirty(to))
Victor Stinnerf5ca1a22011-09-28 23:54:59 +0200963 return -1;
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200964
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200965 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200966 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200967 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200968 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200969
Victor Stinnerf42dc442011-10-02 23:33:16 +0200970 if (from_kind == to_kind
971 /* deny latin1 => ascii */
972 && PyUnicode_MAX_CHAR_VALUE(to) >= PyUnicode_MAX_CHAR_VALUE(from))
973 {
Victor Stinnera0702ab2011-09-29 14:14:38 +0200974 Py_MEMCPY((char*)to_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200975 + PyUnicode_KIND_SIZE(to_kind, to_start),
Victor Stinnera0702ab2011-09-29 14:14:38 +0200976 (char*)from_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200977 + PyUnicode_KIND_SIZE(from_kind, from_start),
978 PyUnicode_KIND_SIZE(to_kind, how_many));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200979 }
Victor Stinnera0702ab2011-09-29 14:14:38 +0200980 else if (from_kind == PyUnicode_1BYTE_KIND
981 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200982 {
983 _PyUnicode_CONVERT_BYTES(
984 Py_UCS1, Py_UCS2,
985 PyUnicode_1BYTE_DATA(from) + from_start,
986 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
987 PyUnicode_2BYTE_DATA(to) + to_start
988 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200989 }
Victor Stinner157f83f2011-09-28 21:41:31 +0200990 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200991 && to_kind == PyUnicode_4BYTE_KIND)
992 {
993 _PyUnicode_CONVERT_BYTES(
994 Py_UCS1, Py_UCS4,
995 PyUnicode_1BYTE_DATA(from) + from_start,
996 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
997 PyUnicode_4BYTE_DATA(to) + to_start
998 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200999 }
1000 else if (from_kind == PyUnicode_2BYTE_KIND
1001 && to_kind == PyUnicode_4BYTE_KIND)
1002 {
1003 _PyUnicode_CONVERT_BYTES(
1004 Py_UCS2, Py_UCS4,
1005 PyUnicode_2BYTE_DATA(from) + from_start,
1006 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1007 PyUnicode_4BYTE_DATA(to) + to_start
1008 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001009 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001010 else {
1011 int invalid_kinds;
Victor Stinnerf42dc442011-10-02 23:33:16 +02001012
1013 /* check if max_char(from substring) <= max_char(to) */
1014 if (from_kind > to_kind
1015 /* latin1 => ascii */
Victor Stinnera3b334d2011-10-03 13:53:37 +02001016 || (PyUnicode_IS_ASCII(to)
Victor Stinnerf42dc442011-10-02 23:33:16 +02001017 && to_kind == PyUnicode_1BYTE_KIND
Victor Stinnera3b334d2011-10-03 13:53:37 +02001018 && !PyUnicode_IS_ASCII(from)))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001019 {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001020 /* slow path to check for character overflow */
1021 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1022 Py_UCS4 ch, maxchar;
1023 Py_ssize_t i;
1024
1025 maxchar = 0;
1026 invalid_kinds = 0;
1027 for (i=0; i < how_many; i++) {
1028 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1029 if (ch > maxchar) {
1030 maxchar = ch;
1031 if (maxchar > to_maxchar) {
1032 invalid_kinds = 1;
1033 break;
1034 }
1035 }
1036 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1037 }
1038 }
1039 else
1040 invalid_kinds = 1;
1041 if (invalid_kinds) {
Victor Stinner01698042011-10-04 00:04:26 +02001042 PyErr_Format(PyExc_SystemError,
Victor Stinnerf42dc442011-10-02 23:33:16 +02001043 "Cannot copy %s characters "
1044 "into a string of %s characters",
1045 unicode_kind_name(from),
1046 unicode_kind_name(to));
Victor Stinnera0702ab2011-09-29 14:14:38 +02001047 return -1;
1048 }
1049 }
1050 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001051}
1052
Victor Stinner17222162011-09-28 22:15:37 +02001053/* Find the maximum code point and count the number of surrogate pairs so a
1054 correct string length can be computed before converting a string to UCS4.
1055 This function counts single surrogates as a character and not as a pair.
1056
1057 Return 0 on success, or -1 on error. */
1058static int
1059find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1060 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001061{
1062 const wchar_t *iter;
1063
Victor Stinnerc53be962011-10-02 21:33:54 +02001064 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001065 if (num_surrogates == NULL || maxchar == NULL) {
1066 PyErr_SetString(PyExc_SystemError,
1067 "unexpected NULL arguments to "
1068 "PyUnicode_FindMaxCharAndNumSurrogatePairs");
1069 return -1;
1070 }
1071
1072 *num_surrogates = 0;
1073 *maxchar = 0;
1074
1075 for (iter = begin; iter < end; ) {
1076 if (*iter > *maxchar)
1077 *maxchar = *iter;
1078#if SIZEOF_WCHAR_T == 2
1079 if (*iter >= 0xD800 && *iter <= 0xDBFF
1080 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
1081 {
1082 Py_UCS4 surrogate_val;
1083 surrogate_val = (((iter[0] & 0x3FF)<<10)
1084 | (iter[1] & 0x3FF)) + 0x10000;
1085 ++(*num_surrogates);
1086 if (surrogate_val > *maxchar)
1087 *maxchar = surrogate_val;
1088 iter += 2;
1089 }
1090 else
1091 iter++;
1092#else
1093 iter++;
1094#endif
1095 }
1096 return 0;
1097}
1098
1099#ifdef Py_DEBUG
1100int unicode_ready_calls = 0;
1101#endif
1102
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001103static int
1104unicode_ready(PyObject **p_obj, int replace)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001105{
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001106 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001107 wchar_t *end;
1108 Py_UCS4 maxchar = 0;
1109 Py_ssize_t num_surrogates;
1110#if SIZEOF_WCHAR_T == 2
1111 Py_ssize_t length_wo_surrogates;
1112#endif
1113
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001114 assert(p_obj != NULL);
1115 unicode = (PyUnicodeObject *)*p_obj;
1116
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001117 /* _PyUnicode_Ready() is only intented for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001118 strings were created using _PyObject_New() and where no canonical
1119 representation (the str field) has been set yet aka strings
1120 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001121 assert(_PyUnicode_CHECK(unicode));
1122 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001123 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001124 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001125 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001126 /* Actually, it should neither be interned nor be anything else: */
1127 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001128
1129#ifdef Py_DEBUG
1130 ++unicode_ready_calls;
1131#endif
1132
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001133#ifdef Py_DEBUG
1134 assert(!replace || Py_REFCNT(unicode) == 1);
1135#else
1136 if (replace && Py_REFCNT(unicode) != 1)
1137 replace = 0;
1138#endif
1139 if (replace) {
1140 Py_ssize_t len = _PyUnicode_WSTR_LENGTH(unicode);
1141 wchar_t *wstr = _PyUnicode_WSTR(unicode);
1142 /* Optimization for empty strings */
1143 if (len == 0) {
1144 Py_INCREF(unicode_empty);
1145 Py_DECREF(*p_obj);
1146 *p_obj = unicode_empty;
1147 return 0;
1148 }
1149 if (len == 1 && wstr[0] < 256) {
1150 PyObject *latin1_char = get_latin1_char((unsigned char)wstr[0]);
1151 if (latin1_char == NULL)
1152 return -1;
1153 Py_DECREF(*p_obj);
1154 *p_obj = latin1_char;
1155 return 0;
1156 }
1157 }
1158
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001159 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001160 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001161 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001162 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001163
1164 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001165 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1166 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001167 PyErr_NoMemory();
1168 return -1;
1169 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001170 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001171 _PyUnicode_WSTR(unicode), end,
1172 PyUnicode_1BYTE_DATA(unicode));
1173 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1174 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1175 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1176 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001177 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001178 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001179 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001180 }
1181 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001182 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001183 _PyUnicode_UTF8(unicode) = NULL;
1184 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001185 }
1186 PyObject_FREE(_PyUnicode_WSTR(unicode));
1187 _PyUnicode_WSTR(unicode) = NULL;
1188 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1189 }
1190 /* In this case we might have to convert down from 4-byte native
1191 wchar_t to 2-byte unicode. */
1192 else if (maxchar < 65536) {
1193 assert(num_surrogates == 0 &&
1194 "FindMaxCharAndNumSurrogatePairs() messed up");
1195
Victor Stinner506f5922011-09-28 22:34:18 +02001196#if SIZEOF_WCHAR_T == 2
1197 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001198 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001199 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1200 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1201 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001202 _PyUnicode_UTF8(unicode) = NULL;
1203 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001204#else
1205 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001206 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001207 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001208 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001209 PyErr_NoMemory();
1210 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001211 }
Victor Stinner506f5922011-09-28 22:34:18 +02001212 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1213 _PyUnicode_WSTR(unicode), end,
1214 PyUnicode_2BYTE_DATA(unicode));
1215 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1216 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1217 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001218 _PyUnicode_UTF8(unicode) = NULL;
1219 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001220 PyObject_FREE(_PyUnicode_WSTR(unicode));
1221 _PyUnicode_WSTR(unicode) = NULL;
1222 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1223#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001224 }
1225 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1226 else {
1227#if SIZEOF_WCHAR_T == 2
1228 /* in case the native representation is 2-bytes, we need to allocate a
1229 new normalized 4-byte version. */
1230 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001231 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1232 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001233 PyErr_NoMemory();
1234 return -1;
1235 }
1236 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1237 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001238 _PyUnicode_UTF8(unicode) = NULL;
1239 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001240 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1241 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001242 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001243 PyObject_FREE(_PyUnicode_WSTR(unicode));
1244 _PyUnicode_WSTR(unicode) = NULL;
1245 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1246#else
1247 assert(num_surrogates == 0);
1248
Victor Stinnerc3c74152011-10-02 20:39:55 +02001249 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001250 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001251 _PyUnicode_UTF8(unicode) = NULL;
1252 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001253 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1254#endif
1255 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1256 }
1257 _PyUnicode_STATE(unicode).ready = 1;
1258 return 0;
1259}
1260
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001261int
1262_PyUnicode_ReadyReplace(PyObject **op)
1263{
1264 return unicode_ready(op, 1);
1265}
1266
1267int
1268_PyUnicode_Ready(PyObject *op)
1269{
1270 return unicode_ready(&op, 0);
1271}
1272
Alexander Belopolsky40018472011-02-26 01:02:56 +00001273static void
1274unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001275{
Walter Dörwald16807132007-05-25 13:52:07 +00001276 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001277 case SSTATE_NOT_INTERNED:
1278 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001279
Benjamin Peterson29060642009-01-31 22:14:21 +00001280 case SSTATE_INTERNED_MORTAL:
1281 /* revive dead object temporarily for DelItem */
1282 Py_REFCNT(unicode) = 3;
1283 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
1284 Py_FatalError(
1285 "deletion of interned string failed");
1286 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001287
Benjamin Peterson29060642009-01-31 22:14:21 +00001288 case SSTATE_INTERNED_IMMORTAL:
1289 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001290
Benjamin Peterson29060642009-01-31 22:14:21 +00001291 default:
1292 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001293 }
1294
Victor Stinner03490912011-10-03 23:45:12 +02001295 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001296 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001297 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001298 PyObject_DEL(_PyUnicode_UTF8(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001299
1300 if (PyUnicode_IS_COMPACT(unicode)) {
1301 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001302 }
1303 else {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001304 if (_PyUnicode_DATA_ANY(unicode))
1305 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Benjamin Peterson29060642009-01-31 22:14:21 +00001306 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001307 }
1308}
1309
Alexander Belopolsky40018472011-02-26 01:02:56 +00001310static int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001311unicode_resizable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001312{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001313 if (Py_REFCNT(unicode) != 1)
1314 return 0;
1315 if (PyUnicode_CHECK_INTERNED(unicode))
1316 return 0;
Benjamin Peterson7f3140e2011-10-03 19:37:29 -04001317 assert(unicode != unicode_empty);
Victor Stinner77bb47b2011-10-03 20:06:05 +02001318#ifdef Py_DEBUG
1319 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND
1320 && PyUnicode_GET_LENGTH(unicode) == 1)
1321 {
1322 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001323 if (ch < 256 && unicode_latin1[ch] == unicode)
1324 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001325 }
Victor Stinner77bb47b2011-10-03 20:06:05 +02001326#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001327 return 1;
1328}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001329
Victor Stinnerfe226c02011-10-03 03:52:20 +02001330static int
1331unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1332{
1333 PyObject *unicode;
1334 Py_ssize_t old_length;
1335
1336 assert(p_unicode != NULL);
1337 unicode = *p_unicode;
1338
1339 assert(unicode != NULL);
1340 assert(PyUnicode_Check(unicode));
1341 assert(0 <= length);
1342
Victor Stinner910337b2011-10-03 03:20:16 +02001343 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001344 old_length = PyUnicode_WSTR_LENGTH(unicode);
1345 else
1346 old_length = PyUnicode_GET_LENGTH(unicode);
1347 if (old_length == length)
1348 return 0;
1349
Victor Stinnerfe226c02011-10-03 03:52:20 +02001350 if (!unicode_resizable(unicode)) {
1351 PyObject *copy = resize_copy(unicode, length);
1352 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001353 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001354 Py_DECREF(*p_unicode);
1355 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001356 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001357 }
1358
Victor Stinnerfe226c02011-10-03 03:52:20 +02001359 if (PyUnicode_IS_COMPACT(unicode)) {
1360 *p_unicode = resize_compact(unicode, length);
1361 if (*p_unicode == NULL)
1362 return -1;
Benjamin Petersonccc51c12011-10-03 19:34:12 -04001363 _PyUnicode_CheckConsistency(*p_unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001364 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001365 }
1366 return resize_inplace((PyUnicodeObject*)unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001367}
1368
Alexander Belopolsky40018472011-02-26 01:02:56 +00001369int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001370PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001371{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001372 PyObject *unicode;
1373 if (p_unicode == NULL) {
1374 PyErr_BadInternalCall();
1375 return -1;
1376 }
1377 unicode = *p_unicode;
1378 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0
1379 || _PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND)
1380 {
1381 PyErr_BadInternalCall();
1382 return -1;
1383 }
1384 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001385}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001386
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001387static PyObject*
1388get_latin1_char(unsigned char ch)
1389{
Victor Stinnera464fc12011-10-02 20:39:30 +02001390 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001391 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001392 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001393 if (!unicode)
1394 return NULL;
1395 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
1396 unicode_latin1[ch] = unicode;
1397 }
1398 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001399 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001400}
1401
Alexander Belopolsky40018472011-02-26 01:02:56 +00001402PyObject *
1403PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001404{
1405 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001406 Py_UCS4 maxchar = 0;
1407 Py_ssize_t num_surrogates;
1408
1409 if (u == NULL)
1410 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001411
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001412 /* If the Unicode data is known at construction time, we can apply
1413 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001414
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001415 /* Optimization for empty strings */
1416 if (size == 0 && unicode_empty != NULL) {
1417 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001418 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001419 }
Tim Petersced69f82003-09-16 20:30:58 +00001420
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001421 /* Single character Unicode objects in the Latin-1 range are
1422 shared when using this constructor */
1423 if (size == 1 && *u < 256)
1424 return get_latin1_char((unsigned char)*u);
1425
1426 /* If not empty and not single character, copy the Unicode data
1427 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001428 if (find_maxchar_surrogates(u, u + size,
1429 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001430 return NULL;
1431
1432 unicode = (PyUnicodeObject *) PyUnicode_New(size - num_surrogates,
1433 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001434 if (!unicode)
1435 return NULL;
1436
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001437 switch (PyUnicode_KIND(unicode)) {
1438 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001439 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001440 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1441 break;
1442 case PyUnicode_2BYTE_KIND:
1443#if Py_UNICODE_SIZE == 2
1444 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1445#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001446 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001447 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1448#endif
1449 break;
1450 case PyUnicode_4BYTE_KIND:
1451#if SIZEOF_WCHAR_T == 2
1452 /* This is the only case which has to process surrogates, thus
1453 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001454 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001455#else
1456 assert(num_surrogates == 0);
1457 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1458#endif
1459 break;
1460 default:
1461 assert(0 && "Impossible state");
1462 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001463
1464 return (PyObject *)unicode;
1465}
1466
Alexander Belopolsky40018472011-02-26 01:02:56 +00001467PyObject *
1468PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001469{
1470 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +00001471
Benjamin Peterson14339b62009-01-31 16:36:08 +00001472 if (size < 0) {
1473 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001474 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001475 return NULL;
1476 }
Christian Heimes33fe8092008-04-13 13:53:33 +00001477
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001478 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +00001479 some optimizations which share commonly used objects.
1480 Also, this means the input must be UTF-8, so fall back to the
1481 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001482 if (u != NULL) {
1483
Benjamin Peterson29060642009-01-31 22:14:21 +00001484 /* Optimization for empty strings */
1485 if (size == 0 && unicode_empty != NULL) {
1486 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001487 return unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001488 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001489
1490 /* Single characters are shared when using this constructor.
1491 Restrict to ASCII, since the input must be UTF-8. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001492 if (size == 1 && Py_CHARMASK(*u) < 128)
1493 return get_latin1_char(Py_CHARMASK(*u));
Martin v. Löwis9c121062007-08-05 20:26:11 +00001494
1495 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001496 }
1497
Walter Dörwald55507312007-05-18 13:12:10 +00001498 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001499 if (!unicode)
1500 return NULL;
1501
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001502 return (PyObject *)unicode;
1503}
1504
Alexander Belopolsky40018472011-02-26 01:02:56 +00001505PyObject *
1506PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001507{
1508 size_t size = strlen(u);
1509 if (size > PY_SSIZE_T_MAX) {
1510 PyErr_SetString(PyExc_OverflowError, "input too long");
1511 return NULL;
1512 }
1513
1514 return PyUnicode_FromStringAndSize(u, size);
1515}
1516
Victor Stinnere57b1c02011-09-28 22:20:48 +02001517static PyObject*
1518_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001519{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001520 PyObject *res;
1521 unsigned char max = 127;
1522 Py_ssize_t i;
1523 for (i = 0; i < size; i++) {
1524 if (u[i] & 0x80) {
1525 max = 255;
1526 break;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001527 }
1528 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001529 res = PyUnicode_New(size, max);
1530 if (!res)
1531 return NULL;
1532 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
1533 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001534}
1535
Victor Stinnere57b1c02011-09-28 22:20:48 +02001536static PyObject*
1537_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001538{
1539 PyObject *res;
1540 Py_UCS2 max = 0;
1541 Py_ssize_t i;
1542 for (i = 0; i < size; i++)
1543 if (u[i] > max)
1544 max = u[i];
1545 res = PyUnicode_New(size, max);
1546 if (!res)
1547 return NULL;
1548 if (max >= 256)
1549 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
1550 else
1551 for (i = 0; i < size; i++)
1552 PyUnicode_1BYTE_DATA(res)[i] = (Py_UCS1)u[i];
1553 return res;
1554}
1555
Victor Stinnere57b1c02011-09-28 22:20:48 +02001556static PyObject*
1557_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001558{
1559 PyObject *res;
1560 Py_UCS4 max = 0;
1561 Py_ssize_t i;
1562 for (i = 0; i < size; i++)
1563 if (u[i] > max)
1564 max = u[i];
1565 res = PyUnicode_New(size, max);
1566 if (!res)
1567 return NULL;
1568 if (max >= 0x10000)
1569 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
1570 else {
1571 int kind = PyUnicode_KIND(res);
1572 void *data = PyUnicode_DATA(res);
1573 for (i = 0; i < size; i++)
1574 PyUnicode_WRITE(kind, data, i, u[i]);
1575 }
1576 return res;
1577}
1578
1579PyObject*
1580PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1581{
1582 switch(kind) {
1583 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001584 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001585 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001586 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001587 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001588 return _PyUnicode_FromUCS4(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001589 }
Victor Stinner01698042011-10-04 00:04:26 +02001590 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001591 return NULL;
1592}
1593
Victor Stinner034f6cf2011-09-30 02:26:44 +02001594PyObject*
1595PyUnicode_Copy(PyObject *unicode)
1596{
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001597 Py_ssize_t size;
1598 PyObject *copy;
1599 void *data;
1600
Victor Stinner034f6cf2011-09-30 02:26:44 +02001601 if (!PyUnicode_Check(unicode)) {
1602 PyErr_BadInternalCall();
1603 return NULL;
1604 }
1605 if (PyUnicode_READY(unicode))
1606 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001607
1608 size = PyUnicode_GET_LENGTH(unicode);
1609 copy = PyUnicode_New(size, PyUnicode_MAX_CHAR_VALUE(unicode));
1610 if (!copy)
1611 return NULL;
1612 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1613
1614 data = PyUnicode_DATA(unicode);
1615 switch (PyUnicode_KIND(unicode))
1616 {
1617 case PyUnicode_1BYTE_KIND:
1618 memcpy(PyUnicode_1BYTE_DATA(copy), data, size);
1619 break;
1620 case PyUnicode_2BYTE_KIND:
1621 memcpy(PyUnicode_2BYTE_DATA(copy), data, sizeof(Py_UCS2) * size);
1622 break;
1623 case PyUnicode_4BYTE_KIND:
1624 memcpy(PyUnicode_4BYTE_DATA(copy), data, sizeof(Py_UCS4) * size);
1625 break;
1626 default:
1627 assert(0);
1628 break;
1629 }
1630 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02001631}
1632
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001633
Victor Stinnerbc603d12011-10-02 01:00:40 +02001634/* Widen Unicode objects to larger buffers. Don't write terminating null
1635 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001636
1637void*
1638_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1639{
Victor Stinnerbc603d12011-10-02 01:00:40 +02001640 Py_ssize_t len;
1641 void *result;
1642 unsigned int skind;
1643
1644 if (PyUnicode_READY(s))
1645 return NULL;
1646
1647 len = PyUnicode_GET_LENGTH(s);
1648 skind = PyUnicode_KIND(s);
1649 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02001650 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001651 return NULL;
1652 }
1653 switch(kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02001654 case PyUnicode_2BYTE_KIND:
1655 result = PyMem_Malloc(len * sizeof(Py_UCS2));
1656 if (!result)
1657 return PyErr_NoMemory();
1658 assert(skind == PyUnicode_1BYTE_KIND);
1659 _PyUnicode_CONVERT_BYTES(
1660 Py_UCS1, Py_UCS2,
1661 PyUnicode_1BYTE_DATA(s),
1662 PyUnicode_1BYTE_DATA(s) + len,
1663 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001664 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001665 case PyUnicode_4BYTE_KIND:
1666 result = PyMem_Malloc(len * sizeof(Py_UCS4));
1667 if (!result)
1668 return PyErr_NoMemory();
1669 if (skind == PyUnicode_2BYTE_KIND) {
1670 _PyUnicode_CONVERT_BYTES(
1671 Py_UCS2, Py_UCS4,
1672 PyUnicode_2BYTE_DATA(s),
1673 PyUnicode_2BYTE_DATA(s) + len,
1674 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001675 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02001676 else {
1677 assert(skind == PyUnicode_1BYTE_KIND);
1678 _PyUnicode_CONVERT_BYTES(
1679 Py_UCS1, Py_UCS4,
1680 PyUnicode_1BYTE_DATA(s),
1681 PyUnicode_1BYTE_DATA(s) + len,
1682 result);
1683 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001684 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001685 default:
1686 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001687 }
Victor Stinner01698042011-10-04 00:04:26 +02001688 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001689 return NULL;
1690}
1691
1692static Py_UCS4*
1693as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1694 int copy_null)
1695{
1696 int kind;
1697 void *data;
1698 Py_ssize_t len, targetlen;
1699 if (PyUnicode_READY(string) == -1)
1700 return NULL;
1701 kind = PyUnicode_KIND(string);
1702 data = PyUnicode_DATA(string);
1703 len = PyUnicode_GET_LENGTH(string);
1704 targetlen = len;
1705 if (copy_null)
1706 targetlen++;
1707 if (!target) {
1708 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
1709 PyErr_NoMemory();
1710 return NULL;
1711 }
1712 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
1713 if (!target) {
1714 PyErr_NoMemory();
1715 return NULL;
1716 }
1717 }
1718 else {
1719 if (targetsize < targetlen) {
1720 PyErr_Format(PyExc_SystemError,
1721 "string is longer than the buffer");
1722 if (copy_null && 0 < targetsize)
1723 target[0] = 0;
1724 return NULL;
1725 }
1726 }
1727 if (kind != PyUnicode_4BYTE_KIND) {
1728 Py_ssize_t i;
1729 for (i = 0; i < len; i++)
1730 target[i] = PyUnicode_READ(kind, data, i);
1731 }
1732 else
1733 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
1734 if (copy_null)
1735 target[len] = 0;
1736 return target;
1737}
1738
1739Py_UCS4*
1740PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1741 int copy_null)
1742{
1743 if (target == NULL || targetsize < 1) {
1744 PyErr_BadInternalCall();
1745 return NULL;
1746 }
1747 return as_ucs4(string, target, targetsize, copy_null);
1748}
1749
1750Py_UCS4*
1751PyUnicode_AsUCS4Copy(PyObject *string)
1752{
1753 return as_ucs4(string, NULL, 0, 1);
1754}
1755
1756#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00001757
Alexander Belopolsky40018472011-02-26 01:02:56 +00001758PyObject *
1759PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001760{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001761 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +00001762 if (size == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001763 return PyUnicode_New(0, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00001764 PyErr_BadInternalCall();
1765 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001766 }
1767
Martin v. Löwis790465f2008-04-05 20:41:37 +00001768 if (size == -1) {
1769 size = wcslen(w);
1770 }
1771
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001772 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001773}
1774
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001775#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00001776
Walter Dörwald346737f2007-05-31 10:44:43 +00001777static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001778makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
1779 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00001780{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001781 *fmt++ = '%';
1782 if (width) {
1783 if (zeropad)
1784 *fmt++ = '0';
1785 fmt += sprintf(fmt, "%d", width);
1786 }
1787 if (precision)
1788 fmt += sprintf(fmt, ".%d", precision);
1789 if (longflag)
1790 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001791 else if (longlongflag) {
1792 /* longlongflag should only ever be nonzero on machines with
1793 HAVE_LONG_LONG defined */
1794#ifdef HAVE_LONG_LONG
1795 char *f = PY_FORMAT_LONG_LONG;
1796 while (*f)
1797 *fmt++ = *f++;
1798#else
1799 /* we shouldn't ever get here */
1800 assert(0);
1801 *fmt++ = 'l';
1802#endif
1803 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001804 else if (size_tflag) {
1805 char *f = PY_FORMAT_SIZE_T;
1806 while (*f)
1807 *fmt++ = *f++;
1808 }
1809 *fmt++ = c;
1810 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00001811}
1812
/* helper for parse_format_flags(): true for the integer conversion
   characters that may follow a length modifier. */
static int
format_is_integer_code(char ch)
{
    return ch == 'd' || ch == 'u' || ch == 'i';
}

/* helper for PyUnicode_FromFormatV()

   Parse the part of a format specification that follows the '%' at `f`:
   an optional width, an optional ".precision", and an optional length
   modifier ("l", "ll" when HAVE_LONG_LONG is defined, or "z").  A length
   modifier is only recognized when it is directly followed by 'd', 'u'
   or 'i'.

   Each of the p_* output pointers may be NULL; those that are not
   receive the parsed value (0 when the item is absent).

   Returns a pointer to the character at which parsing stopped, normally
   the conversion character.  For the malformed inputs handled below the
   pointer is moved back by one character instead. */

static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int width = 0;
    int precision = 0;
    int longflag = 0;
    int longlongflag = 0;
    int size_tflag = 0;

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    f++;    /* step over the '%' */
    for (; Py_ISDIGIT((unsigned)*f); f++)
        width = (width*10) + *f - '0';

    if (*f == '.') {
        f++;
        for (; Py_ISDIGIT((unsigned)*f); f++)
            precision = (precision*10) + *f - '0';
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        f--;
    }

    /* Length modifiers: %ld, %lu, %li, their "ll" variants, and the
       size_t forms %zd, %zu, %zi. */
    if (*f == 'l') {
        if (format_is_integer_code(f[1])) {
            longflag = 1;
            f += 1;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' && format_is_integer_code(f[2])) {
            longlongflag = 1;
            f += 2;
        }
#endif
    }
    else if (*f == 'z' && format_is_integer_code(f[1])) {
        size_tflag = 1;
        f += 1;
    }

    /* Report the results through whichever outputs were requested. */
    if (p_width != NULL)
        *p_width = width;
    if (p_precision != NULL)
        *p_precision = precision;
    if (p_longflag != NULL)
        *p_longflag = longflag;
    if (p_longlongflag != NULL)
        *p_longlongflag = longlongflag;
    if (p_size_tflag != NULL)
        *p_size_tflag = size_tflag;
    return f;
}
1877
/* Upper bound on the number of characters produced by %ld: 21
   characters are enough for a 64-bit integer in decimal together with
   an optional sign. */
#define MAX_LONG_CHARS 21

/* Upper bound on the number of characters produced by %lld.
   At most ceil(log10(256)*SIZEOF_LONG_LONG) digits are needed, plus 1
   for the sign; 53/22 is used as an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + ((SIZEOF_LONG_LONG * 53) - 1) / 22)
1885
Walter Dörwaldd2034312007-05-18 16:29:38 +00001886PyObject *
1887PyUnicode_FromFormatV(const char *format, va_list vargs)
1888{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001889 va_list count;
1890 Py_ssize_t callcount = 0;
1891 PyObject **callresults = NULL;
1892 PyObject **callresult = NULL;
1893 Py_ssize_t n = 0;
1894 int width = 0;
1895 int precision = 0;
1896 int zeropad;
1897 const char* f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001898 PyUnicodeObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001899 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001900 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001901 Py_UCS4 maxchar = 127; /* result is ASCII by default */
1902 Py_UCS4 argmaxchar;
1903 Py_ssize_t numbersize = 0;
1904 char *numberresults = NULL;
1905 char *numberresult = NULL;
1906 Py_ssize_t i;
1907 int kind;
1908 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001909
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001910 Py_VA_COPY(count, vargs);
    /* step 1: count the number of %S/%R/%A/%s/%V format specifications
     * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
     * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
     * result in an array)
     * also estimate an upper bound for all the number formats in the string,
     * numbers will be formatted in step 3 and be kept in a '\0'-separated
     * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00001918 for (f = format; *f; f++) {
1919 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00001920 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001921 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
1922 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
1923 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
1924 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001925
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001926 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001927#ifdef HAVE_LONG_LONG
1928 if (longlongflag) {
1929 if (width < MAX_LONG_LONG_CHARS)
1930 width = MAX_LONG_LONG_CHARS;
1931 }
1932 else
1933#endif
1934 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
1935 including sign. Decimal takes the most space. This
1936 isn't enough for octal. If a width is specified we
1937 need more (which we allocate later). */
1938 if (width < MAX_LONG_CHARS)
1939 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001940
1941 /* account for the size + '\0' to separate numbers
1942 inside of the numberresults buffer */
1943 numbersize += (width + 1);
1944 }
1945 }
1946 else if ((unsigned char)*f > 127) {
1947 PyErr_Format(PyExc_ValueError,
1948 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
1949 "string, got a non-ASCII byte: 0x%02x",
1950 (unsigned char)*f);
1951 return NULL;
1952 }
1953 }
1954 /* step 2: allocate memory for the results of
1955 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
1956 if (callcount) {
1957 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
1958 if (!callresults) {
1959 PyErr_NoMemory();
1960 return NULL;
1961 }
1962 callresult = callresults;
1963 }
1964 /* step 2.5: allocate memory for the results of formating numbers */
1965 if (numbersize) {
1966 numberresults = PyObject_Malloc(numbersize);
1967 if (!numberresults) {
1968 PyErr_NoMemory();
1969 goto fail;
1970 }
1971 numberresult = numberresults;
1972 }
1973
1974 /* step 3: format numbers and figure out how large a buffer we need */
1975 for (f = format; *f; f++) {
1976 if (*f == '%') {
1977 const char* p;
1978 int longflag;
1979 int longlongflag;
1980 int size_tflag;
1981 int numprinted;
1982
1983 p = f;
1984 zeropad = (f[1] == '0');
1985 f = parse_format_flags(f, &width, &precision,
1986 &longflag, &longlongflag, &size_tflag);
1987 switch (*f) {
1988 case 'c':
1989 {
1990 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001991 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001992 n++;
1993 break;
1994 }
1995 case '%':
1996 n++;
1997 break;
1998 case 'i':
1999 case 'd':
2000 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2001 width, precision, *f);
2002 if (longflag)
2003 numprinted = sprintf(numberresult, fmt,
2004 va_arg(count, long));
2005#ifdef HAVE_LONG_LONG
2006 else if (longlongflag)
2007 numprinted = sprintf(numberresult, fmt,
2008 va_arg(count, PY_LONG_LONG));
2009#endif
2010 else if (size_tflag)
2011 numprinted = sprintf(numberresult, fmt,
2012 va_arg(count, Py_ssize_t));
2013 else
2014 numprinted = sprintf(numberresult, fmt,
2015 va_arg(count, int));
2016 n += numprinted;
2017 /* advance by +1 to skip over the '\0' */
2018 numberresult += (numprinted + 1);
2019 assert(*(numberresult - 1) == '\0');
2020 assert(*(numberresult - 2) != '\0');
2021 assert(numprinted >= 0);
2022 assert(numberresult <= numberresults + numbersize);
2023 break;
2024 case 'u':
2025 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2026 width, precision, 'u');
2027 if (longflag)
2028 numprinted = sprintf(numberresult, fmt,
2029 va_arg(count, unsigned long));
2030#ifdef HAVE_LONG_LONG
2031 else if (longlongflag)
2032 numprinted = sprintf(numberresult, fmt,
2033 va_arg(count, unsigned PY_LONG_LONG));
2034#endif
2035 else if (size_tflag)
2036 numprinted = sprintf(numberresult, fmt,
2037 va_arg(count, size_t));
2038 else
2039 numprinted = sprintf(numberresult, fmt,
2040 va_arg(count, unsigned int));
2041 n += numprinted;
2042 numberresult += (numprinted + 1);
2043 assert(*(numberresult - 1) == '\0');
2044 assert(*(numberresult - 2) != '\0');
2045 assert(numprinted >= 0);
2046 assert(numberresult <= numberresults + numbersize);
2047 break;
2048 case 'x':
2049 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2050 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2051 n += numprinted;
2052 numberresult += (numprinted + 1);
2053 assert(*(numberresult - 1) == '\0');
2054 assert(*(numberresult - 2) != '\0');
2055 assert(numprinted >= 0);
2056 assert(numberresult <= numberresults + numbersize);
2057 break;
2058 case 'p':
2059 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2060 /* %p is ill-defined: ensure leading 0x. */
2061 if (numberresult[1] == 'X')
2062 numberresult[1] = 'x';
2063 else if (numberresult[1] != 'x') {
2064 memmove(numberresult + 2, numberresult,
2065 strlen(numberresult) + 1);
2066 numberresult[0] = '0';
2067 numberresult[1] = 'x';
2068 numprinted += 2;
2069 }
2070 n += numprinted;
2071 numberresult += (numprinted + 1);
2072 assert(*(numberresult - 1) == '\0');
2073 assert(*(numberresult - 2) != '\0');
2074 assert(numprinted >= 0);
2075 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002076 break;
2077 case 's':
2078 {
2079 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002080 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002081 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
2082 if (!str)
2083 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002084 /* since PyUnicode_DecodeUTF8 returns already flexible
2085 unicode objects, there is no need to call ready on them */
2086 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002087 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002088 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002089 /* Remember the str and switch to the next slot */
2090 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002091 break;
2092 }
2093 case 'U':
2094 {
2095 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002096 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002097 if (PyUnicode_READY(obj) == -1)
2098 goto fail;
2099 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002100 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002101 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002102 break;
2103 }
2104 case 'V':
2105 {
2106 PyObject *obj = va_arg(count, PyObject *);
2107 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002108 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002109 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002110 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002111 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002112 if (PyUnicode_READY(obj) == -1)
2113 goto fail;
2114 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002115 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002116 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002117 *callresult++ = NULL;
2118 }
2119 else {
2120 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
2121 if (!str_obj)
2122 goto fail;
Victor Stinnere1335c72011-10-04 20:53:03 +02002123 if (PyUnicode_READY(str_obj)) {
2124 Py_DECREF(str_obj);
2125 goto fail;
2126 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002127 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002128 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002129 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002130 *callresult++ = str_obj;
2131 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002132 break;
2133 }
2134 case 'S':
2135 {
2136 PyObject *obj = va_arg(count, PyObject *);
2137 PyObject *str;
2138 assert(obj);
2139 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002140 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002141 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002142 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002143 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002144 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002145 /* Remember the str and switch to the next slot */
2146 *callresult++ = str;
2147 break;
2148 }
2149 case 'R':
2150 {
2151 PyObject *obj = va_arg(count, PyObject *);
2152 PyObject *repr;
2153 assert(obj);
2154 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002155 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002156 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002157 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002158 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002159 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002160 /* Remember the repr and switch to the next slot */
2161 *callresult++ = repr;
2162 break;
2163 }
2164 case 'A':
2165 {
2166 PyObject *obj = va_arg(count, PyObject *);
2167 PyObject *ascii;
2168 assert(obj);
2169 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002170 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002171 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002172 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002173 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002174 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002175 /* Remember the repr and switch to the next slot */
2176 *callresult++ = ascii;
2177 break;
2178 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002179 default:
2180 /* if we stumble upon an unknown
2181 formatting code, copy the rest of
2182 the format string to the output
2183 string. (we cannot just skip the
2184 code, since there's no way to know
2185 what's in the argument list) */
2186 n += strlen(p);
2187 goto expand;
2188 }
2189 } else
2190 n++;
2191 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002192 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002193 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002194 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002195 we don't have to resize the string.
2196 There can be no errors beyond this point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002197 string = (PyUnicodeObject *)PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002198 if (!string)
2199 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002200 kind = PyUnicode_KIND(string);
2201 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002202 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002203 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002204
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002205 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002206 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002207 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002208
2209 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002210 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2211 /* checking for == because the last argument could be a empty
2212 string, which causes i to point to end, the assert at the end of
2213 the loop */
2214 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002215
Benjamin Peterson14339b62009-01-31 16:36:08 +00002216 switch (*f) {
2217 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002218 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002219 const int ordinal = va_arg(vargs, int);
2220 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002221 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002222 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002223 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002224 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002225 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002226 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002227 case 'p':
2228 /* unused, since we already have the result */
2229 if (*f == 'p')
2230 (void) va_arg(vargs, void *);
2231 else
2232 (void) va_arg(vargs, int);
2233 /* extract the result from numberresults and append. */
2234 for (; *numberresult; ++i, ++numberresult)
2235 PyUnicode_WRITE(kind, data, i, *numberresult);
2236 /* skip over the separating '\0' */
2237 assert(*numberresult == '\0');
2238 numberresult++;
2239 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002240 break;
2241 case 's':
2242 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002243 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002244 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002245 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002246 size = PyUnicode_GET_LENGTH(*callresult);
2247 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002248 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2249 *callresult, 0,
2250 size) < 0)
2251 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002252 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002253 /* We're done with the unicode()/repr() => forget it */
2254 Py_DECREF(*callresult);
2255 /* switch to next unicode()/repr() result */
2256 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002257 break;
2258 }
2259 case 'U':
2260 {
2261 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002262 Py_ssize_t size;
2263 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2264 size = PyUnicode_GET_LENGTH(obj);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002265 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2266 obj, 0,
2267 size) < 0)
2268 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002269 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002270 break;
2271 }
2272 case 'V':
2273 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002274 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002275 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002276 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002277 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002278 size = PyUnicode_GET_LENGTH(obj);
2279 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002280 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2281 obj, 0,
2282 size) < 0)
2283 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002284 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002285 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002286 size = PyUnicode_GET_LENGTH(*callresult);
2287 assert(PyUnicode_KIND(*callresult) <=
2288 PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002289 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2290 *callresult,
2291 0, size) < 0)
2292 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002293 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002294 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002295 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002296 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002297 break;
2298 }
2299 case 'S':
2300 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002301 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002302 {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002303 /* unused, since we already have the result */
2304 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002305 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002306 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2307 *callresult, 0,
2308 PyUnicode_GET_LENGTH(*callresult)) < 0)
2309 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002310 i += PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002311 /* We're done with the unicode()/repr() => forget it */
2312 Py_DECREF(*callresult);
2313 /* switch to next unicode()/repr() result */
2314 ++callresult;
2315 break;
2316 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002317 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002318 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002319 break;
2320 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002321 for (; *p; ++p, ++i)
2322 PyUnicode_WRITE(kind, data, i, *p);
2323 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002324 goto end;
2325 }
Victor Stinner1205f272010-09-11 00:54:47 +00002326 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002327 else {
2328 assert(i < PyUnicode_GET_LENGTH(string));
2329 PyUnicode_WRITE(kind, data, i++, *f);
2330 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002331 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002332 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002333
Benjamin Peterson29060642009-01-31 22:14:21 +00002334 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002335 if (callresults)
2336 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002337 if (numberresults)
2338 PyObject_Free(numberresults);
2339 return (PyObject *)string;
Benjamin Peterson29060642009-01-31 22:14:21 +00002340 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002341 if (callresults) {
2342 PyObject **callresult2 = callresults;
2343 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002344 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002345 ++callresult2;
2346 }
2347 PyObject_Free(callresults);
2348 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002349 if (numberresults)
2350 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002351 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002352}
2353
Walter Dörwaldd2034312007-05-18 16:29:38 +00002354PyObject *
2355PyUnicode_FromFormat(const char *format, ...)
2356{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002357 PyObject* ret;
2358 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002359
2360#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002361 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002362#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002363 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002364#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002365 ret = PyUnicode_FromFormatV(format, vargs);
2366 va_end(vargs);
2367 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002368}
2369
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002370#ifdef HAVE_WCHAR_H
2371
Victor Stinner5593d8a2010-10-02 11:11:27 +00002372/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2373 convert a Unicode object to a wide character string.
2374
Victor Stinnerd88d9832011-09-06 02:00:05 +02002375 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002376 character) required to convert the unicode object. Ignore size argument.
2377
Victor Stinnerd88d9832011-09-06 02:00:05 +02002378 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002379 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002380 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002381static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00002382unicode_aswidechar(PyUnicodeObject *unicode,
2383 wchar_t *w,
2384 Py_ssize_t size)
2385{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002386 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002387 const wchar_t *wstr;
2388
2389 wstr = PyUnicode_AsUnicodeAndSize((PyObject *)unicode, &res);
2390 if (wstr == NULL)
2391 return -1;
2392
Victor Stinner5593d8a2010-10-02 11:11:27 +00002393 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002394 if (size > res)
2395 size = res + 1;
2396 else
2397 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002398 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002399 return res;
2400 }
2401 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002402 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002403}
2404
2405Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002406PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002407 wchar_t *w,
2408 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002409{
2410 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002411 PyErr_BadInternalCall();
2412 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002413 }
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002414 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002415}
2416
Victor Stinner137c34c2010-09-29 10:25:54 +00002417wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002418PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002419 Py_ssize_t *size)
2420{
2421 wchar_t* buffer;
2422 Py_ssize_t buflen;
2423
2424 if (unicode == NULL) {
2425 PyErr_BadInternalCall();
2426 return NULL;
2427 }
2428
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002429 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002430 if (buflen == -1)
2431 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002432 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002433 PyErr_NoMemory();
2434 return NULL;
2435 }
2436
Victor Stinner137c34c2010-09-29 10:25:54 +00002437 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2438 if (buffer == NULL) {
2439 PyErr_NoMemory();
2440 return NULL;
2441 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002442 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002443 if (buflen == -1)
2444 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002445 if (size != NULL)
2446 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002447 return buffer;
2448}
2449
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002450#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002451
Alexander Belopolsky40018472011-02-26 01:02:56 +00002452PyObject *
2453PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002454{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002455 PyObject *v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002456 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002457 PyErr_SetString(PyExc_ValueError,
2458 "chr() arg not in range(0x110000)");
2459 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002460 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002461
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002462 if (ordinal < 256)
2463 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002464
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002465 v = PyUnicode_New(1, ordinal);
2466 if (v == NULL)
2467 return NULL;
2468 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
2469 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002470}
2471
Alexander Belopolsky40018472011-02-26 01:02:56 +00002472PyObject *
2473PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002474{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002475 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002476 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002477 if (PyUnicode_CheckExact(obj)) {
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002478 if (PyUnicode_READY(obj))
2479 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002480 Py_INCREF(obj);
2481 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002482 }
2483 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002484 /* For a Unicode subtype that's not a Unicode object,
2485 return a true Unicode object with the same data. */
Victor Stinner2219e0a2011-10-01 01:16:59 +02002486 return PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002487 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002488 PyErr_Format(PyExc_TypeError,
2489 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002490 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002491 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002492}
2493
Alexander Belopolsky40018472011-02-26 01:02:56 +00002494PyObject *
2495PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002496 const char *encoding,
2497 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002498{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002499 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002500 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002501
Guido van Rossumd57fd912000-03-10 22:53:23 +00002502 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002503 PyErr_BadInternalCall();
2504 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002505 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002506
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002507 /* Decoding bytes objects is the most common case and should be fast */
2508 if (PyBytes_Check(obj)) {
2509 if (PyBytes_GET_SIZE(obj) == 0) {
2510 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002511 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002512 }
2513 else {
2514 v = PyUnicode_Decode(
2515 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2516 encoding, errors);
2517 }
2518 return v;
2519 }
2520
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002521 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002522 PyErr_SetString(PyExc_TypeError,
2523 "decoding str is not supported");
2524 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002525 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002526
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002527 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2528 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2529 PyErr_Format(PyExc_TypeError,
2530 "coercing to str: need bytes, bytearray "
2531 "or buffer-like object, %.80s found",
2532 Py_TYPE(obj)->tp_name);
2533 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002534 }
Tim Petersced69f82003-09-16 20:30:58 +00002535
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002536 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002537 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002538 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002539 }
Tim Petersced69f82003-09-16 20:30:58 +00002540 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002541 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002542
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002543 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002544 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002545}
2546
/* Copy encoding into lower (a buffer of lower_len bytes), folding ASCII
   upper case letters to lower case and turning every '_' into '-', so that
   spellings such as UTF_8 are recognized.

   Return 1 on success (lower is then null terminated), or 0 when encoding
   is longer than lower_len-1 characters (lower is then left without a
   terminating null character). */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src = encoding;
    char *dst = lower;
    /* Last usable slot: reserved for the terminating null character. */
    char *const limit = lower + (lower_len - 1);

    for (; *src != '\0'; src++) {
        char ch = *src;

        if (dst == limit)
            return 0;

        if (ch >= 'A' && ch <= 'Z')
            ch = (char)(ch - 'A' + 'a');
        else if (ch == '_')
            ch = '-';
        *dst++ = ch;
    }
    *dst = '\0';
    return 1;
}
2579
Alexander Belopolsky40018472011-02-26 01:02:56 +00002580PyObject *
2581PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002582 Py_ssize_t size,
2583 const char *encoding,
2584 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002585{
2586 PyObject *buffer = NULL, *unicode;
2587 Py_buffer info;
2588 char lower[11]; /* Enough for any encoding shortcut */
2589
2590 if (encoding == NULL)
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002591 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00002592
2593 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002594 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002595 if ((strcmp(lower, "utf-8") == 0) ||
2596 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002597 return PyUnicode_DecodeUTF8(s, size, errors);
2598 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002599 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002600 (strcmp(lower, "iso-8859-1") == 0))
2601 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002602#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002603 else if (strcmp(lower, "mbcs") == 0)
2604 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002605#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002606 else if (strcmp(lower, "ascii") == 0)
2607 return PyUnicode_DecodeASCII(s, size, errors);
2608 else if (strcmp(lower, "utf-16") == 0)
2609 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2610 else if (strcmp(lower, "utf-32") == 0)
2611 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2612 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002613
2614 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002615 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002616 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002617 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002618 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002619 if (buffer == NULL)
2620 goto onError;
2621 unicode = PyCodec_Decode(buffer, encoding, errors);
2622 if (unicode == NULL)
2623 goto onError;
2624 if (!PyUnicode_Check(unicode)) {
2625 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002626 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002627 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002628 Py_DECREF(unicode);
2629 goto onError;
2630 }
2631 Py_DECREF(buffer);
Victor Stinner17efeed2011-10-04 20:05:46 +02002632#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02002633 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002634 Py_DECREF(unicode);
2635 return NULL;
2636 }
Victor Stinner17efeed2011-10-04 20:05:46 +02002637#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002638 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002639
Benjamin Peterson29060642009-01-31 22:14:21 +00002640 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002641 Py_XDECREF(buffer);
2642 return NULL;
2643}
2644
Alexander Belopolsky40018472011-02-26 01:02:56 +00002645PyObject *
2646PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002647 const char *encoding,
2648 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002649{
2650 PyObject *v;
2651
2652 if (!PyUnicode_Check(unicode)) {
2653 PyErr_BadArgument();
2654 goto onError;
2655 }
2656
2657 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002658 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002659
2660 /* Decode via the codec registry */
2661 v = PyCodec_Decode(unicode, encoding, errors);
2662 if (v == NULL)
2663 goto onError;
2664 return v;
2665
Benjamin Peterson29060642009-01-31 22:14:21 +00002666 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002667 return NULL;
2668}
2669
Alexander Belopolsky40018472011-02-26 01:02:56 +00002670PyObject *
2671PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002672 const char *encoding,
2673 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002674{
2675 PyObject *v;
2676
2677 if (!PyUnicode_Check(unicode)) {
2678 PyErr_BadArgument();
2679 goto onError;
2680 }
2681
2682 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002683 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002684
2685 /* Decode via the codec registry */
2686 v = PyCodec_Decode(unicode, encoding, errors);
2687 if (v == NULL)
2688 goto onError;
2689 if (!PyUnicode_Check(v)) {
2690 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002691 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002692 Py_TYPE(v)->tp_name);
2693 Py_DECREF(v);
2694 goto onError;
2695 }
2696 return v;
2697
Benjamin Peterson29060642009-01-31 22:14:21 +00002698 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002699 return NULL;
2700}
2701
Alexander Belopolsky40018472011-02-26 01:02:56 +00002702PyObject *
2703PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002704 Py_ssize_t size,
2705 const char *encoding,
2706 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002707{
2708 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002709
Guido van Rossumd57fd912000-03-10 22:53:23 +00002710 unicode = PyUnicode_FromUnicode(s, size);
2711 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002712 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002713 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
2714 Py_DECREF(unicode);
2715 return v;
2716}
2717
Alexander Belopolsky40018472011-02-26 01:02:56 +00002718PyObject *
2719PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002720 const char *encoding,
2721 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002722{
2723 PyObject *v;
2724
2725 if (!PyUnicode_Check(unicode)) {
2726 PyErr_BadArgument();
2727 goto onError;
2728 }
2729
2730 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002731 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002732
2733 /* Encode via the codec registry */
2734 v = PyCodec_Encode(unicode, encoding, errors);
2735 if (v == NULL)
2736 goto onError;
2737 return v;
2738
Benjamin Peterson29060642009-01-31 22:14:21 +00002739 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002740 return NULL;
2741}
2742
Victor Stinnerad158722010-10-27 00:25:46 +00002743PyObject *
2744PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00002745{
Victor Stinner99b95382011-07-04 14:23:54 +02002746#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002747 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2748 PyUnicode_GET_SIZE(unicode),
2749 NULL);
2750#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002751 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00002752#else
Victor Stinner793b5312011-04-27 00:24:21 +02002753 PyInterpreterState *interp = PyThreadState_GET()->interp;
2754 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2755 cannot use it to encode and decode filenames before it is loaded. Load
2756 the Python codec requires to encode at least its own filename. Use the C
2757 version of the locale codec until the codec registry is initialized and
2758 the Python codec is loaded.
2759
2760 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2761 cannot only rely on it: check also interp->fscodec_initialized for
2762 subinterpreters. */
2763 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00002764 return PyUnicode_AsEncodedString(unicode,
2765 Py_FileSystemDefaultEncoding,
2766 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00002767 }
2768 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002769 /* locale encoding with surrogateescape */
2770 wchar_t *wchar;
2771 char *bytes;
2772 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00002773 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002774
2775 wchar = PyUnicode_AsWideCharString(unicode, NULL);
2776 if (wchar == NULL)
2777 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002778 bytes = _Py_wchar2char(wchar, &error_pos);
2779 if (bytes == NULL) {
2780 if (error_pos != (size_t)-1) {
2781 char *errmsg = strerror(errno);
2782 PyObject *exc = NULL;
2783 if (errmsg == NULL)
2784 errmsg = "Py_wchar2char() failed";
2785 raise_encode_exception(&exc,
2786 "filesystemencoding",
2787 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
2788 error_pos, error_pos+1,
2789 errmsg);
2790 Py_XDECREF(exc);
2791 }
2792 else
2793 PyErr_NoMemory();
2794 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002795 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002796 }
2797 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002798
2799 bytes_obj = PyBytes_FromString(bytes);
2800 PyMem_Free(bytes);
2801 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00002802 }
Victor Stinnerad158722010-10-27 00:25:46 +00002803#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00002804}
2805
Alexander Belopolsky40018472011-02-26 01:02:56 +00002806PyObject *
2807PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002808 const char *encoding,
2809 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002810{
2811 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00002812 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00002813
Guido van Rossumd57fd912000-03-10 22:53:23 +00002814 if (!PyUnicode_Check(unicode)) {
2815 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002816 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002817 }
Fred Drakee4315f52000-05-09 19:53:39 +00002818
Victor Stinner2f283c22011-03-02 01:21:46 +00002819 if (encoding == NULL) {
2820 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002821 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002822 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002823 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner2f283c22011-03-02 01:21:46 +00002824 }
Fred Drakee4315f52000-05-09 19:53:39 +00002825
2826 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002827 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002828 if ((strcmp(lower, "utf-8") == 0) ||
2829 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00002830 {
Victor Stinner2f283c22011-03-02 01:21:46 +00002831 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002832 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002833 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002834 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00002835 }
Victor Stinner37296e82010-06-10 13:36:23 +00002836 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002837 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002838 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002839 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002840#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002841 else if (strcmp(lower, "mbcs") == 0)
2842 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2843 PyUnicode_GET_SIZE(unicode),
2844 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002845#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002846 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002847 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00002848 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002849
2850 /* Encode via the codec registry */
2851 v = PyCodec_Encode(unicode, encoding, errors);
2852 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002853 return NULL;
2854
2855 /* The normal path */
2856 if (PyBytes_Check(v))
2857 return v;
2858
2859 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002860 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002861 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002862 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002863
2864 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
2865 "encoder %s returned bytearray instead of bytes",
2866 encoding);
2867 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002868 Py_DECREF(v);
2869 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002870 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002871
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002872 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
2873 Py_DECREF(v);
2874 return b;
2875 }
2876
2877 PyErr_Format(PyExc_TypeError,
2878 "encoder did not return a bytes object (type=%.400s)",
2879 Py_TYPE(v)->tp_name);
2880 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002881 return NULL;
2882}
2883
Alexander Belopolsky40018472011-02-26 01:02:56 +00002884PyObject *
2885PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002886 const char *encoding,
2887 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002888{
2889 PyObject *v;
2890
2891 if (!PyUnicode_Check(unicode)) {
2892 PyErr_BadArgument();
2893 goto onError;
2894 }
2895
2896 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002897 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002898
2899 /* Encode via the codec registry */
2900 v = PyCodec_Encode(unicode, encoding, errors);
2901 if (v == NULL)
2902 goto onError;
2903 if (!PyUnicode_Check(v)) {
2904 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002905 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002906 Py_TYPE(v)->tp_name);
2907 Py_DECREF(v);
2908 goto onError;
2909 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002910 return v;
Tim Petersced69f82003-09-16 20:30:58 +00002911
Benjamin Peterson29060642009-01-31 22:14:21 +00002912 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002913 return NULL;
2914}
2915
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002916PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00002917PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002918 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00002919 return PyUnicode_DecodeFSDefaultAndSize(s, size);
2920}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002921
Christian Heimes5894ba72007-11-04 11:43:14 +00002922PyObject*
2923PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
2924{
Victor Stinner99b95382011-07-04 14:23:54 +02002925#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002926 return PyUnicode_DecodeMBCS(s, size, NULL);
2927#elif defined(__APPLE__)
2928 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
2929#else
Victor Stinner793b5312011-04-27 00:24:21 +02002930 PyInterpreterState *interp = PyThreadState_GET()->interp;
2931 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2932 cannot use it to encode and decode filenames before it is loaded. Load
2933 the Python codec requires to encode at least its own filename. Use the C
2934 version of the locale codec until the codec registry is initialized and
2935 the Python codec is loaded.
2936
2937 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2938 cannot only rely on it: check also interp->fscodec_initialized for
2939 subinterpreters. */
2940 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002941 return PyUnicode_Decode(s, size,
2942 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00002943 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002944 }
2945 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002946 /* locale encoding with surrogateescape */
2947 wchar_t *wchar;
2948 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00002949 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002950
2951 if (s[size] != '\0' || size != strlen(s)) {
2952 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
2953 return NULL;
2954 }
2955
Victor Stinner168e1172010-10-16 23:16:16 +00002956 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002957 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00002958 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002959
Victor Stinner168e1172010-10-16 23:16:16 +00002960 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002961 PyMem_Free(wchar);
2962 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002963 }
Victor Stinnerad158722010-10-27 00:25:46 +00002964#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002965}
2966
Martin v. Löwis011e8422009-05-05 04:43:17 +00002967
2968int
2969PyUnicode_FSConverter(PyObject* arg, void* addr)
2970{
2971 PyObject *output = NULL;
2972 Py_ssize_t size;
2973 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00002974 if (arg == NULL) {
2975 Py_DECREF(*(PyObject**)addr);
2976 return 1;
2977 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00002978 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00002979 output = arg;
2980 Py_INCREF(output);
2981 }
2982 else {
2983 arg = PyUnicode_FromObject(arg);
2984 if (!arg)
2985 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00002986 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00002987 Py_DECREF(arg);
2988 if (!output)
2989 return 0;
2990 if (!PyBytes_Check(output)) {
2991 Py_DECREF(output);
2992 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
2993 return 0;
2994 }
2995 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00002996 size = PyBytes_GET_SIZE(output);
2997 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00002998 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05002999 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003000 Py_DECREF(output);
3001 return 0;
3002 }
3003 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003004 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003005}
3006
3007
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003008int
3009PyUnicode_FSDecoder(PyObject* arg, void* addr)
3010{
3011 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003012 if (arg == NULL) {
3013 Py_DECREF(*(PyObject**)addr);
3014 return 1;
3015 }
3016 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003017 if (PyUnicode_READY(arg))
3018 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003019 output = arg;
3020 Py_INCREF(output);
3021 }
3022 else {
3023 arg = PyBytes_FromObject(arg);
3024 if (!arg)
3025 return 0;
3026 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3027 PyBytes_GET_SIZE(arg));
3028 Py_DECREF(arg);
3029 if (!output)
3030 return 0;
3031 if (!PyUnicode_Check(output)) {
3032 Py_DECREF(output);
3033 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3034 return 0;
3035 }
3036 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003037 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
3038 PyUnicode_GET_LENGTH(output), 0, 1)) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003039 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3040 Py_DECREF(output);
3041 return 0;
3042 }
3043 *(PyObject**)addr = output;
3044 return Py_CLEANUP_SUPPORTED;
3045}
3046
3047
Martin v. Löwis5b222132007-06-10 09:51:05 +00003048char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003049PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003050{
Christian Heimesf3863112007-11-22 07:46:41 +00003051 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003052 PyUnicodeObject *u = (PyUnicodeObject *)unicode;
3053
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003054 if (!PyUnicode_Check(unicode)) {
3055 PyErr_BadArgument();
3056 return NULL;
3057 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003058 if (PyUnicode_READY(u) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003059 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003060
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003061 if (PyUnicode_UTF8(unicode) == NULL) {
3062 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003063 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3064 if (bytes == NULL)
3065 return NULL;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003066 _PyUnicode_UTF8(u) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3067 if (_PyUnicode_UTF8(u) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003068 Py_DECREF(bytes);
3069 return NULL;
3070 }
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003071 _PyUnicode_UTF8_LENGTH(u) = PyBytes_GET_SIZE(bytes);
3072 Py_MEMCPY(_PyUnicode_UTF8(u), PyBytes_AS_STRING(bytes), _PyUnicode_UTF8_LENGTH(u) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003073 Py_DECREF(bytes);
3074 }
3075
3076 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003077 *psize = PyUnicode_UTF8_LENGTH(unicode);
3078 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003079}
3080
3081char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003082PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003083{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003084 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3085}
3086
3087#ifdef Py_DEBUG
3088int unicode_as_unicode_calls = 0;
3089#endif
3090
3091
3092Py_UNICODE *
3093PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3094{
3095 PyUnicodeObject *u;
3096 const unsigned char *one_byte;
3097#if SIZEOF_WCHAR_T == 4
3098 const Py_UCS2 *two_bytes;
3099#else
3100 const Py_UCS4 *four_bytes;
3101 const Py_UCS4 *ucs4_end;
3102 Py_ssize_t num_surrogates;
3103#endif
3104 wchar_t *w;
3105 wchar_t *wchar_end;
3106
3107 if (!PyUnicode_Check(unicode)) {
3108 PyErr_BadArgument();
3109 return NULL;
3110 }
3111 u = (PyUnicodeObject*)unicode;
3112 if (_PyUnicode_WSTR(u) == NULL) {
3113 /* Non-ASCII compact unicode object */
3114 assert(_PyUnicode_KIND(u) != 0);
3115 assert(PyUnicode_IS_READY(u));
3116
3117#ifdef Py_DEBUG
3118 ++unicode_as_unicode_calls;
3119#endif
3120
3121 if (PyUnicode_KIND(u) == PyUnicode_4BYTE_KIND) {
3122#if SIZEOF_WCHAR_T == 2
3123 four_bytes = PyUnicode_4BYTE_DATA(u);
3124 ucs4_end = four_bytes + _PyUnicode_LENGTH(u);
3125 num_surrogates = 0;
3126
3127 for (; four_bytes < ucs4_end; ++four_bytes) {
3128 if (*four_bytes > 0xFFFF)
3129 ++num_surrogates;
3130 }
3131
3132 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(
3133 sizeof(wchar_t) * (_PyUnicode_LENGTH(u) + 1 + num_surrogates));
3134 if (!_PyUnicode_WSTR(u)) {
3135 PyErr_NoMemory();
3136 return NULL;
3137 }
3138 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u) + num_surrogates;
3139
3140 w = _PyUnicode_WSTR(u);
3141 wchar_end = w + _PyUnicode_WSTR_LENGTH(u);
3142 four_bytes = PyUnicode_4BYTE_DATA(u);
3143 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3144 if (*four_bytes > 0xFFFF) {
3145 /* encode surrogate pair in this case */
3146 *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10);
3147 *w = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF);
3148 }
3149 else
3150 *w = *four_bytes;
3151
3152 if (w > wchar_end) {
3153 assert(0 && "Miscalculated string end");
3154 }
3155 }
3156 *w = 0;
3157#else
3158 /* sizeof(wchar_t) == 4 */
3159 Py_FatalError("Impossible unicode object state, wstr and str "
3160 "should share memory already.");
3161 return NULL;
3162#endif
3163 }
3164 else {
3165 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3166 (_PyUnicode_LENGTH(u) + 1));
3167 if (!_PyUnicode_WSTR(u)) {
3168 PyErr_NoMemory();
3169 return NULL;
3170 }
3171 if (!PyUnicode_IS_COMPACT_ASCII(u))
3172 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u);
3173 w = _PyUnicode_WSTR(u);
3174 wchar_end = w + _PyUnicode_LENGTH(u);
3175
3176 if (PyUnicode_KIND(u) == PyUnicode_1BYTE_KIND) {
3177 one_byte = PyUnicode_1BYTE_DATA(u);
3178 for (; w < wchar_end; ++one_byte, ++w)
3179 *w = *one_byte;
3180 /* null-terminate the wstr */
3181 *w = 0;
3182 }
3183 else if (PyUnicode_KIND(u) == PyUnicode_2BYTE_KIND) {
3184#if SIZEOF_WCHAR_T == 4
3185 two_bytes = PyUnicode_2BYTE_DATA(u);
3186 for (; w < wchar_end; ++two_bytes, ++w)
3187 *w = *two_bytes;
3188 /* null-terminate the wstr */
3189 *w = 0;
3190#else
3191 /* sizeof(wchar_t) == 2 */
3192 PyObject_FREE(_PyUnicode_WSTR(u));
3193 _PyUnicode_WSTR(u) = NULL;
3194 Py_FatalError("Impossible unicode object state, wstr "
3195 "and str should share memory already.");
3196 return NULL;
3197#endif
3198 }
3199 else {
3200 assert(0 && "This should never happen.");
3201 }
3202 }
3203 }
3204 if (size != NULL)
3205 *size = PyUnicode_WSTR_LENGTH(u);
3206 return _PyUnicode_WSTR(u);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003207}
3208
Alexander Belopolsky40018472011-02-26 01:02:56 +00003209Py_UNICODE *
3210PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003211{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003212 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003213}
3214
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003215
Alexander Belopolsky40018472011-02-26 01:02:56 +00003216Py_ssize_t
3217PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003218{
3219 if (!PyUnicode_Check(unicode)) {
3220 PyErr_BadArgument();
3221 goto onError;
3222 }
3223 return PyUnicode_GET_SIZE(unicode);
3224
Benjamin Peterson29060642009-01-31 22:14:21 +00003225 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003226 return -1;
3227}
3228
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003229Py_ssize_t
3230PyUnicode_GetLength(PyObject *unicode)
3231{
Victor Stinner5a706cf2011-10-02 00:36:53 +02003232 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003233 PyErr_BadArgument();
3234 return -1;
3235 }
3236
3237 return PyUnicode_GET_LENGTH(unicode);
3238}
3239
3240Py_UCS4
3241PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3242{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003243 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3244 PyErr_BadArgument();
3245 return (Py_UCS4)-1;
3246 }
3247 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3248 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003249 return (Py_UCS4)-1;
3250 }
3251 return PyUnicode_READ_CHAR(unicode, index);
3252}
3253
3254int
3255PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3256{
3257 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003258 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003259 return -1;
3260 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02003261 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3262 PyErr_SetString(PyExc_IndexError, "string index out of range");
3263 return -1;
3264 }
3265 if (_PyUnicode_Dirty(unicode))
3266 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003267 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3268 index, ch);
3269 return 0;
3270}
3271
/* Return the name of the default encoding; it is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3277
Victor Stinner554f3f02010-06-16 23:33:54 +00003278/* create or adjust a UnicodeDecodeError */
3279static void
3280make_decode_exception(PyObject **exceptionObject,
3281 const char *encoding,
3282 const char *input, Py_ssize_t length,
3283 Py_ssize_t startpos, Py_ssize_t endpos,
3284 const char *reason)
3285{
3286 if (*exceptionObject == NULL) {
3287 *exceptionObject = PyUnicodeDecodeError_Create(
3288 encoding, input, length, startpos, endpos, reason);
3289 }
3290 else {
3291 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3292 goto onError;
3293 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3294 goto onError;
3295 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3296 goto onError;
3297 }
3298 return;
3299
3300onError:
3301 Py_DECREF(*exceptionObject);
3302 *exceptionObject = NULL;
3303}
3304
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003305/* error handling callback helper:
3306 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00003307 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003308 and adjust various state variables.
3309 return 0 on success, -1 on error
3310*/
3311
Alexander Belopolsky40018472011-02-26 01:02:56 +00003312static int
3313unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003314 const char *encoding, const char *reason,
3315 const char **input, const char **inend, Py_ssize_t *startinpos,
3316 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
3317 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003318{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003319 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003320
3321 PyObject *restuple = NULL;
3322 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003323 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003324 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003325 Py_ssize_t requiredsize;
3326 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003327 const Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003328 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003329 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003330 int res = -1;
3331
3332 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003333 *errorHandler = PyCodec_LookupError(errors);
3334 if (*errorHandler == NULL)
3335 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003336 }
3337
Victor Stinner554f3f02010-06-16 23:33:54 +00003338 make_decode_exception(exceptionObject,
3339 encoding,
3340 *input, *inend - *input,
3341 *startinpos, *endinpos,
3342 reason);
3343 if (*exceptionObject == NULL)
3344 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003345
3346 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3347 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003348 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003349 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00003350 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00003351 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003352 }
3353 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003354 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003355
3356 /* Copy back the bytes variables, which might have been modified by the
3357 callback */
3358 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3359 if (!inputobj)
3360 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00003361 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003362 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00003363 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003364 *input = PyBytes_AS_STRING(inputobj);
3365 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003366 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00003367 /* we can DECREF safely, as the exception has another reference,
3368 so the object won't go away. */
3369 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003370
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003371 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003372 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003373 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003374 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3375 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003376 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003377
3378 /* need more space? (at least enough for what we
3379 have+the replacement+the rest of the string (starting
3380 at the new input position), so we won't have to check space
3381 when there are no errors in the rest of the string) */
3382 repptr = PyUnicode_AS_UNICODE(repunicode);
3383 repsize = PyUnicode_GET_SIZE(repunicode);
3384 requiredsize = *outpos + repsize + insize-newpos;
3385 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003386 if (requiredsize<2*outsize)
3387 requiredsize = 2*outsize;
Victor Stinnerfe226c02011-10-03 03:52:20 +02003388 if (PyUnicode_Resize((PyObject**)output, requiredsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003389 goto onError;
3390 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003391 }
3392 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003393 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003394 Py_UNICODE_COPY(*outptr, repptr, repsize);
3395 *outptr += repsize;
3396 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003397
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003398 /* we made it! */
3399 res = 0;
3400
Benjamin Peterson29060642009-01-31 22:14:21 +00003401 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003402 Py_XDECREF(restuple);
3403 return res;
3404}
3405
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three simple macros defining base-64. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')

/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. + \ ~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, hence const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * The range test comes first so that utf7_category is never indexed
 * with a value outside 1..127.  directO and directWS are parenthesized
 * so that arbitrary expressions can be passed safely. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003486
Alexander Belopolsky40018472011-02-26 01:02:56 +00003487PyObject *
3488PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003489 Py_ssize_t size,
3490 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003491{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003492 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3493}
3494
Antoine Pitrou244651a2009-05-04 18:56:13 +00003495/* The decoder. The only state we preserve is our read position,
3496 * i.e. how many characters we have consumed. So if we end in the
3497 * middle of a shift sequence we have to back off the read position
3498 * and the output to the beginning of the sequence, otherwise we lose
3499 * all the shift state (seen bits, number of bits seen, high
3500 * surrogate). */
3501
Alexander Belopolsky40018472011-02-26 01:02:56 +00003502PyObject *
3503PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003504 Py_ssize_t size,
3505 const char *errors,
3506 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003507{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003508 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003509 Py_ssize_t startinpos;
3510 Py_ssize_t endinpos;
3511 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003512 const char *e;
3513 PyUnicodeObject *unicode;
3514 Py_UNICODE *p;
3515 const char *errmsg = "";
3516 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003517 Py_UNICODE *shiftOutStart;
3518 unsigned int base64bits = 0;
3519 unsigned long base64buffer = 0;
3520 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003521 PyObject *errorHandler = NULL;
3522 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003523
3524 unicode = _PyUnicode_New(size);
3525 if (!unicode)
3526 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003527 if (size == 0) {
3528 if (consumed)
3529 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003530 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003531 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003532
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003533 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitrou244651a2009-05-04 18:56:13 +00003534 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003535 e = s + size;
3536
3537 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003538 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00003539 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00003540 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003541
Antoine Pitrou244651a2009-05-04 18:56:13 +00003542 if (inShift) { /* in a base-64 section */
3543 if (IS_BASE64(ch)) { /* consume a base-64 character */
3544 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3545 base64bits += 6;
3546 s++;
3547 if (base64bits >= 16) {
3548 /* we have enough bits for a UTF-16 value */
3549 Py_UNICODE outCh = (Py_UNICODE)
3550 (base64buffer >> (base64bits-16));
3551 base64bits -= 16;
3552 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3553 if (surrogate) {
3554 /* expecting a second surrogate */
3555 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3556#ifdef Py_UNICODE_WIDE
3557 *p++ = (((surrogate & 0x3FF)<<10)
3558 | (outCh & 0x3FF)) + 0x10000;
3559#else
3560 *p++ = surrogate;
3561 *p++ = outCh;
3562#endif
3563 surrogate = 0;
3564 }
3565 else {
3566 surrogate = 0;
3567 errmsg = "second surrogate missing";
3568 goto utf7Error;
3569 }
3570 }
3571 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
3572 /* first surrogate */
3573 surrogate = outCh;
3574 }
3575 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3576 errmsg = "unexpected second surrogate";
3577 goto utf7Error;
3578 }
3579 else {
3580 *p++ = outCh;
3581 }
3582 }
3583 }
3584 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003585 inShift = 0;
3586 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003587 if (surrogate) {
3588 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00003589 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003590 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003591 if (base64bits > 0) { /* left-over bits */
3592 if (base64bits >= 6) {
3593 /* We've seen at least one base-64 character */
3594 errmsg = "partial character in shift sequence";
3595 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003596 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003597 else {
3598 /* Some bits remain; they should be zero */
3599 if (base64buffer != 0) {
3600 errmsg = "non-zero padding bits in shift sequence";
3601 goto utf7Error;
3602 }
3603 }
3604 }
3605 if (ch != '-') {
3606 /* '-' is absorbed; other terminating
3607 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003608 *p++ = ch;
3609 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003610 }
3611 }
3612 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003613 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003614 s++; /* consume '+' */
3615 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003616 s++;
3617 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00003618 }
3619 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003620 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003621 shiftOutStart = p;
3622 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003623 }
3624 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003625 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003626 *p++ = ch;
3627 s++;
3628 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003629 else {
3630 startinpos = s-starts;
3631 s++;
3632 errmsg = "unexpected special character";
3633 goto utf7Error;
3634 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003635 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003636utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003637 outpos = p-PyUnicode_AS_UNICODE(unicode);
3638 endinpos = s-starts;
3639 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003640 errors, &errorHandler,
3641 "utf7", errmsg,
3642 &starts, &e, &startinpos, &endinpos, &exc, &s,
3643 &unicode, &outpos, &p))
3644 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003645 }
3646
Antoine Pitrou244651a2009-05-04 18:56:13 +00003647 /* end of string */
3648
3649 if (inShift && !consumed) { /* in shift sequence, no more to follow */
3650 /* if we're in an inconsistent state, that's an error */
3651 if (surrogate ||
3652 (base64bits >= 6) ||
3653 (base64bits > 0 && base64buffer != 0)) {
3654 outpos = p-PyUnicode_AS_UNICODE(unicode);
3655 endinpos = size;
3656 if (unicode_decode_call_errorhandler(
3657 errors, &errorHandler,
3658 "utf7", "unterminated shift sequence",
3659 &starts, &e, &startinpos, &endinpos, &exc, &s,
3660 &unicode, &outpos, &p))
3661 goto onError;
3662 if (s < e)
3663 goto restart;
3664 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003665 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003666
3667 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003668 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003669 if (inShift) {
3670 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003671 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003672 }
3673 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003674 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003675 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003676 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003677
Victor Stinnerfe226c02011-10-03 03:52:20 +02003678 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003679 goto onError;
3680
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003681 Py_XDECREF(errorHandler);
3682 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02003683#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02003684 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003685 Py_DECREF(unicode);
3686 return NULL;
3687 }
Victor Stinner17efeed2011-10-04 20:05:46 +02003688#endif
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003689 return (PyObject *)unicode;
3690
Benjamin Peterson29060642009-01-31 22:14:21 +00003691 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003692 Py_XDECREF(errorHandler);
3693 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003694 Py_DECREF(unicode);
3695 return NULL;
3696}
3697
3698
Alexander Belopolsky40018472011-02-26 01:02:56 +00003699PyObject *
3700PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003701 Py_ssize_t size,
3702 int base64SetO,
3703 int base64WhiteSpace,
3704 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003705{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003706 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003707 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003708 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003709 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003710 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003711 unsigned int base64bits = 0;
3712 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003713 char * out;
3714 char * start;
3715
3716 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003717 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003718
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003719 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003720 return PyErr_NoMemory();
3721
Antoine Pitrou244651a2009-05-04 18:56:13 +00003722 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003723 if (v == NULL)
3724 return NULL;
3725
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003726 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003727 for (;i < size; ++i) {
3728 Py_UNICODE ch = s[i];
3729
Antoine Pitrou244651a2009-05-04 18:56:13 +00003730 if (inShift) {
3731 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3732 /* shifting out */
3733 if (base64bits) { /* output remaining bits */
3734 *out++ = TO_BASE64(base64buffer << (6-base64bits));
3735 base64buffer = 0;
3736 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003737 }
3738 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003739 /* Characters not in the BASE64 set implicitly unshift the sequence
3740 so no '-' is required, except if the character is itself a '-' */
3741 if (IS_BASE64(ch) || ch == '-') {
3742 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003743 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003744 *out++ = (char) ch;
3745 }
3746 else {
3747 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00003748 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003749 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003750 else { /* not in a shift sequence */
3751 if (ch == '+') {
3752 *out++ = '+';
3753 *out++ = '-';
3754 }
3755 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3756 *out++ = (char) ch;
3757 }
3758 else {
3759 *out++ = '+';
3760 inShift = 1;
3761 goto encode_char;
3762 }
3763 }
3764 continue;
3765encode_char:
3766#ifdef Py_UNICODE_WIDE
3767 if (ch >= 0x10000) {
3768 /* code first surrogate */
3769 base64bits += 16;
3770 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
3771 while (base64bits >= 6) {
3772 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3773 base64bits -= 6;
3774 }
3775 /* prepare second surrogate */
3776 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
3777 }
3778#endif
3779 base64bits += 16;
3780 base64buffer = (base64buffer << 16) | ch;
3781 while (base64bits >= 6) {
3782 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3783 base64bits -= 6;
3784 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00003785 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003786 if (base64bits)
3787 *out++= TO_BASE64(base64buffer << (6-base64bits) );
3788 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003789 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003790 if (_PyBytes_Resize(&v, out - start) < 0)
3791 return NULL;
3792 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003793}
3794
/* Drop the helper macros used by the UTF-7 codec above so that they do not
   remain defined for the rest of the file. */
#undef IS_BASE64
#undef FROM_BASE64
#undef TO_BASE64
#undef DECODE_DIRECT
#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003800
Guido van Rossumd57fd912000-03-10 22:53:23 +00003801/* --- UTF-8 Codec -------------------------------------------------------- */
3802
/* Lookup table indexed by the first byte of a UTF-8 sequence; the entry is
   the total length in bytes of the sequence that byte introduces.
   Entries are 1 for 00-7F, 0 for 80-C1 and F5-FF, 2 for C2-DF, 3 for E0-EF
   and 4 for F0-F4. */
static
char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  Zero means
       illegal prefix.  See RFC 3629 for details */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
3824
Alexander Belopolsky40018472011-02-26 01:02:56 +00003825PyObject *
3826PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003827 Py_ssize_t size,
3828 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003829{
Walter Dörwald69652032004-09-07 20:24:22 +00003830 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3831}
3832
Antoine Pitrouab868312009-01-10 15:40:25 +00003833/* Mask to check or force alignment of a pointer to C 'long' boundaries */
3834#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
3835
3836/* Mask to quickly check whether a C 'long' contains a
3837 non-ASCII, UTF8-encoded char. */
3838#if (SIZEOF_LONG == 8)
3839# define ASCII_CHAR_MASK 0x8080808080808080L
3840#elif (SIZEOF_LONG == 4)
3841# define ASCII_CHAR_MASK 0x80808080L
3842#else
3843# error C 'long' size should be either 4 or 8!
3844#endif
3845
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003846/* Scans a UTF-8 string and returns the maximum character to be expected,
3847 the size of the decoded unicode string and if any major errors were
3848 encountered.
3849
3850 This function does check basic UTF-8 sanity, it does however NOT CHECK
3851 if the string contains surrogates, and if all continuation bytes are
3852 within the correct ranges, these checks are performed in
3853 PyUnicode_DecodeUTF8Stateful.
3854
3855 If it sets has_errors to 1, it means the value of unicode_size and max_char
3856 will be bogus and you should not rely on useful information in them.
3857 */
3858static Py_UCS4
3859utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
3860 Py_ssize_t *unicode_size, Py_ssize_t* consumed,
3861 int *has_errors)
3862{
3863 Py_ssize_t n;
3864 Py_ssize_t char_count = 0;
3865 Py_UCS4 max_char = 127, new_max;
3866 Py_UCS4 upper_bound;
3867 const unsigned char *p = (const unsigned char *)s;
3868 const unsigned char *end = p + string_size;
3869 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
3870 int err = 0;
3871
3872 for (; p < end && !err; ++p, ++char_count) {
3873 /* Only check value if it's not a ASCII char... */
3874 if (*p < 0x80) {
3875 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
3876 an explanation. */
3877 if (!((size_t) p & LONG_PTR_MASK)) {
3878 /* Help register allocation */
3879 register const unsigned char *_p = p;
3880 while (_p < aligned_end) {
3881 unsigned long value = *(unsigned long *) _p;
3882 if (value & ASCII_CHAR_MASK)
3883 break;
3884 _p += SIZEOF_LONG;
3885 char_count += SIZEOF_LONG;
3886 }
3887 p = _p;
3888 if (p == end)
3889 break;
3890 }
3891 }
3892 if (*p >= 0x80) {
3893 n = utf8_code_length[*p];
3894 new_max = max_char;
3895 switch (n) {
3896 /* invalid start byte */
3897 case 0:
3898 err = 1;
3899 break;
3900 case 2:
3901 /* Code points between 0x00FF and 0x07FF inclusive.
3902 Approximate the upper bound of the code point,
3903 if this flips over 255 we can be sure it will be more
3904 than 255 and the string will need 2 bytes per code coint,
3905 if it stays under or equal to 255, we can be sure 1 byte
3906 is enough.
3907 ((*p & 0b00011111) << 6) | 0b00111111 */
3908 upper_bound = ((*p & 0x1F) << 6) | 0x3F;
3909 if (max_char < upper_bound)
3910 new_max = upper_bound;
3911 /* Ensure we track at least that we left ASCII space. */
3912 if (new_max < 128)
3913 new_max = 128;
3914 break;
3915 case 3:
3916 /* Between 0x0FFF and 0xFFFF inclusive, so values are
3917 always > 255 and <= 65535 and will always need 2 bytes. */
3918 if (max_char < 65535)
3919 new_max = 65535;
3920 break;
3921 case 4:
3922 /* Code point will be above 0xFFFF for sure in this case. */
3923 new_max = 65537;
3924 break;
3925 /* Internal error, this should be caught by the first if */
3926 case 1:
3927 default:
3928 assert(0 && "Impossible case in utf8_max_char_and_size");
3929 err = 1;
3930 }
3931 /* Instead of number of overall bytes for this code point,
3932 n containts the number of following bytes: */
3933 --n;
3934 /* Check if the follow up chars are all valid continuation bytes */
3935 if (n >= 1) {
3936 const unsigned char *cont;
3937 if ((p + n) >= end) {
3938 if (consumed == 0)
3939 /* incomplete data, non-incremental decoding */
3940 err = 1;
3941 break;
3942 }
3943 for (cont = p + 1; cont < (p + n); ++cont) {
3944 if ((*cont & 0xc0) != 0x80) {
3945 err = 1;
3946 break;
3947 }
3948 }
3949 p += n;
3950 }
3951 else
3952 err = 1;
3953 max_char = new_max;
3954 }
3955 }
3956
3957 if (unicode_size)
3958 *unicode_size = char_count;
3959 if (has_errors)
3960 *has_errors = err;
3961 return max_char;
3962}
3963
/* Store `value` at position `index` of the character buffer `buf`.

   Works like PyUnicode_WRITE but additionally understands
   PyUnicode_WCHAR_KIND, i.e. it can also write into the wstr field of the
   legacy unicode representation.  `kind` selects the element type used to
   address `buf`; any kind other than WCHAR, 1BYTE or 2BYTE is written as
   Py_UCS4.  `kind` and `index` are each evaluated exactly once. */
#define WRITE_FLEXIBLE_OR_WSTR(kind, buf, index, value)                      \
    do {                                                                     \
        const int kind_code_ = (kind);                                       \
        switch (kind_code_) {                                                \
        case PyUnicode_WCHAR_KIND:                                           \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value);            \
            break;                                                           \
        case PyUnicode_1BYTE_KIND:                                           \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value);      \
            break;                                                           \
        case PyUnicode_2BYTE_KIND:                                           \
            ((Py_UCS2 *)(buf))[(index)] = (Py_UCS2)(value);                  \
            break;                                                           \
        default:                                                             \
            ((Py_UCS4 *)(buf))[(index)] = (Py_UCS4)(value);                  \
            break;                                                           \
        }                                                                    \
    } while (0)
3978
Alexander Belopolsky40018472011-02-26 01:02:56 +00003979PyObject *
3980PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003981 Py_ssize_t size,
3982 const char *errors,
3983 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00003984{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003985 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003986 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00003987 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003988 Py_ssize_t startinpos;
3989 Py_ssize_t endinpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00003990 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003991 PyUnicodeObject *unicode;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003992 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003993 PyObject *errorHandler = NULL;
3994 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003995 Py_UCS4 maxchar = 0;
3996 Py_ssize_t unicode_size;
3997 Py_ssize_t i;
3998 int kind;
3999 void *data;
4000 int has_errors;
4001 Py_UNICODE *error_outptr;
4002#if SIZEOF_WCHAR_T == 2
4003 Py_ssize_t wchar_offset = 0;
4004#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00004005
Walter Dörwald69652032004-09-07 20:24:22 +00004006 if (size == 0) {
4007 if (consumed)
4008 *consumed = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004009 return (PyObject *)PyUnicode_New(0, 0);
Walter Dörwald69652032004-09-07 20:24:22 +00004010 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004011 maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size,
4012 consumed, &has_errors);
4013 if (has_errors) {
4014 unicode = _PyUnicode_New(size);
4015 if (!unicode)
4016 return NULL;
4017 kind = PyUnicode_WCHAR_KIND;
4018 data = PyUnicode_AS_UNICODE(unicode);
4019 assert(data != NULL);
4020 }
4021 else {
4022 unicode = (PyUnicodeObject *)PyUnicode_New(unicode_size, maxchar);
4023 if (!unicode)
4024 return NULL;
4025 /* When the string is ASCII only, just use memcpy and return.
4026 unicode_size may be != size if there is an incomplete UTF-8
4027 sequence at the end of the ASCII block. */
4028 if (maxchar < 128 && size == unicode_size) {
4029 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
4030 return (PyObject *)unicode;
4031 }
4032 kind = PyUnicode_KIND(unicode);
4033 data = PyUnicode_DATA(unicode);
4034 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004035 /* Unpack UTF-8 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004036 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004037 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00004038 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004039
4040 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004041 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004042
4043 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00004044 /* Fast path for runs of ASCII characters. Given that common UTF-8
4045 input will consist of an overwhelming majority of ASCII
4046 characters, we try to optimize for this case by checking
4047 as many characters as a C 'long' can contain.
4048 First, check if we can do an aligned read, as most CPUs have
4049 a penalty for unaligned reads.
4050 */
4051 if (!((size_t) s & LONG_PTR_MASK)) {
4052 /* Help register allocation */
4053 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004054 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004055 while (_s < aligned_end) {
4056 /* Read a whole long at a time (either 4 or 8 bytes),
4057 and do a fast unrolled copy if it only contains ASCII
4058 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004059 unsigned long value = *(unsigned long *) _s;
4060 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00004061 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004062 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+0, _s[0]);
4063 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+1, _s[1]);
4064 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+2, _s[2]);
4065 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004066#if (SIZEOF_LONG == 8)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004067 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+4, _s[4]);
4068 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+5, _s[5]);
4069 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+6, _s[6]);
4070 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004071#endif
4072 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004073 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00004074 }
4075 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004076 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004077 if (s == e)
4078 break;
4079 ch = (unsigned char)*s;
4080 }
4081 }
4082
4083 if (ch < 0x80) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004084 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004085 s++;
4086 continue;
4087 }
4088
4089 n = utf8_code_length[ch];
4090
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004091 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004092 if (consumed)
4093 break;
4094 else {
4095 errmsg = "unexpected end of data";
4096 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004097 endinpos = startinpos+1;
4098 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
4099 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004100 goto utf8Error;
4101 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004102 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004103
4104 switch (n) {
4105
4106 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00004107 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004108 startinpos = s-starts;
4109 endinpos = startinpos+1;
4110 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004111
4112 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004113 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00004114 startinpos = s-starts;
4115 endinpos = startinpos+1;
4116 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004117
4118 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004119 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00004120 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004121 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004122 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00004123 goto utf8Error;
4124 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004125 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004126 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004127 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004128 break;
4129
4130 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00004131 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4132 will result in surrogates in range d800-dfff. Surrogates are
4133 not valid UTF-8 so they are rejected.
4134 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4135 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00004136 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004137 (s[2] & 0xc0) != 0x80 ||
4138 ((unsigned char)s[0] == 0xE0 &&
4139 (unsigned char)s[1] < 0xA0) ||
4140 ((unsigned char)s[0] == 0xED &&
4141 (unsigned char)s[1] > 0x9F)) {
4142 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004143 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004144 endinpos = startinpos + 1;
4145
4146 /* if s[1] first two bits are 1 and 0, then the invalid
4147 continuation byte is s[2], so increment endinpos by 1,
4148 if not, s[1] is invalid and endinpos doesn't need to
4149 be incremented. */
4150 if ((s[1] & 0xC0) == 0x80)
4151 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004152 goto utf8Error;
4153 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004154 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004155 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004156 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004157 break;
4158
4159 case 4:
4160 if ((s[1] & 0xc0) != 0x80 ||
4161 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004162 (s[3] & 0xc0) != 0x80 ||
4163 ((unsigned char)s[0] == 0xF0 &&
4164 (unsigned char)s[1] < 0x90) ||
4165 ((unsigned char)s[0] == 0xF4 &&
4166 (unsigned char)s[1] > 0x8F)) {
4167 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004168 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004169 endinpos = startinpos + 1;
4170 if ((s[1] & 0xC0) == 0x80) {
4171 endinpos++;
4172 if ((s[2] & 0xC0) == 0x80)
4173 endinpos++;
4174 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004175 goto utf8Error;
4176 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004177 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00004178 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4179 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4180
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004181 /* If the string is flexible or we have native UCS-4, write
4182 directly.. */
4183 if (sizeof(Py_UNICODE) > 2 || kind != PyUnicode_WCHAR_KIND)
4184 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Tim Petersced69f82003-09-16 20:30:58 +00004185
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004186 else {
4187 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00004188
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004189 /* translate from 10000..10FFFF to 0..FFFF */
4190 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00004191
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004192 /* high surrogate = top 10 bits added to D800 */
4193 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4194 (Py_UNICODE)(0xD800 + (ch >> 10)));
4195
4196 /* low surrogate = bottom 10 bits added to DC00 */
4197 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4198 (Py_UNICODE)(0xDC00 + (ch & 0x03FF)));
4199 }
4200#if SIZEOF_WCHAR_T == 2
4201 wchar_offset++;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004202#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00004203 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004204 }
4205 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00004206 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004207
Benjamin Peterson29060642009-01-31 22:14:21 +00004208 utf8Error:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004209 /* If this is not yet a resizable string, make it one.. */
4210 if (kind != PyUnicode_WCHAR_KIND) {
4211 const Py_UNICODE *u;
4212 PyUnicodeObject *new_unicode = _PyUnicode_New(size);
4213 if (!new_unicode)
4214 goto onError;
4215 u = PyUnicode_AsUnicode((PyObject *)unicode);
4216 if (!u)
4217 goto onError;
4218#if SIZEOF_WCHAR_T == 2
4219 i += wchar_offset;
4220#endif
4221 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(new_unicode), u, i);
4222 Py_DECREF(unicode);
4223 unicode = new_unicode;
4224 kind = 0;
4225 data = PyUnicode_AS_UNICODE(new_unicode);
4226 assert(data != NULL);
4227 }
4228 error_outptr = PyUnicode_AS_UNICODE(unicode) + i;
Benjamin Peterson29060642009-01-31 22:14:21 +00004229 if (unicode_decode_call_errorhandler(
4230 errors, &errorHandler,
4231 "utf8", errmsg,
4232 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004233 &unicode, &i, &error_outptr))
Benjamin Peterson29060642009-01-31 22:14:21 +00004234 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004235 /* Update data because unicode_decode_call_errorhandler might have
4236 re-created or resized the unicode object. */
4237 data = PyUnicode_AS_UNICODE(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00004238 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004239 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004240 /* Ensure the unicode_size calculation above was correct: */
4241 assert(kind == PyUnicode_WCHAR_KIND || i == unicode_size);
4242
Walter Dörwald69652032004-09-07 20:24:22 +00004243 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004244 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004245
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004246 /* Adjust length and ready string when it contained errors and
4247 is of the old resizable kind. */
4248 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004249 if (PyUnicode_Resize((PyObject**)&unicode, i) < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004250 goto onError;
4251 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004252
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004253 Py_XDECREF(errorHandler);
4254 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02004255#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004256 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004257 Py_DECREF(unicode);
4258 return NULL;
4259 }
Victor Stinner17efeed2011-10-04 20:05:46 +02004260#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00004261 return (PyObject *)unicode;
4262
Benjamin Peterson29060642009-01-31 22:14:21 +00004263 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004264 Py_XDECREF(errorHandler);
4265 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004266 Py_DECREF(unicode);
4267 return NULL;
4268}
4269
/* The write helper is only needed by the UTF-8 decoder above. */
#undef WRITE_FLEXIBLE_OR_WSTR
Antoine Pitrouab868312009-01-10 15:40:25 +00004271
#ifdef __APPLE__

/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   Decodes `size` bytes at `s` into a freshly PyMem_Malloc'ed,
   NUL-terminated wchar_t array.  A byte that does not start a valid
   sequence is emitted as the single code unit 0xDC00 + byte and decoding
   resumes at the next byte.

   Returns NULL if the buffer size would overflow (PyErr_NoMemory() is
   called in that case) or if the allocation fails. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    const char *end;
    wchar_t *buffer, *out;
    int seqlen;

    /* Note: size will always be longer than the resulting Unicode
       character count */
    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
        PyErr_NoMemory();
        return NULL;
    }
    buffer = PyMem_Malloc((size + 1) * sizeof(wchar_t));
    if (buffer == NULL)
        return NULL;

    /* Unpack UTF-8 encoded data */
    out = buffer;
    for (end = s + size; s < end; ) {
        /* `lead` keeps the first byte of the sequence; it is what gets
           escaped when the sequence turns out to be invalid. */
        Py_UCS4 lead = (unsigned char)*s;
        Py_UCS4 cp;

        if (lead < 0x80) {
            *out++ = (wchar_t)lead;
            s++;
            continue;
        }

        seqlen = utf8_code_length[lead];
        if (s + seqlen > end)
            goto escape_byte;

        switch (seqlen) {
        case 0:
        case 1:
            goto escape_byte;

        case 2:
            if ((s[1] & 0xc0) != 0x80)
                goto escape_byte;
            cp = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
            assert ((cp > 0x007F) && (cp <= 0x07FF));
            *out++ = (wchar_t)cp;
            break;

        case 3:
            /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
               will result in surrogates in range d800-dfff. Surrogates are
               not valid UTF-8 so they are rejected.
               See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
            if ((s[1] & 0xc0) != 0x80 || (s[2] & 0xc0) != 0x80)
                goto escape_byte;
            if ((unsigned char)s[0] == 0xE0 && (unsigned char)s[1] < 0xA0)
                goto escape_byte;
            if ((unsigned char)s[0] == 0xED && (unsigned char)s[1] > 0x9F)
                goto escape_byte;
            cp = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
            assert ((cp > 0x07FF) && (cp <= 0xFFFF));
            *out++ = (wchar_t)cp;
            break;

        case 4:
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                (s[3] & 0xc0) != 0x80)
                goto escape_byte;
            if ((unsigned char)s[0] == 0xF0 && (unsigned char)s[1] < 0x90)
                goto escape_byte;
            if ((unsigned char)s[0] == 0xF4 && (unsigned char)s[1] > 0x8F)
                goto escape_byte;
            cp = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
            assert ((cp > 0xFFFF) && (cp <= 0x10ffff));

#if SIZEOF_WCHAR_T == 4
            *out++ = (wchar_t)cp;
#else
            /* wchar_t is too narrow: emit a surrogate pair instead. */

            /* translate from 10000..10FFFF to 0..FFFF */
            cp -= 0x10000;

            /* high surrogate = top 10 bits added to D800 */
            *out++ = (wchar_t)(0xD800 + (cp >> 10));

            /* low surrogate = bottom 10 bits added to DC00 */
            *out++ = (wchar_t)(0xDC00 + (cp & 0x03FF));
#endif
            break;
        }
        s += seqlen;
        continue;

      escape_byte:
        *out++ = 0xDC00 + lead;
        s++;
    }
    *out = L'\0';
    return buffer;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004386
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004387/* Primary internal function which creates utf8 encoded bytes objects.
4388
4389 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004390 and allocate exactly as much space needed at the end. Else allocate the
4391 maximum possible needed (4 result bytes per Unicode character), and return
4392 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004393*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004394PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004395_PyUnicode_AsUTF8String(PyObject *obj, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004396{
Tim Peters602f7402002-04-27 18:03:26 +00004397#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00004398
Guido van Rossum98297ee2007-11-06 21:34:58 +00004399 Py_ssize_t i; /* index into s of next input byte */
4400 PyObject *result; /* result string object */
4401 char *p; /* next free byte in output buffer */
4402 Py_ssize_t nallocated; /* number of result bytes allocated */
4403 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00004404 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004405 PyObject *errorHandler = NULL;
4406 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004407 int kind;
4408 void *data;
4409 Py_ssize_t size;
4410 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
4411#if SIZEOF_WCHAR_T == 2
4412 Py_ssize_t wchar_offset = 0;
4413#endif
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004414
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004415 if (!PyUnicode_Check(unicode)) {
4416 PyErr_BadArgument();
4417 return NULL;
4418 }
4419
4420 if (PyUnicode_READY(unicode) == -1)
4421 return NULL;
4422
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004423 if (PyUnicode_UTF8(unicode))
4424 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4425 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004426
4427 kind = PyUnicode_KIND(unicode);
4428 data = PyUnicode_DATA(unicode);
4429 size = PyUnicode_GET_LENGTH(unicode);
4430
Tim Peters602f7402002-04-27 18:03:26 +00004431 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004432
Tim Peters602f7402002-04-27 18:03:26 +00004433 if (size <= MAX_SHORT_UNICHARS) {
4434 /* Write into the stack buffer; nallocated can't overflow.
4435 * At the end, we'll allocate exactly as much heap space as it
4436 * turns out we need.
4437 */
4438 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004439 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00004440 p = stackbuf;
4441 }
4442 else {
4443 /* Overallocate on the heap, and give the excess back at the end. */
4444 nallocated = size * 4;
4445 if (nallocated / 4 != size) /* overflow! */
4446 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00004447 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004448 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00004449 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00004450 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004451 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004452
Tim Peters602f7402002-04-27 18:03:26 +00004453 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004454 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004455
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004456 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00004457 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004458 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004459
Guido van Rossumd57fd912000-03-10 22:53:23 +00004460 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00004461 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00004462 *p++ = (char)(0xc0 | (ch >> 6));
4463 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00004464 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004465 Py_ssize_t newpos;
4466 PyObject *rep;
4467 Py_ssize_t repsize, k, startpos;
4468 startpos = i-1;
4469#if SIZEOF_WCHAR_T == 2
4470 startpos += wchar_offset;
Victor Stinner445a6232010-04-22 20:01:57 +00004471#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004472 rep = unicode_encode_call_errorhandler(
4473 errors, &errorHandler, "utf-8", "surrogates not allowed",
4474 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
4475 &exc, startpos, startpos+1, &newpos);
4476 if (!rep)
4477 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004478
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004479 if (PyBytes_Check(rep))
4480 repsize = PyBytes_GET_SIZE(rep);
4481 else
4482 repsize = PyUnicode_GET_SIZE(rep);
4483
4484 if (repsize > 4) {
4485 Py_ssize_t offset;
4486
4487 if (result == NULL)
4488 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004489 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004490 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004491
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004492 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4493 /* integer overflow */
4494 PyErr_NoMemory();
4495 goto error;
4496 }
4497 nallocated += repsize - 4;
4498 if (result != NULL) {
4499 if (_PyBytes_Resize(&result, nallocated) < 0)
4500 goto error;
4501 } else {
4502 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004503 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004504 goto error;
4505 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4506 }
4507 p = PyBytes_AS_STRING(result) + offset;
4508 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004509
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004510 if (PyBytes_Check(rep)) {
4511 char *prep = PyBytes_AS_STRING(rep);
4512 for(k = repsize; k > 0; k--)
4513 *p++ = *prep++;
4514 } else /* rep is unicode */ {
4515 const Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
4516 Py_UNICODE c;
4517
4518 for(k=0; k<repsize; k++) {
4519 c = prep[k];
4520 if (0x80 <= c) {
4521 raise_encode_exception(&exc, "utf-8",
4522 PyUnicode_AS_UNICODE(unicode),
4523 size, i-1, i,
4524 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004525 goto error;
4526 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004527 *p++ = (char)prep[k];
Victor Stinner31be90b2010-04-22 19:38:16 +00004528 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004529 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004530 Py_DECREF(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004531 } else if (ch < 0x10000) {
4532 *p++ = (char)(0xe0 | (ch >> 12));
4533 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4534 *p++ = (char)(0x80 | (ch & 0x3f));
4535 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00004536 /* Encode UCS4 Unicode ordinals */
4537 *p++ = (char)(0xf0 | (ch >> 18));
4538 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4539 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4540 *p++ = (char)(0x80 | (ch & 0x3f));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004541#if SIZEOF_WCHAR_T == 2
4542 wchar_offset++;
4543#endif
Tim Peters602f7402002-04-27 18:03:26 +00004544 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004545 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004546
Guido van Rossum98297ee2007-11-06 21:34:58 +00004547 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004548 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004549 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004550 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004551 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004552 }
4553 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004554 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004555 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004556 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004557 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004558 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004559
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004560 Py_XDECREF(errorHandler);
4561 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004562 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004563 error:
4564 Py_XDECREF(errorHandler);
4565 Py_XDECREF(exc);
4566 Py_XDECREF(result);
4567 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004568
Tim Peters602f7402002-04-27 18:03:26 +00004569#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004570}
4571
Alexander Belopolsky40018472011-02-26 01:02:56 +00004572PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004573PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4574 Py_ssize_t size,
4575 const char *errors)
4576{
4577 PyObject *v, *unicode;
4578
4579 unicode = PyUnicode_FromUnicode(s, size);
4580 if (unicode == NULL)
4581 return NULL;
4582 v = _PyUnicode_AsUTF8String(unicode, errors);
4583 Py_DECREF(unicode);
4584 return v;
4585}
4586
4587PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004588PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004589{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004590 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004591}
4592
Walter Dörwald41980ca2007-08-16 21:55:45 +00004593/* --- UTF-32 Codec ------------------------------------------------------- */
4594
4595PyObject *
4596PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004597 Py_ssize_t size,
4598 const char *errors,
4599 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004600{
4601 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4602}
4603
4604PyObject *
4605PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004606 Py_ssize_t size,
4607 const char *errors,
4608 int *byteorder,
4609 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004610{
4611 const char *starts = s;
4612 Py_ssize_t startinpos;
4613 Py_ssize_t endinpos;
4614 Py_ssize_t outpos;
4615 PyUnicodeObject *unicode;
4616 Py_UNICODE *p;
4617#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004618 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004619 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004620#else
4621 const int pairs = 0;
4622#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00004623 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004624 int bo = 0; /* assume native ordering by default */
4625 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004626 /* Offsets from q for retrieving bytes in the right order. */
4627#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4628 int iorder[] = {0, 1, 2, 3};
4629#else
4630 int iorder[] = {3, 2, 1, 0};
4631#endif
4632 PyObject *errorHandler = NULL;
4633 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004634
Walter Dörwald41980ca2007-08-16 21:55:45 +00004635 q = (unsigned char *)s;
4636 e = q + size;
4637
4638 if (byteorder)
4639 bo = *byteorder;
4640
4641 /* Check for BOM marks (U+FEFF) in the input and adjust current
4642 byte order setting accordingly. In native mode, the leading BOM
4643 mark is skipped, in all other modes, it is copied to the output
4644 stream as-is (giving a ZWNBSP character). */
4645 if (bo == 0) {
4646 if (size >= 4) {
4647 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004648 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004649#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004650 if (bom == 0x0000FEFF) {
4651 q += 4;
4652 bo = -1;
4653 }
4654 else if (bom == 0xFFFE0000) {
4655 q += 4;
4656 bo = 1;
4657 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004658#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004659 if (bom == 0x0000FEFF) {
4660 q += 4;
4661 bo = 1;
4662 }
4663 else if (bom == 0xFFFE0000) {
4664 q += 4;
4665 bo = -1;
4666 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004667#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004668 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004669 }
4670
4671 if (bo == -1) {
4672 /* force LE */
4673 iorder[0] = 0;
4674 iorder[1] = 1;
4675 iorder[2] = 2;
4676 iorder[3] = 3;
4677 }
4678 else if (bo == 1) {
4679 /* force BE */
4680 iorder[0] = 3;
4681 iorder[1] = 2;
4682 iorder[2] = 1;
4683 iorder[3] = 0;
4684 }
4685
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004686 /* On narrow builds we split characters outside the BMP into two
4687 codepoints => count how much extra space we need. */
4688#ifndef Py_UNICODE_WIDE
4689 for (qq = q; qq < e; qq += 4)
4690 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
4691 pairs++;
4692#endif
4693
4694 /* This might be one to much, because of a BOM */
4695 unicode = _PyUnicode_New((size+3)/4+pairs);
4696 if (!unicode)
4697 return NULL;
4698 if (size == 0)
4699 return (PyObject *)unicode;
4700
4701 /* Unpack UTF-32 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004702 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004703
Walter Dörwald41980ca2007-08-16 21:55:45 +00004704 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004705 Py_UCS4 ch;
4706 /* remaining bytes at the end? (size should be divisible by 4) */
4707 if (e-q<4) {
4708 if (consumed)
4709 break;
4710 errmsg = "truncated data";
4711 startinpos = ((const char *)q)-starts;
4712 endinpos = ((const char *)e)-starts;
4713 goto utf32Error;
4714 /* The remaining input chars are ignored if the callback
4715 chooses to skip the input */
4716 }
4717 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4718 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004719
Benjamin Peterson29060642009-01-31 22:14:21 +00004720 if (ch >= 0x110000)
4721 {
4722 errmsg = "codepoint not in range(0x110000)";
4723 startinpos = ((const char *)q)-starts;
4724 endinpos = startinpos+4;
4725 goto utf32Error;
4726 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004727#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004728 if (ch >= 0x10000)
4729 {
4730 *p++ = 0xD800 | ((ch-0x10000) >> 10);
4731 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
4732 }
4733 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00004734#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004735 *p++ = ch;
4736 q += 4;
4737 continue;
4738 utf32Error:
4739 outpos = p-PyUnicode_AS_UNICODE(unicode);
4740 if (unicode_decode_call_errorhandler(
4741 errors, &errorHandler,
4742 "utf32", errmsg,
4743 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
4744 &unicode, &outpos, &p))
4745 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004746 }
4747
4748 if (byteorder)
4749 *byteorder = bo;
4750
4751 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004752 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004753
4754 /* Adjust length */
Victor Stinnerfe226c02011-10-03 03:52:20 +02004755 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004756 goto onError;
4757
4758 Py_XDECREF(errorHandler);
4759 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02004760#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004761 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004762 Py_DECREF(unicode);
4763 return NULL;
4764 }
Victor Stinner17efeed2011-10-04 20:05:46 +02004765#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00004766 return (PyObject *)unicode;
4767
Benjamin Peterson29060642009-01-31 22:14:21 +00004768 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00004769 Py_DECREF(unicode);
4770 Py_XDECREF(errorHandler);
4771 Py_XDECREF(exc);
4772 return NULL;
4773}
4774
4775PyObject *
4776PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004777 Py_ssize_t size,
4778 const char *errors,
4779 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004780{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004781 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004782 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004783 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004784#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004785 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004786#else
4787 const int pairs = 0;
4788#endif
4789 /* Offsets from p for storing byte pairs in the right order. */
4790#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4791 int iorder[] = {0, 1, 2, 3};
4792#else
4793 int iorder[] = {3, 2, 1, 0};
4794#endif
4795
Benjamin Peterson29060642009-01-31 22:14:21 +00004796#define STORECHAR(CH) \
4797 do { \
4798 p[iorder[3]] = ((CH) >> 24) & 0xff; \
4799 p[iorder[2]] = ((CH) >> 16) & 0xff; \
4800 p[iorder[1]] = ((CH) >> 8) & 0xff; \
4801 p[iorder[0]] = (CH) & 0xff; \
4802 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00004803 } while(0)
4804
4805 /* In narrow builds we can output surrogate pairs as one codepoint,
4806 so we need less space. */
4807#ifndef Py_UNICODE_WIDE
4808 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00004809 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
4810 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
4811 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004812#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004813 nsize = (size - pairs + (byteorder == 0));
4814 bytesize = nsize * 4;
4815 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004816 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004817 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004818 if (v == NULL)
4819 return NULL;
4820
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004821 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004822 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004823 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004824 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004825 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004826
4827 if (byteorder == -1) {
4828 /* force LE */
4829 iorder[0] = 0;
4830 iorder[1] = 1;
4831 iorder[2] = 2;
4832 iorder[3] = 3;
4833 }
4834 else if (byteorder == 1) {
4835 /* force BE */
4836 iorder[0] = 3;
4837 iorder[1] = 2;
4838 iorder[2] = 1;
4839 iorder[3] = 0;
4840 }
4841
4842 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004843 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004844#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004845 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
4846 Py_UCS4 ch2 = *s;
4847 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
4848 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
4849 s++;
4850 size--;
4851 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004852 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004853#endif
4854 STORECHAR(ch);
4855 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004856
4857 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004858 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004859#undef STORECHAR
4860}
4861
Alexander Belopolsky40018472011-02-26 01:02:56 +00004862PyObject *
4863PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004864{
4865 if (!PyUnicode_Check(unicode)) {
4866 PyErr_BadArgument();
4867 return NULL;
4868 }
4869 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004870 PyUnicode_GET_SIZE(unicode),
4871 NULL,
4872 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004873}
4874
Guido van Rossumd57fd912000-03-10 22:53:23 +00004875/* --- UTF-16 Codec ------------------------------------------------------- */
4876
Tim Peters772747b2001-08-09 22:21:55 +00004877PyObject *
4878PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004879 Py_ssize_t size,
4880 const char *errors,
4881 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004882{
Walter Dörwald69652032004-09-07 20:24:22 +00004883 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
4884}
4885
Antoine Pitrouab868312009-01-10 15:40:25 +00004886/* Two masks for fast checking of whether a C 'long' may contain
4887 UTF16-encoded surrogate characters. This is an efficient heuristic,
4888 assuming that non-surrogate characters with a code point >= 0x8000 are
4889 rare in most input.
4890 FAST_CHAR_MASK is used when the input is in native byte ordering,
4891 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00004892*/
Antoine Pitrouab868312009-01-10 15:40:25 +00004893#if (SIZEOF_LONG == 8)
4894# define FAST_CHAR_MASK 0x8000800080008000L
4895# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
4896#elif (SIZEOF_LONG == 4)
4897# define FAST_CHAR_MASK 0x80008000L
4898# define SWAPPED_FAST_CHAR_MASK 0x00800080L
4899#else
4900# error C 'long' size should be either 4 or 8!
4901#endif
4902
Walter Dörwald69652032004-09-07 20:24:22 +00004903PyObject *
4904PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004905 Py_ssize_t size,
4906 const char *errors,
4907 int *byteorder,
4908 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00004909{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004910 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004911 Py_ssize_t startinpos;
4912 Py_ssize_t endinpos;
4913 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004914 PyUnicodeObject *unicode;
4915 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004916 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00004917 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00004918 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004919 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00004920 /* Offsets from q for retrieving byte pairs in the right order. */
4921#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4922 int ihi = 1, ilo = 0;
4923#else
4924 int ihi = 0, ilo = 1;
4925#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004926 PyObject *errorHandler = NULL;
4927 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004928
4929 /* Note: size will always be longer than the resulting Unicode
4930 character count */
4931 unicode = _PyUnicode_New(size);
4932 if (!unicode)
4933 return NULL;
4934 if (size == 0)
4935 return (PyObject *)unicode;
4936
4937 /* Unpack UTF-16 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004938 p = PyUnicode_AS_UNICODE(unicode);
Tim Peters772747b2001-08-09 22:21:55 +00004939 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00004940 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004941
4942 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00004943 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004944
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004945 /* Check for BOM marks (U+FEFF) in the input and adjust current
4946 byte order setting accordingly. In native mode, the leading BOM
4947 mark is skipped, in all other modes, it is copied to the output
4948 stream as-is (giving a ZWNBSP character). */
4949 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00004950 if (size >= 2) {
4951 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004952#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004953 if (bom == 0xFEFF) {
4954 q += 2;
4955 bo = -1;
4956 }
4957 else if (bom == 0xFFFE) {
4958 q += 2;
4959 bo = 1;
4960 }
Tim Petersced69f82003-09-16 20:30:58 +00004961#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004962 if (bom == 0xFEFF) {
4963 q += 2;
4964 bo = 1;
4965 }
4966 else if (bom == 0xFFFE) {
4967 q += 2;
4968 bo = -1;
4969 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004970#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004971 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004972 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004973
Tim Peters772747b2001-08-09 22:21:55 +00004974 if (bo == -1) {
4975 /* force LE */
4976 ihi = 1;
4977 ilo = 0;
4978 }
4979 else if (bo == 1) {
4980 /* force BE */
4981 ihi = 0;
4982 ilo = 1;
4983 }
Antoine Pitrouab868312009-01-10 15:40:25 +00004984#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4985 native_ordering = ilo < ihi;
4986#else
4987 native_ordering = ilo > ihi;
4988#endif
Tim Peters772747b2001-08-09 22:21:55 +00004989
Antoine Pitrouab868312009-01-10 15:40:25 +00004990 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00004991 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004992 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00004993 /* First check for possible aligned read of a C 'long'. Unaligned
4994 reads are more expensive, better to defer to another iteration. */
4995 if (!((size_t) q & LONG_PTR_MASK)) {
4996 /* Fast path for runs of non-surrogate chars. */
4997 register const unsigned char *_q = q;
4998 Py_UNICODE *_p = p;
4999 if (native_ordering) {
5000 /* Native ordering is simple: as long as the input cannot
5001 possibly contain a surrogate char, do an unrolled copy
5002 of several 16-bit code points to the target object.
5003 The non-surrogate check is done on several input bytes
5004 at a time (as many as a C 'long' can contain). */
5005 while (_q < aligned_end) {
5006 unsigned long data = * (unsigned long *) _q;
5007 if (data & FAST_CHAR_MASK)
5008 break;
5009 _p[0] = ((unsigned short *) _q)[0];
5010 _p[1] = ((unsigned short *) _q)[1];
5011#if (SIZEOF_LONG == 8)
5012 _p[2] = ((unsigned short *) _q)[2];
5013 _p[3] = ((unsigned short *) _q)[3];
5014#endif
5015 _q += SIZEOF_LONG;
5016 _p += SIZEOF_LONG / 2;
5017 }
5018 }
5019 else {
5020 /* Byteswapped ordering is similar, but we must decompose
5021 the copy bytewise, and take care of zero'ing out the
5022 upper bytes if the target object is in 32-bit units
5023 (that is, in UCS-4 builds). */
5024 while (_q < aligned_end) {
5025 unsigned long data = * (unsigned long *) _q;
5026 if (data & SWAPPED_FAST_CHAR_MASK)
5027 break;
5028 /* Zero upper bytes in UCS-4 builds */
5029#if (Py_UNICODE_SIZE > 2)
5030 _p[0] = 0;
5031 _p[1] = 0;
5032#if (SIZEOF_LONG == 8)
5033 _p[2] = 0;
5034 _p[3] = 0;
5035#endif
5036#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00005037 /* Issue #4916; UCS-4 builds on big endian machines must
5038 fill the two last bytes of each 4-byte unit. */
5039#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
5040# define OFF 2
5041#else
5042# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00005043#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00005044 ((unsigned char *) _p)[OFF + 1] = _q[0];
5045 ((unsigned char *) _p)[OFF + 0] = _q[1];
5046 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
5047 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
5048#if (SIZEOF_LONG == 8)
5049 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
5050 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
5051 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
5052 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
5053#endif
5054#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00005055 _q += SIZEOF_LONG;
5056 _p += SIZEOF_LONG / 2;
5057 }
5058 }
5059 p = _p;
5060 q = _q;
5061 if (q >= e)
5062 break;
5063 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005064 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005065
Benjamin Peterson14339b62009-01-31 16:36:08 +00005066 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00005067
5068 if (ch < 0xD800 || ch > 0xDFFF) {
5069 *p++ = ch;
5070 continue;
5071 }
5072
5073 /* UTF-16 code pair: */
5074 if (q > e) {
5075 errmsg = "unexpected end of data";
5076 startinpos = (((const char *)q) - 2) - starts;
5077 endinpos = ((const char *)e) + 1 - starts;
5078 goto utf16Error;
5079 }
5080 if (0xD800 <= ch && ch <= 0xDBFF) {
5081 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
5082 q += 2;
5083 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00005084#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005085 *p++ = ch;
5086 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005087#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005088 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005089#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005090 continue;
5091 }
5092 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005093 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00005094 startinpos = (((const char *)q)-4)-starts;
5095 endinpos = startinpos+2;
5096 goto utf16Error;
5097 }
5098
Benjamin Peterson14339b62009-01-31 16:36:08 +00005099 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005100 errmsg = "illegal encoding";
5101 startinpos = (((const char *)q)-2)-starts;
5102 endinpos = startinpos+2;
5103 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005104
Benjamin Peterson29060642009-01-31 22:14:21 +00005105 utf16Error:
5106 outpos = p - PyUnicode_AS_UNICODE(unicode);
5107 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005108 errors,
5109 &errorHandler,
5110 "utf16", errmsg,
5111 &starts,
5112 (const char **)&e,
5113 &startinpos,
5114 &endinpos,
5115 &exc,
5116 (const char **)&q,
5117 &unicode,
5118 &outpos,
5119 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00005120 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005121 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005122 /* remaining byte at the end? (size should be even) */
5123 if (e == q) {
5124 if (!consumed) {
5125 errmsg = "truncated data";
5126 startinpos = ((const char *)q) - starts;
5127 endinpos = ((const char *)e) + 1 - starts;
5128 outpos = p - PyUnicode_AS_UNICODE(unicode);
5129 if (unicode_decode_call_errorhandler(
5130 errors,
5131 &errorHandler,
5132 "utf16", errmsg,
5133 &starts,
5134 (const char **)&e,
5135 &startinpos,
5136 &endinpos,
5137 &exc,
5138 (const char **)&q,
5139 &unicode,
5140 &outpos,
5141 &p))
5142 goto onError;
5143 /* The remaining input chars are ignored if the callback
5144 chooses to skip the input */
5145 }
5146 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005147
5148 if (byteorder)
5149 *byteorder = bo;
5150
Walter Dörwald69652032004-09-07 20:24:22 +00005151 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005152 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005153
Guido van Rossumd57fd912000-03-10 22:53:23 +00005154 /* Adjust length */
Victor Stinnerfe226c02011-10-03 03:52:20 +02005155 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005156 goto onError;
5157
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005158 Py_XDECREF(errorHandler);
5159 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02005160#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005161 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005162 Py_DECREF(unicode);
5163 return NULL;
5164 }
Victor Stinner17efeed2011-10-04 20:05:46 +02005165#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00005166 return (PyObject *)unicode;
5167
Benjamin Peterson29060642009-01-31 22:14:21 +00005168 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005169 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005170 Py_XDECREF(errorHandler);
5171 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005172 return NULL;
5173}
5174
Antoine Pitrouab868312009-01-10 15:40:25 +00005175#undef FAST_CHAR_MASK
5176#undef SWAPPED_FAST_CHAR_MASK
5177
Tim Peters772747b2001-08-09 22:21:55 +00005178PyObject *
5179PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005180 Py_ssize_t size,
5181 const char *errors,
5182 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005183{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005184 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00005185 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005186 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005187#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005188 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005189#else
5190 const int pairs = 0;
5191#endif
Tim Peters772747b2001-08-09 22:21:55 +00005192 /* Offsets from p for storing byte pairs in the right order. */
5193#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5194 int ihi = 1, ilo = 0;
5195#else
5196 int ihi = 0, ilo = 1;
5197#endif
5198
Benjamin Peterson29060642009-01-31 22:14:21 +00005199#define STORECHAR(CH) \
5200 do { \
5201 p[ihi] = ((CH) >> 8) & 0xff; \
5202 p[ilo] = (CH) & 0xff; \
5203 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00005204 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005205
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005206#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005207 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00005208 if (s[i] >= 0x10000)
5209 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005210#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005211 /* 2 * (size + pairs + (byteorder == 0)) */
5212 if (size > PY_SSIZE_T_MAX ||
5213 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005214 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005215 nsize = size + pairs + (byteorder == 0);
5216 bytesize = nsize * 2;
5217 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005218 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005219 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005220 if (v == NULL)
5221 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005222
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005223 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005224 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005225 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005226 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005227 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005228
5229 if (byteorder == -1) {
5230 /* force LE */
5231 ihi = 1;
5232 ilo = 0;
5233 }
5234 else if (byteorder == 1) {
5235 /* force BE */
5236 ihi = 0;
5237 ilo = 1;
5238 }
5239
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005240 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005241 Py_UNICODE ch = *s++;
5242 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005243#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005244 if (ch >= 0x10000) {
5245 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
5246 ch = 0xD800 | ((ch-0x10000) >> 10);
5247 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005248#endif
Tim Peters772747b2001-08-09 22:21:55 +00005249 STORECHAR(ch);
5250 if (ch2)
5251 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005252 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005253
5254 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005255 return v;
Tim Peters772747b2001-08-09 22:21:55 +00005256#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005257}
5258
Alexander Belopolsky40018472011-02-26 01:02:56 +00005259PyObject *
5260PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005261{
5262 if (!PyUnicode_Check(unicode)) {
5263 PyErr_BadArgument();
5264 return NULL;
5265 }
5266 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005267 PyUnicode_GET_SIZE(unicode),
5268 NULL,
5269 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005270}
5271
5272/* --- Unicode Escape Codec ----------------------------------------------- */
5273
/* Helper function for PyUnicode_DecodeUnicodeEscape: determines whether
   every escape in the string keeps the decoded result a pure ASCII string
   of predictable length.

   s    -- the escaped input bytes
   size -- number of bytes to examine at s

   Returns the number of characters the decoded string will contain, or -1
   when that cannot be predicted here: size is negative, a non-ASCII byte
   occurs, a backslash is the last byte, or an octal, \x, \u, \U or \N
   escape is present (those may produce non-ASCII characters).
 */
Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *p;
    const unsigned char *end;
    Py_ssize_t length = 0;

    /* Validate size before using it in pointer arithmetic: forming
       p + size with a negative size is undefined behavior. */
    if (size < 0)
        return -1;

    p = (const unsigned char *)s;
    end = p + size;

    for (; p < end; ++p) {
        if (*p > 127) {
            /* Non-ASCII */
            return -1;
        }
        else if (*p != '\\') {
            /* Normal character */
            ++length;
        }
        else {
            /* Backslash-escape, check next char */
            ++p;
            /* Escape sequence reaches till end of string or
               non-ASCII follow-up. */
            if (p >= end || *p > 127)
                return -1;
            switch (*p) {
            case '\n':
                /* backslash + \n result in zero characters */
                break;
            case '\\': case '\'': case '\"':
            case 'b': case 'f': case 't':
            case 'n': case 'r': case 'v': case 'a':
                /* single-character escapes yield exactly one character */
                ++length;
                break;
            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7':
            case 'x': case 'u': case 'U': case 'N':
                /* these do not guarantee ASCII characters */
                return -1;
            default:
                /* unknown escape: count the backslash + the other
                   character */
                length += 2;
                break;
            }
        }
    }
    return length;
}
5328
5329/* Similar to PyUnicode_WRITE but either write into wstr field
5330 or treat string as ASCII. */
5331#define WRITE_ASCII_OR_WSTR(kind, buf, index, value) \
5332 do { \
5333 if ((kind) != PyUnicode_WCHAR_KIND) \
5334 ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
5335 else \
5336 ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \
5337 } while (0)
5338
5339#define WRITE_WSTR(buf, index, value) \
5340 assert(kind == PyUnicode_WCHAR_KIND), \
5341 ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value)
5342
5343
Fredrik Lundh06d12682001-01-24 07:59:11 +00005344static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005345
Alexander Belopolsky40018472011-02-26 01:02:56 +00005346PyObject *
5347PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005348 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005349 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005350{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005351 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005352 Py_ssize_t startinpos;
5353 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005354 int j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005355 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005356 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005357 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005358 char* message;
5359 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005360 PyObject *errorHandler = NULL;
5361 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005362 Py_ssize_t ascii_length;
5363 Py_ssize_t i;
5364 int kind;
5365 void *data;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005366
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005367 ascii_length = length_of_escaped_ascii_string(s, size);
5368
5369 /* After length_of_escaped_ascii_string() there are two alternatives,
5370 either the string is pure ASCII with named escapes like \n, etc.
5371 and we determined it's exact size (common case)
5372 or it contains \x, \u, ... escape sequences. then we create a
5373 legacy wchar string and resize it at the end of this function. */
5374 if (ascii_length >= 0) {
5375 v = (PyUnicodeObject *)PyUnicode_New(ascii_length, 127);
5376 if (!v)
5377 goto onError;
5378 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
5379 kind = PyUnicode_1BYTE_KIND;
5380 data = PyUnicode_DATA(v);
5381 }
5382 else {
5383 /* Escaped strings will always be longer than the resulting
5384 Unicode string, so we start with size here and then reduce the
5385 length after conversion to the true value.
5386 (but if the error callback returns a long replacement string
5387 we'll have to allocate more space) */
5388 v = _PyUnicode_New(size);
5389 if (!v)
5390 goto onError;
5391 kind = PyUnicode_WCHAR_KIND;
5392 data = PyUnicode_AS_UNICODE(v);
5393 }
5394
Guido van Rossumd57fd912000-03-10 22:53:23 +00005395 if (size == 0)
5396 return (PyObject *)v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005397 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005398 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005399
Guido van Rossumd57fd912000-03-10 22:53:23 +00005400 while (s < end) {
5401 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005402 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005403 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005404
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005405 if (kind == PyUnicode_WCHAR_KIND) {
5406 assert(i < _PyUnicode_WSTR_LENGTH(v));
5407 }
5408 else {
5409 /* The only case in which i == ascii_length is a backslash
5410 followed by a newline. */
5411 assert(i <= ascii_length);
5412 }
5413
Guido van Rossumd57fd912000-03-10 22:53:23 +00005414 /* Non-escape characters are interpreted as Unicode ordinals */
5415 if (*s != '\\') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005416 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char) *s++);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005417 continue;
5418 }
5419
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005420 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005421 /* \ - Escapes */
5422 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005423 c = *s++;
5424 if (s > end)
5425 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005426
5427 if (kind == PyUnicode_WCHAR_KIND) {
5428 assert(i < _PyUnicode_WSTR_LENGTH(v));
5429 }
5430 else {
5431 /* The only case in which i == ascii_length is a backslash
5432 followed by a newline. */
5433 assert(i < ascii_length || (i == ascii_length && c == '\n'));
5434 }
5435
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005436 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005437
Benjamin Peterson29060642009-01-31 22:14:21 +00005438 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005439 case '\n': break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005440 case '\\': WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); break;
5441 case '\'': WRITE_ASCII_OR_WSTR(kind, data, i++, '\''); break;
5442 case '\"': WRITE_ASCII_OR_WSTR(kind, data, i++, '\"'); break;
5443 case 'b': WRITE_ASCII_OR_WSTR(kind, data, i++, '\b'); break;
5444 /* FF */
5445 case 'f': WRITE_ASCII_OR_WSTR(kind, data, i++, '\014'); break;
5446 case 't': WRITE_ASCII_OR_WSTR(kind, data, i++, '\t'); break;
5447 case 'n': WRITE_ASCII_OR_WSTR(kind, data, i++, '\n'); break;
5448 case 'r': WRITE_ASCII_OR_WSTR(kind, data, i++, '\r'); break;
5449 /* VT */
5450 case 'v': WRITE_ASCII_OR_WSTR(kind, data, i++, '\013'); break;
5451 /* BEL, not classic C */
5452 case 'a': WRITE_ASCII_OR_WSTR(kind, data, i++, '\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005453
Benjamin Peterson29060642009-01-31 22:14:21 +00005454 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005455 case '0': case '1': case '2': case '3':
5456 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005457 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005458 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005459 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005460 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005461 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005462 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005463 WRITE_WSTR(data, i++, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005464 break;
5465
Benjamin Peterson29060642009-01-31 22:14:21 +00005466 /* hex escapes */
5467 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005468 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005469 digits = 2;
5470 message = "truncated \\xXX escape";
5471 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005472
Benjamin Peterson29060642009-01-31 22:14:21 +00005473 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005474 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005475 digits = 4;
5476 message = "truncated \\uXXXX escape";
5477 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005478
Benjamin Peterson29060642009-01-31 22:14:21 +00005479 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005480 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005481 digits = 8;
5482 message = "truncated \\UXXXXXXXX escape";
5483 hexescape:
5484 chr = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005485 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005486 if (s+digits>end) {
5487 endinpos = size;
5488 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005489 errors, &errorHandler,
5490 "unicodeescape", "end of string in escape sequence",
5491 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005492 &v, &i, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005493 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005494 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005495 goto nextByte;
5496 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005497 for (j = 0; j < digits; ++j) {
5498 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005499 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005500 endinpos = (s+j+1)-starts;
5501 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005502 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005503 errors, &errorHandler,
5504 "unicodeescape", message,
5505 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005506 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005507 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005508 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005509 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005510 }
5511 chr = (chr<<4) & ~0xF;
5512 if (c >= '0' && c <= '9')
5513 chr += c - '0';
5514 else if (c >= 'a' && c <= 'f')
5515 chr += 10 + c - 'a';
5516 else
5517 chr += 10 + c - 'A';
5518 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005519 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005520 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005521 /* _decoding_error will have already written into the
5522 target buffer. */
5523 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005524 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005525 /* when we get here, chr is a 32-bit unicode character */
5526 if (chr <= 0xffff)
5527 /* UCS-2 character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005528 WRITE_WSTR(data, i++, chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005529 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005530 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00005531 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00005532#ifdef Py_UNICODE_WIDE
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005533 WRITE_WSTR(data, i++, chr);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005534#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00005535 chr -= 0x10000L;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005536 WRITE_WSTR(data, i++, 0xD800 + (Py_UNICODE) (chr >> 10));
5537 WRITE_WSTR(data, i++, 0xDC00 + (Py_UNICODE) (chr & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005538#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00005539 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005540 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005541 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005542 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005543 errors, &errorHandler,
5544 "unicodeescape", "illegal Unicode character",
5545 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005546 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005547 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005548 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005549 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005550 break;
5551
Benjamin Peterson29060642009-01-31 22:14:21 +00005552 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005553 case 'N':
5554 message = "malformed \\N character escape";
5555 if (ucnhash_CAPI == NULL) {
5556 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005557 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5558 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005559 if (ucnhash_CAPI == NULL)
5560 goto ucnhashError;
5561 }
5562 if (*s == '{') {
5563 const char *start = s+1;
5564 /* look for the closing brace */
5565 while (*s != '}' && s < end)
5566 s++;
5567 if (s > start && s < end && *s == '}') {
5568 /* found a name. look it up in the unicode database */
5569 message = "unknown Unicode character name";
5570 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005571 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
5572 &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005573 goto store;
5574 }
5575 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005576 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005577 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005578 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005579 errors, &errorHandler,
5580 "unicodeescape", message,
5581 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005582 &v, &i, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005583 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005584 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005585 break;
5586
5587 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005588 if (s > end) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005589 assert(kind == PyUnicode_WCHAR_KIND);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005590 message = "\\ at end of string";
5591 s--;
5592 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005593 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005594 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005595 errors, &errorHandler,
5596 "unicodeescape", message,
5597 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005598 &v, &i, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00005599 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005600 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005601 }
5602 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005603 WRITE_ASCII_OR_WSTR(kind, data, i++, '\\');
5604 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005605 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005606 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005607 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005608 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005609 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005610 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005611 /* Ensure the length prediction worked in case of ASCII strings */
5612 assert(kind == PyUnicode_WCHAR_KIND || i == ascii_length);
5613
Victor Stinnerfe226c02011-10-03 03:52:20 +02005614 if (kind == PyUnicode_WCHAR_KIND)
5615 {
5616 if (PyUnicode_Resize((PyObject**)&v, i) < 0)
5617 goto onError;
Victor Stinnerfe226c02011-10-03 03:52:20 +02005618 }
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005619 Py_XDECREF(errorHandler);
5620 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02005621#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005622 if (_PyUnicode_READY_REPLACE(&v)) {
5623 Py_DECREF(v);
5624 return NULL;
5625 }
Victor Stinner17efeed2011-10-04 20:05:46 +02005626#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00005627 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00005628
Benjamin Peterson29060642009-01-31 22:14:21 +00005629 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005630 PyErr_SetString(
5631 PyExc_UnicodeError,
5632 "\\N escapes not supported (can't load unicodedata module)"
5633 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005634 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005635 Py_XDECREF(errorHandler);
5636 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005637 return NULL;
5638
Benjamin Peterson29060642009-01-31 22:14:21 +00005639 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005640 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005641 Py_XDECREF(errorHandler);
5642 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005643 return NULL;
5644}
5645
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005646#undef WRITE_ASCII_OR_WSTR
5647#undef WRITE_WSTR
5648
Guido van Rossumd57fd912000-03-10 22:53:23 +00005649/* Return a Unicode-Escape string version of the Unicode object.
5650
5651 If quotes is true, the string is enclosed in u"" or u'' quotes as
5652 appropriate.
5653
5654*/
5655
Walter Dörwald79e913e2007-05-12 11:08:06 +00005656static const char *hexdigits = "0123456789abcdef";
5657
Alexander Belopolsky40018472011-02-26 01:02:56 +00005658PyObject *
5659PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005660 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005661{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005662 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005663 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005664
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005665#ifdef Py_UNICODE_WIDE
5666 const Py_ssize_t expandsize = 10;
5667#else
5668 const Py_ssize_t expandsize = 6;
5669#endif
5670
Thomas Wouters89f507f2006-12-13 04:49:30 +00005671 /* XXX(nnorwitz): rather than over-allocating, it would be
5672 better to choose a different scheme. Perhaps scan the
5673 first N-chars of the string and allocate based on that size.
5674 */
5675 /* Initial allocation is based on the longest-possible unichr
5676 escape.
5677
5678 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5679 unichr, so in this case it's the longest unichr escape. In
5680 narrow (UTF-16) builds this is five chars per source unichr
5681 since there are two unichrs in the surrogate pair, so in narrow
5682 (UTF-16) builds it's not the longest unichr escape.
5683
5684 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5685 so in the narrow (UTF-16) build case it's the longest unichr
5686 escape.
5687 */
5688
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005689 if (size == 0)
5690 return PyBytes_FromStringAndSize(NULL, 0);
5691
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005692 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005693 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005694
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005695 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005696 2
5697 + expandsize*size
5698 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005699 if (repr == NULL)
5700 return NULL;
5701
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005702 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005703
Guido van Rossumd57fd912000-03-10 22:53:23 +00005704 while (size-- > 0) {
5705 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005706
Walter Dörwald79e913e2007-05-12 11:08:06 +00005707 /* Escape backslashes */
5708 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005709 *p++ = '\\';
5710 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005711 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005712 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005713
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00005714#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005715 /* Map 21-bit characters to '\U00xxxxxx' */
5716 else if (ch >= 0x10000) {
5717 *p++ = '\\';
5718 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005719 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
5720 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
5721 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
5722 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
5723 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
5724 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
5725 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
5726 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005727 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005728 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005729#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005730 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5731 else if (ch >= 0xD800 && ch < 0xDC00) {
5732 Py_UNICODE ch2;
5733 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00005734
Benjamin Peterson29060642009-01-31 22:14:21 +00005735 ch2 = *s++;
5736 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005737 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005738 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5739 *p++ = '\\';
5740 *p++ = 'U';
5741 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
5742 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
5743 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
5744 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
5745 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
5746 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
5747 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
5748 *p++ = hexdigits[ucs & 0x0000000F];
5749 continue;
5750 }
5751 /* Fall through: isolated surrogates are copied as-is */
5752 s--;
5753 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005754 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005755#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005756
Guido van Rossumd57fd912000-03-10 22:53:23 +00005757 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005758 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005759 *p++ = '\\';
5760 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005761 *p++ = hexdigits[(ch >> 12) & 0x000F];
5762 *p++ = hexdigits[(ch >> 8) & 0x000F];
5763 *p++ = hexdigits[(ch >> 4) & 0x000F];
5764 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005765 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005766
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005767 /* Map special whitespace to '\t', \n', '\r' */
5768 else if (ch == '\t') {
5769 *p++ = '\\';
5770 *p++ = 't';
5771 }
5772 else if (ch == '\n') {
5773 *p++ = '\\';
5774 *p++ = 'n';
5775 }
5776 else if (ch == '\r') {
5777 *p++ = '\\';
5778 *p++ = 'r';
5779 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005780
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005781 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005782 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005783 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005784 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005785 *p++ = hexdigits[(ch >> 4) & 0x000F];
5786 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005787 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005788
Guido van Rossumd57fd912000-03-10 22:53:23 +00005789 /* Copy everything else as-is */
5790 else
5791 *p++ = (char) ch;
5792 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005793
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005794 assert(p - PyBytes_AS_STRING(repr) > 0);
5795 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5796 return NULL;
5797 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005798}
5799
Alexander Belopolsky40018472011-02-26 01:02:56 +00005800PyObject *
5801PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005802{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005803 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005804 if (!PyUnicode_Check(unicode)) {
5805 PyErr_BadArgument();
5806 return NULL;
5807 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00005808 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5809 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005810 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005811}
5812
5813/* --- Raw Unicode Escape Codec ------------------------------------------- */
5814
Alexander Belopolsky40018472011-02-26 01:02:56 +00005815PyObject *
5816PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005817 Py_ssize_t size,
5818 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005819{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005820 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005821 Py_ssize_t startinpos;
5822 Py_ssize_t endinpos;
5823 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005824 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005825 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005826 const char *end;
5827 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005828 PyObject *errorHandler = NULL;
5829 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005830
Guido van Rossumd57fd912000-03-10 22:53:23 +00005831 /* Escaped strings will always be longer than the resulting
5832 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005833 length after conversion to the true value. (But decoding error
5834 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005835 v = _PyUnicode_New(size);
5836 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005837 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005838 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005839 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005840 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005841 end = s + size;
5842 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005843 unsigned char c;
5844 Py_UCS4 x;
5845 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005846 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005847
Benjamin Peterson29060642009-01-31 22:14:21 +00005848 /* Non-escape characters are interpreted as Unicode ordinals */
5849 if (*s != '\\') {
5850 *p++ = (unsigned char)*s++;
5851 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005852 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005853 startinpos = s-starts;
5854
5855 /* \u-escapes are only interpreted iff the number of leading
5856 backslashes if odd */
5857 bs = s;
5858 for (;s < end;) {
5859 if (*s != '\\')
5860 break;
5861 *p++ = (unsigned char)*s++;
5862 }
5863 if (((s - bs) & 1) == 0 ||
5864 s >= end ||
5865 (*s != 'u' && *s != 'U')) {
5866 continue;
5867 }
5868 p--;
5869 count = *s=='u' ? 4 : 8;
5870 s++;
5871
5872 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
5873 outpos = p-PyUnicode_AS_UNICODE(v);
5874 for (x = 0, i = 0; i < count; ++i, ++s) {
5875 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00005876 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005877 endinpos = s-starts;
5878 if (unicode_decode_call_errorhandler(
5879 errors, &errorHandler,
5880 "rawunicodeescape", "truncated \\uXXXX",
5881 &starts, &end, &startinpos, &endinpos, &exc, &s,
5882 &v, &outpos, &p))
5883 goto onError;
5884 goto nextByte;
5885 }
5886 x = (x<<4) & ~0xF;
5887 if (c >= '0' && c <= '9')
5888 x += c - '0';
5889 else if (c >= 'a' && c <= 'f')
5890 x += 10 + c - 'a';
5891 else
5892 x += 10 + c - 'A';
5893 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00005894 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00005895 /* UCS-2 character */
5896 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005897 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005898 /* UCS-4 character. Either store directly, or as
5899 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00005900#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005901 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005902#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005903 x -= 0x10000L;
5904 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
5905 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00005906#endif
5907 } else {
5908 endinpos = s-starts;
5909 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005910 if (unicode_decode_call_errorhandler(
5911 errors, &errorHandler,
5912 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00005913 &starts, &end, &startinpos, &endinpos, &exc, &s,
5914 &v, &outpos, &p))
5915 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005916 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005917 nextByte:
5918 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005919 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02005920 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005921 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005922 Py_XDECREF(errorHandler);
5923 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02005924#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005925 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005926 Py_DECREF(v);
5927 return NULL;
5928 }
Victor Stinner17efeed2011-10-04 20:05:46 +02005929#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00005930 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00005931
Benjamin Peterson29060642009-01-31 22:14:21 +00005932 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005933 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005934 Py_XDECREF(errorHandler);
5935 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005936 return NULL;
5937}
5938
Alexander Belopolsky40018472011-02-26 01:02:56 +00005939PyObject *
5940PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005941 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005942{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005943 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005944 char *p;
5945 char *q;
5946
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005947#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005948 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005949#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005950 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005951#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00005952
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005953 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005954 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00005955
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005956 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005957 if (repr == NULL)
5958 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005959 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005960 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005961
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005962 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005963 while (size-- > 0) {
5964 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005965#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005966 /* Map 32-bit characters to '\Uxxxxxxxx' */
5967 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005968 *p++ = '\\';
5969 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00005970 *p++ = hexdigits[(ch >> 28) & 0xf];
5971 *p++ = hexdigits[(ch >> 24) & 0xf];
5972 *p++ = hexdigits[(ch >> 20) & 0xf];
5973 *p++ = hexdigits[(ch >> 16) & 0xf];
5974 *p++ = hexdigits[(ch >> 12) & 0xf];
5975 *p++ = hexdigits[(ch >> 8) & 0xf];
5976 *p++ = hexdigits[(ch >> 4) & 0xf];
5977 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00005978 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005979 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00005980#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005981 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5982 if (ch >= 0xD800 && ch < 0xDC00) {
5983 Py_UNICODE ch2;
5984 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005985
Benjamin Peterson29060642009-01-31 22:14:21 +00005986 ch2 = *s++;
5987 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005988 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005989 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5990 *p++ = '\\';
5991 *p++ = 'U';
5992 *p++ = hexdigits[(ucs >> 28) & 0xf];
5993 *p++ = hexdigits[(ucs >> 24) & 0xf];
5994 *p++ = hexdigits[(ucs >> 20) & 0xf];
5995 *p++ = hexdigits[(ucs >> 16) & 0xf];
5996 *p++ = hexdigits[(ucs >> 12) & 0xf];
5997 *p++ = hexdigits[(ucs >> 8) & 0xf];
5998 *p++ = hexdigits[(ucs >> 4) & 0xf];
5999 *p++ = hexdigits[ucs & 0xf];
6000 continue;
6001 }
6002 /* Fall through: isolated surrogates are copied as-is */
6003 s--;
6004 size++;
6005 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006006#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00006007 /* Map 16-bit characters to '\uxxxx' */
6008 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006009 *p++ = '\\';
6010 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00006011 *p++ = hexdigits[(ch >> 12) & 0xf];
6012 *p++ = hexdigits[(ch >> 8) & 0xf];
6013 *p++ = hexdigits[(ch >> 4) & 0xf];
6014 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006015 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006016 /* Copy everything else as-is */
6017 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006018 *p++ = (char) ch;
6019 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006020 size = p - q;
6021
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006022 assert(size > 0);
6023 if (_PyBytes_Resize(&repr, size) < 0)
6024 return NULL;
6025 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006026}
6027
Alexander Belopolsky40018472011-02-26 01:02:56 +00006028PyObject *
6029PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006030{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006031 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006032 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00006033 PyErr_BadArgument();
6034 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006035 }
Walter Dörwald711005d2007-05-12 12:03:26 +00006036 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
6037 PyUnicode_GET_SIZE(unicode));
6038
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006039 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006040}
6041
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006042/* --- Unicode Internal Codec ------------------------------------------- */
6043
Alexander Belopolsky40018472011-02-26 01:02:56 +00006044PyObject *
6045_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006046 Py_ssize_t size,
6047 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006048{
6049 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006050 Py_ssize_t startinpos;
6051 Py_ssize_t endinpos;
6052 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006053 PyUnicodeObject *v;
6054 Py_UNICODE *p;
6055 const char *end;
6056 const char *reason;
6057 PyObject *errorHandler = NULL;
6058 PyObject *exc = NULL;
6059
Neal Norwitzd43069c2006-01-08 01:12:10 +00006060#ifdef Py_UNICODE_WIDE
6061 Py_UNICODE unimax = PyUnicode_GetMax();
6062#endif
6063
Thomas Wouters89f507f2006-12-13 04:49:30 +00006064 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006065 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
6066 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006067 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006068 /* Intentionally PyUnicode_GET_SIZE instead of PyUnicode_GET_LENGTH
6069 as string was created with the old API. */
6070 if (PyUnicode_GET_SIZE(v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006071 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006072 p = PyUnicode_AS_UNICODE(v);
6073 end = s + size;
6074
6075 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006076 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006077 /* We have to sanity check the raw data, otherwise doom looms for
6078 some malformed UCS-4 data. */
6079 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006080#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006081 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006082#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006083 end-s < Py_UNICODE_SIZE
6084 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006085 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006086 startinpos = s - starts;
6087 if (end-s < Py_UNICODE_SIZE) {
6088 endinpos = end-starts;
6089 reason = "truncated input";
6090 }
6091 else {
6092 endinpos = s - starts + Py_UNICODE_SIZE;
6093 reason = "illegal code point (> 0x10FFFF)";
6094 }
6095 outpos = p - PyUnicode_AS_UNICODE(v);
6096 if (unicode_decode_call_errorhandler(
6097 errors, &errorHandler,
6098 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006099 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00006100 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006101 goto onError;
6102 }
6103 }
6104 else {
6105 p++;
6106 s += Py_UNICODE_SIZE;
6107 }
6108 }
6109
Victor Stinnerfe226c02011-10-03 03:52:20 +02006110 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006111 goto onError;
6112 Py_XDECREF(errorHandler);
6113 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02006114#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006115 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006116 Py_DECREF(v);
6117 return NULL;
6118 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006119#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006120 return (PyObject *)v;
6121
Benjamin Peterson29060642009-01-31 22:14:21 +00006122 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006123 Py_XDECREF(v);
6124 Py_XDECREF(errorHandler);
6125 Py_XDECREF(exc);
6126 return NULL;
6127}
6128
Guido van Rossumd57fd912000-03-10 22:53:23 +00006129/* --- Latin-1 Codec ------------------------------------------------------ */
6130
Alexander Belopolsky40018472011-02-26 01:02:56 +00006131PyObject *
6132PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006133 Py_ssize_t size,
6134 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006135{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006136 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006137 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006138}
6139
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006140/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006141static void
6142make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006143 const char *encoding,
6144 const Py_UNICODE *unicode, Py_ssize_t size,
6145 Py_ssize_t startpos, Py_ssize_t endpos,
6146 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006147{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006148 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006149 *exceptionObject = PyUnicodeEncodeError_Create(
6150 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006151 }
6152 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006153 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6154 goto onError;
6155 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6156 goto onError;
6157 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6158 goto onError;
6159 return;
6160 onError:
6161 Py_DECREF(*exceptionObject);
6162 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006163 }
6164}
6165
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006166/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006167static void
6168raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006169 const char *encoding,
6170 const Py_UNICODE *unicode, Py_ssize_t size,
6171 Py_ssize_t startpos, Py_ssize_t endpos,
6172 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006173{
6174 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006175 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006176 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006177 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006178}
6179
6180/* error handling callback helper:
6181 build arguments, call the callback and check the arguments,
6182 put the result into newpos and return the replacement string, which
6183 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006184static PyObject *
6185unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006186 PyObject **errorHandler,
6187 const char *encoding, const char *reason,
6188 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
6189 Py_ssize_t startpos, Py_ssize_t endpos,
6190 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006191{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006192 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006193
6194 PyObject *restuple;
6195 PyObject *resunicode;
6196
6197 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006198 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006199 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006200 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006201 }
6202
6203 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006204 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006205 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006206 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006207
6208 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006209 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006210 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006211 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006212 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006213 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006214 Py_DECREF(restuple);
6215 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006216 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006217 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006218 &resunicode, newpos)) {
6219 Py_DECREF(restuple);
6220 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006221 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006222 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6223 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6224 Py_DECREF(restuple);
6225 return NULL;
6226 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006227 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006228 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006229 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006230 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6231 Py_DECREF(restuple);
6232 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006233 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006234 Py_INCREF(resunicode);
6235 Py_DECREF(restuple);
6236 return resunicode;
6237}
6238
/* Shared implementation of the Latin-1 and ASCII encoders.

   Encodes the `size` code units starting at `p` into a new bytes object,
   one byte per code unit.  A code unit is encodable when it is below
   `limit`; a limit of 256 selects the "latin-1" encoding name and error
   text, any other value selects the "ascii" ones.

   Runs of unencodable code units are handled according to `errors`:
   NULL/"strict", "replace", "ignore" and "xmlcharrefreplace" are handled
   inline, every other name goes through
   unicode_encode_call_errorhandler().  A handler may return bytes, which
   are copied to the output unchanged, or a unicode string, whose code
   units must themselves all be below `limit`.

   Returns the new bytes object, or NULL on failure. */
static PyObject *
unicode_encode_ucs1(const Py_UNICODE *p,
                    Py_ssize_t size,
                    const char *errors,
                    int limit)
{
    /* output object */
    PyObject *res;
    /* pointers to the beginning and end+1 of input */
    const Py_UNICODE *startp = p;
    const Py_UNICODE *endp = p + size;
    /* pointer into the output (next byte to write) */
    char *str;
    /* number of bytes currently allocated for the output */
    Py_ssize_t ressize;
    const char *encoding = (limit == 256) ? "latin-1" : "ascii";
    const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    /* the following variable is used for caching string comparisons
     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
    int known_errorHandler = -1;

    /* allocate enough for a simple encoding without
       replacements, if we need more, we'll resize */
    if (size == 0)
        return PyBytes_FromStringAndSize(NULL, 0);
    res = PyBytes_FromStringAndSize(NULL, size);
    if (res == NULL)
        return NULL;
    str = PyBytes_AS_STRING(res);
    ressize = size;

    while (p<endp) {
        Py_UNICODE c = *p;

        /* can we encode this? */
        if (c<limit) {
            /* no overflow check, because we know that the space is enough */
            *str++ = (char)c;
            ++p;
        }
        else {
            /* input index of the first unencodable code unit of this run */
            Py_ssize_t unicodepos = p-startp;
            Py_ssize_t requiredsize;
            PyObject *repunicode;
            Py_ssize_t repsize;
            Py_ssize_t newpos;
            /* output offset of str; saved because resizing may move res */
            Py_ssize_t respos;
            Py_UNICODE *uni2;
            /* start and end+1 of the run of unencodable chars */
            const Py_UNICODE *collstart = p;
            const Py_UNICODE *collend = p;
            /* find all unencodable characters */
            while ((collend < endp) && ((*collend)>=limit))
                ++collend;
            /* cache callback name lookup (if not done yet, i.e. it's the first error) */
            if (known_errorHandler==-1) {
                if ((errors==NULL) || (!strcmp(errors, "strict")))
                    known_errorHandler = 1;
                else if (!strcmp(errors, "replace"))
                    known_errorHandler = 2;
                else if (!strcmp(errors, "ignore"))
                    known_errorHandler = 3;
                else if (!strcmp(errors, "xmlcharrefreplace"))
                    known_errorHandler = 4;
                else
                    known_errorHandler = 0;
            }
            switch (known_errorHandler) {
            case 1: /* strict */
                raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
                goto onError;
            case 2: /* replace */
                /* one '?' per unencodable code unit, so the initial
                   one-byte-per-unit allocation is still sufficient */
                while (collstart++<collend)
                    *str++ = '?'; /* fall through */
            case 3: /* ignore */
                p = collend;
                break;
            case 4: /* xmlcharrefreplace */
                respos = str - PyBytes_AS_STRING(res);
                /* determine replacement size (temporarily (mis)uses p):
                   each term is 2 for "&#", the number of decimal digits,
                   and 1 for ";" */
                for (p = collstart, repsize = 0; p < collend; ++p) {
                    if (*p<10)
                        repsize += 2+1+1;
                    else if (*p<100)
                        repsize += 2+2+1;
                    else if (*p<1000)
                        repsize += 2+3+1;
                    else if (*p<10000)
                        repsize += 2+4+1;
#ifndef Py_UNICODE_WIDE
                    else
                        repsize += 2+5+1;
#else
                    else if (*p<100000)
                        repsize += 2+5+1;
                    else if (*p<1000000)
                        repsize += 2+6+1;
                    else
                        repsize += 2+7+1;
#endif
                }
                /* what is written so far + the replacement + one byte for
                   every remaining input code unit */
                requiredsize = respos+repsize+(endp-collend);
                if (requiredsize > ressize) {
                    /* grow at least geometrically */
                    if (requiredsize<2*ressize)
                        requiredsize = 2*ressize;
                    if (_PyBytes_Resize(&res, requiredsize))
                        goto onError;
                    str = PyBytes_AS_STRING(res) + respos;
                    ressize = requiredsize;
                }
                /* generate replacement (temporarily (mis)uses p) */
                for (p = collstart; p < collend; ++p) {
                    str += sprintf(str, "&#%d;", (int)*p);
                }
                p = collend;
                break;
            default:
                repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
                                                              encoding, reason, startp, size, &exc,
                                                              collstart-startp, collend-startp, &newpos);
                if (repunicode == NULL)
                    goto onError;
                if (PyBytes_Check(repunicode)) {
                    /* Directly copy bytes result to output. */
                    repsize = PyBytes_Size(repunicode);
                    /* NOTE(review): the buffer grows by repsize-1 only,
                       which looks like it relies on the handler resuming
                       past at least one input code unit; confirm the
                       behaviour for handlers that return
                       newpos <= collstart. */
                    if (repsize > 1) {
                        /* Make room for all additional bytes. */
                        respos = str - PyBytes_AS_STRING(res);
                        if (_PyBytes_Resize(&res, ressize+repsize-1)) {
                            Py_DECREF(repunicode);
                            goto onError;
                        }
                        str = PyBytes_AS_STRING(res) + respos;
                        ressize += repsize-1;
                    }
                    memcpy(str, PyBytes_AsString(repunicode), repsize);
                    str += repsize;
                    /* resume at the position chosen by the handler */
                    p = startp + newpos;
                    Py_DECREF(repunicode);
                    break;
                }
                /* need more space? (at least enough for what we
                   have+the replacement+the rest of the string, so
                   we won't have to check space for encodable characters) */
                /* NOTE(review): "the rest of the string" is computed from
                   collend, not from newpos; presumably handlers are
                   expected to resume at or after collend -- confirm. */
                respos = str - PyBytes_AS_STRING(res);
                repsize = PyUnicode_GET_SIZE(repunicode);
                requiredsize = respos+repsize+(endp-collend);
                if (requiredsize > ressize) {
                    if (requiredsize<2*ressize)
                        requiredsize = 2*ressize;
                    if (_PyBytes_Resize(&res, requiredsize)) {
                        Py_DECREF(repunicode);
                        goto onError;
                    }
                    str = PyBytes_AS_STRING(res) + respos;
                    ressize = requiredsize;
                }
                /* check if there is anything unencodable in the replacement
                   and copy it to the output */
                for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
                    c = *uni2;
                    if (c >= limit) {
                        /* the error is reported against the first code
                           unit of the original run, not the replacement */
                        raise_encode_exception(&exc, encoding, startp, size,
                                               unicodepos, unicodepos+1, reason);
                        Py_DECREF(repunicode);
                        goto onError;
                    }
                    *str = (char)c;
                }
                /* resume at the position chosen by the handler */
                p = startp + newpos;
                Py_DECREF(repunicode);
            }
        }
    }
    /* Resize if we allocated too much */
    size = str - PyBytes_AS_STRING(res);
    if (size < ressize) { /* If this fails res will be NULL */
        assert(size >= 0);
        if (_PyBytes_Resize(&res, size) < 0)
            goto onError;
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return res;

  onError:
    Py_XDECREF(res);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return NULL;
}
6435
Alexander Belopolsky40018472011-02-26 01:02:56 +00006436PyObject *
6437PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006438 Py_ssize_t size,
6439 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006440{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006441 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006442}
6443
Alexander Belopolsky40018472011-02-26 01:02:56 +00006444PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006445_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006446{
6447 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006448 PyErr_BadArgument();
6449 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006450 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006451 if (PyUnicode_READY(unicode) == -1)
6452 return NULL;
6453 /* Fast path: if it is a one-byte string, construct
6454 bytes object directly. */
6455 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6456 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6457 PyUnicode_GET_LENGTH(unicode));
6458 /* Non-Latin-1 characters present. Defer to above function to
6459 raise the exception. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006460 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006461 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006462 errors);
6463}
6464
6465PyObject*
6466PyUnicode_AsLatin1String(PyObject *unicode)
6467{
6468 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006469}
6470
6471/* --- 7-bit ASCII Codec -------------------------------------------------- */
6472
Alexander Belopolsky40018472011-02-26 01:02:56 +00006473PyObject *
6474PyUnicode_DecodeASCII(const char *s,
6475 Py_ssize_t size,
6476 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006477{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006478 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006479 PyUnicodeObject *v;
6480 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006481 Py_ssize_t startinpos;
6482 Py_ssize_t endinpos;
6483 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006484 const char *e;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006485 unsigned char* d;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006486 PyObject *errorHandler = NULL;
6487 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006488 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00006489
Guido van Rossumd57fd912000-03-10 22:53:23 +00006490 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006491 if (size == 1 && *(unsigned char*)s < 128)
6492 return PyUnicode_FromOrdinal(*(unsigned char*)s);
6493
6494 /* Fast path. Assume the input actually *is* ASCII, and allocate
6495 a single-block Unicode object with that assumption. If there is
6496 an error, drop the object and start over. */
6497 v = (PyUnicodeObject*)PyUnicode_New(size, 127);
6498 if (v == NULL)
6499 goto onError;
6500 d = PyUnicode_1BYTE_DATA(v);
6501 for (i = 0; i < size; i++) {
6502 unsigned char ch = ((unsigned char*)s)[i];
6503 if (ch < 128)
6504 d[i] = ch;
6505 else
6506 break;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006507 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006508 if (i == size)
6509 return (PyObject*)v;
6510 Py_DECREF(v); /* start over */
Tim Petersced69f82003-09-16 20:30:58 +00006511
Guido van Rossumd57fd912000-03-10 22:53:23 +00006512 v = _PyUnicode_New(size);
6513 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006514 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006515 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006516 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006517 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006518 e = s + size;
6519 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006520 register unsigned char c = (unsigned char)*s;
6521 if (c < 128) {
6522 *p++ = c;
6523 ++s;
6524 }
6525 else {
6526 startinpos = s-starts;
6527 endinpos = startinpos + 1;
6528 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
6529 if (unicode_decode_call_errorhandler(
6530 errors, &errorHandler,
6531 "ascii", "ordinal not in range(128)",
6532 &starts, &e, &startinpos, &endinpos, &exc, &s,
6533 &v, &outpos, &p))
6534 goto onError;
6535 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006536 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00006537 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Victor Stinnerfe226c02011-10-03 03:52:20 +02006538 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006539 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006540 Py_XDECREF(errorHandler);
6541 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02006542#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006543 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006544 Py_DECREF(v);
6545 return NULL;
6546 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006547#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00006548 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006549
Benjamin Peterson29060642009-01-31 22:14:21 +00006550 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006551 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006552 Py_XDECREF(errorHandler);
6553 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006554 return NULL;
6555}
6556
Alexander Belopolsky40018472011-02-26 01:02:56 +00006557PyObject *
6558PyUnicode_EncodeASCII(const Py_UNICODE *p,
6559 Py_ssize_t size,
6560 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006561{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006562 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006563}
6564
Alexander Belopolsky40018472011-02-26 01:02:56 +00006565PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006566_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006567{
6568 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006569 PyErr_BadArgument();
6570 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006571 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006572 if (PyUnicode_READY(unicode) == -1)
6573 return NULL;
6574 /* Fast path: if it is an ASCII-only string, construct bytes object
6575 directly. Else defer to above function to raise the exception. */
6576 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6577 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6578 PyUnicode_GET_LENGTH(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006579 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006580 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006581 errors);
6582}
6583
6584PyObject *
6585PyUnicode_AsASCIIString(PyObject *unicode)
6586{
6587 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006588}
6589
Victor Stinner99b95382011-07-04 14:23:54 +02006590#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006591
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006592/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006593
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006594#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006595#define NEED_RETRY
6596#endif
6597
6598/* XXX This code is limited to "true" double-byte encodings, as
6599 a) it assumes an incomplete character consists of a single byte, and
6600 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00006601 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006602
/* Return non-zero when the byte at s[offset] should be treated as a DBCS
   lead byte.

   The byte must satisfy IsDBCSLeadByte().  It is then accepted when
   CharPrev() cannot step back from it, when the previous character does not
   start with a lead byte, or when the previous character is exactly two
   bytes before it.  Returns 0 otherwise. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *here = s + offset;
    const char *before;

    if (!IsDBCSLeadByte(*here))
        return 0;

    before = CharPrev(s, here);
    if (before == here)
        return 1;
    if (!IsDBCSLeadByte(*before))
        return 1;
    return here - before == 2;
}
6614
6615/*
6616 * Decode MBCS string into unicode object. If 'final' is set, converts
6617 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
6618 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006619static int
6620decode_mbcs(PyUnicodeObject **v,
6621 const char *s, /* MBCS string */
6622 int size, /* sizeof MBCS string */
6623 int final,
6624 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006625{
6626 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00006627 Py_ssize_t n;
6628 DWORD usize;
6629 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006630
6631 assert(size >= 0);
6632
Victor Stinner554f3f02010-06-16 23:33:54 +00006633 /* check and handle 'errors' arg */
6634 if (errors==NULL || strcmp(errors, "strict")==0)
6635 flags = MB_ERR_INVALID_CHARS;
6636 else if (strcmp(errors, "ignore")==0)
6637 flags = 0;
6638 else {
6639 PyErr_Format(PyExc_ValueError,
6640 "mbcs encoding does not support errors='%s'",
6641 errors);
6642 return -1;
6643 }
6644
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006645 /* Skip trailing lead-byte unless 'final' is set */
6646 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006647 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006648
6649 /* First get the size of the result */
6650 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006651 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
6652 if (usize==0)
6653 goto mbcs_decode_error;
6654 } else
6655 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006656
6657 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006658 /* Create unicode object */
6659 *v = _PyUnicode_New(usize);
6660 if (*v == NULL)
6661 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006662 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006663 }
6664 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006665 /* Extend unicode object */
6666 n = PyUnicode_GET_SIZE(*v);
Victor Stinner2fd82272011-10-03 04:06:05 +02006667 if (PyUnicode_Resize((PyObject**)v, n + usize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006668 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006669 }
6670
6671 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00006672 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006673 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006674 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
6675 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00006676 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006677 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006678 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00006679
6680mbcs_decode_error:
6681 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
6682 we raise a UnicodeDecodeError - else it is a 'generic'
6683 windows error
6684 */
6685 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
6686 /* Ideally, we should get reason from FormatMessage - this
6687 is the Windows 2000 English version of the message
6688 */
6689 PyObject *exc = NULL;
6690 const char *reason = "No mapping for the Unicode character exists "
6691 "in the target multi-byte code page.";
6692 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
6693 if (exc != NULL) {
6694 PyCodec_StrictErrors(exc);
6695 Py_DECREF(exc);
6696 }
6697 } else {
6698 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6699 }
6700 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006701}
6702
Alexander Belopolsky40018472011-02-26 01:02:56 +00006703PyObject *
6704PyUnicode_DecodeMBCSStateful(const char *s,
6705 Py_ssize_t size,
6706 const char *errors,
6707 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006708{
6709 PyUnicodeObject *v = NULL;
6710 int done;
6711
6712 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006713 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006714
6715#ifdef NEED_RETRY
6716 retry:
6717 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006718 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006719 else
6720#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006721 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006722
6723 if (done < 0) {
6724 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006725 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006726 }
6727
6728 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006729 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006730
6731#ifdef NEED_RETRY
6732 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006733 s += done;
6734 size -= done;
6735 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006736 }
6737#endif
Victor Stinner17efeed2011-10-04 20:05:46 +02006738#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006739 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006740 Py_DECREF(v);
6741 return NULL;
6742 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006743#endif
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006744 return (PyObject *)v;
6745}
6746
Alexander Belopolsky40018472011-02-26 01:02:56 +00006747PyObject *
6748PyUnicode_DecodeMBCS(const char *s,
6749 Py_ssize_t size,
6750 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006751{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006752 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
6753}
6754
6755/*
6756 * Convert unicode into string object (MBCS).
6757 * Returns 0 if succeed, -1 otherwise.
6758 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006759static int
6760encode_mbcs(PyObject **repr,
6761 const Py_UNICODE *p, /* unicode */
6762 int size, /* size of unicode */
6763 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006764{
Victor Stinner554f3f02010-06-16 23:33:54 +00006765 BOOL usedDefaultChar = FALSE;
6766 BOOL *pusedDefaultChar;
6767 int mbcssize;
6768 Py_ssize_t n;
6769 PyObject *exc = NULL;
6770 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006771
6772 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006773
Victor Stinner554f3f02010-06-16 23:33:54 +00006774 /* check and handle 'errors' arg */
6775 if (errors==NULL || strcmp(errors, "strict")==0) {
6776 flags = WC_NO_BEST_FIT_CHARS;
6777 pusedDefaultChar = &usedDefaultChar;
6778 } else if (strcmp(errors, "replace")==0) {
6779 flags = 0;
6780 pusedDefaultChar = NULL;
6781 } else {
6782 PyErr_Format(PyExc_ValueError,
6783 "mbcs encoding does not support errors='%s'",
6784 errors);
6785 return -1;
6786 }
6787
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006788 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006789 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006790 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
6791 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00006792 if (mbcssize == 0) {
6793 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6794 return -1;
6795 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006796 /* If we used a default char, then we failed! */
6797 if (pusedDefaultChar && *pusedDefaultChar)
6798 goto mbcs_encode_error;
6799 } else {
6800 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006801 }
6802
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006803 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006804 /* Create string object */
6805 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
6806 if (*repr == NULL)
6807 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006808 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006809 }
6810 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006811 /* Extend string object */
6812 n = PyBytes_Size(*repr);
6813 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
6814 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006815 }
6816
6817 /* Do the conversion */
6818 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006819 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006820 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
6821 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006822 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6823 return -1;
6824 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006825 if (pusedDefaultChar && *pusedDefaultChar)
6826 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006827 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006828 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00006829
6830mbcs_encode_error:
6831 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
6832 Py_XDECREF(exc);
6833 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006834}
6835
Alexander Belopolsky40018472011-02-26 01:02:56 +00006836PyObject *
6837PyUnicode_EncodeMBCS(const Py_UNICODE *p,
6838 Py_ssize_t size,
6839 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006840{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006841 PyObject *repr = NULL;
6842 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00006843
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006844#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00006845 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006846 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006847 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006848 else
6849#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006850 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006851
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006852 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006853 Py_XDECREF(repr);
6854 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006855 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006856
6857#ifdef NEED_RETRY
6858 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006859 p += INT_MAX;
6860 size -= INT_MAX;
6861 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006862 }
6863#endif
6864
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006865 return repr;
6866}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006867
Alexander Belopolsky40018472011-02-26 01:02:56 +00006868PyObject *
6869PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006870{
6871 if (!PyUnicode_Check(unicode)) {
6872 PyErr_BadArgument();
6873 return NULL;
6874 }
6875 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006876 PyUnicode_GET_SIZE(unicode),
6877 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006878}
6879
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006880#undef NEED_RETRY
6881
Victor Stinner99b95382011-07-04 14:23:54 +02006882#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006883
Guido van Rossumd57fd912000-03-10 22:53:23 +00006884/* --- Character Mapping Codec -------------------------------------------- */
6885
Alexander Belopolsky40018472011-02-26 01:02:56 +00006886PyObject *
6887PyUnicode_DecodeCharmap(const char *s,
6888 Py_ssize_t size,
6889 PyObject *mapping,
6890 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006891{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006892 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006893 Py_ssize_t startinpos;
6894 Py_ssize_t endinpos;
6895 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006896 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006897 PyUnicodeObject *v;
6898 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006899 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006900 PyObject *errorHandler = NULL;
6901 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006902 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006903 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006904
Guido van Rossumd57fd912000-03-10 22:53:23 +00006905 /* Default to Latin-1 */
6906 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006907 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006908
6909 v = _PyUnicode_New(size);
6910 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006911 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006912 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006913 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006914 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006915 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006916 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006917 mapstring = PyUnicode_AS_UNICODE(mapping);
6918 maplen = PyUnicode_GET_SIZE(mapping);
6919 while (s < e) {
6920 unsigned char ch = *s;
6921 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006922
Benjamin Peterson29060642009-01-31 22:14:21 +00006923 if (ch < maplen)
6924 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006925
Benjamin Peterson29060642009-01-31 22:14:21 +00006926 if (x == 0xfffe) {
6927 /* undefined mapping */
6928 outpos = p-PyUnicode_AS_UNICODE(v);
6929 startinpos = s-starts;
6930 endinpos = startinpos+1;
6931 if (unicode_decode_call_errorhandler(
6932 errors, &errorHandler,
6933 "charmap", "character maps to <undefined>",
6934 &starts, &e, &startinpos, &endinpos, &exc, &s,
6935 &v, &outpos, &p)) {
6936 goto onError;
6937 }
6938 continue;
6939 }
6940 *p++ = x;
6941 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006942 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006943 }
6944 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006945 while (s < e) {
6946 unsigned char ch = *s;
6947 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006948
Benjamin Peterson29060642009-01-31 22:14:21 +00006949 /* Get mapping (char ordinal -> integer, Unicode char or None) */
6950 w = PyLong_FromLong((long)ch);
6951 if (w == NULL)
6952 goto onError;
6953 x = PyObject_GetItem(mapping, w);
6954 Py_DECREF(w);
6955 if (x == NULL) {
6956 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
6957 /* No mapping found means: mapping is undefined. */
6958 PyErr_Clear();
6959 x = Py_None;
6960 Py_INCREF(x);
6961 } else
6962 goto onError;
6963 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006964
Benjamin Peterson29060642009-01-31 22:14:21 +00006965 /* Apply mapping */
6966 if (PyLong_Check(x)) {
6967 long value = PyLong_AS_LONG(x);
6968 if (value < 0 || value > 65535) {
6969 PyErr_SetString(PyExc_TypeError,
6970 "character mapping must be in range(65536)");
6971 Py_DECREF(x);
6972 goto onError;
6973 }
6974 *p++ = (Py_UNICODE)value;
6975 }
6976 else if (x == Py_None) {
6977 /* undefined mapping */
6978 outpos = p-PyUnicode_AS_UNICODE(v);
6979 startinpos = s-starts;
6980 endinpos = startinpos+1;
6981 if (unicode_decode_call_errorhandler(
6982 errors, &errorHandler,
6983 "charmap", "character maps to <undefined>",
6984 &starts, &e, &startinpos, &endinpos, &exc, &s,
6985 &v, &outpos, &p)) {
6986 Py_DECREF(x);
6987 goto onError;
6988 }
6989 Py_DECREF(x);
6990 continue;
6991 }
6992 else if (PyUnicode_Check(x)) {
6993 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006994
Benjamin Peterson29060642009-01-31 22:14:21 +00006995 if (targetsize == 1)
6996 /* 1-1 mapping */
6997 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006998
Benjamin Peterson29060642009-01-31 22:14:21 +00006999 else if (targetsize > 1) {
7000 /* 1-n mapping */
7001 if (targetsize > extrachars) {
7002 /* resize first */
7003 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
7004 Py_ssize_t needed = (targetsize - extrachars) + \
7005 (targetsize << 2);
7006 extrachars += needed;
7007 /* XXX overflow detection missing */
Victor Stinnerfe226c02011-10-03 03:52:20 +02007008 if (PyUnicode_Resize((PyObject**)&v,
Benjamin Peterson29060642009-01-31 22:14:21 +00007009 PyUnicode_GET_SIZE(v) + needed) < 0) {
7010 Py_DECREF(x);
7011 goto onError;
7012 }
7013 p = PyUnicode_AS_UNICODE(v) + oldpos;
7014 }
7015 Py_UNICODE_COPY(p,
7016 PyUnicode_AS_UNICODE(x),
7017 targetsize);
7018 p += targetsize;
7019 extrachars -= targetsize;
7020 }
7021 /* 1-0 mapping: skip the character */
7022 }
7023 else {
7024 /* wrong return value */
7025 PyErr_SetString(PyExc_TypeError,
7026 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007027 Py_DECREF(x);
7028 goto onError;
7029 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007030 Py_DECREF(x);
7031 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007032 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007033 }
7034 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Victor Stinnerfe226c02011-10-03 03:52:20 +02007035 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007036 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007037 Py_XDECREF(errorHandler);
7038 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02007039#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02007040 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007041 Py_DECREF(v);
7042 return NULL;
7043 }
Victor Stinner17efeed2011-10-04 20:05:46 +02007044#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00007045 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00007046
Benjamin Peterson29060642009-01-31 22:14:21 +00007047 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007048 Py_XDECREF(errorHandler);
7049 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007050 Py_XDECREF(v);
7051 return NULL;
7052}
7053
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007054/* Charmap encoding: the lookup table */
7055
Alexander Belopolsky40018472011-02-26 01:02:56 +00007056struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007057 PyObject_HEAD
7058 unsigned char level1[32];
7059 int count2, count3;
7060 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007061};
7062
7063static PyObject*
7064encoding_map_size(PyObject *obj, PyObject* args)
7065{
7066 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007067 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007068 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007069}
7070
7071static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007072 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007073 PyDoc_STR("Return the size (in bytes) of this object") },
7074 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007075};
7076
7077static void
7078encoding_map_dealloc(PyObject* o)
7079{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007080 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007081}
7082
7083static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007084 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007085 "EncodingMap", /*tp_name*/
7086 sizeof(struct encoding_map), /*tp_basicsize*/
7087 0, /*tp_itemsize*/
7088 /* methods */
7089 encoding_map_dealloc, /*tp_dealloc*/
7090 0, /*tp_print*/
7091 0, /*tp_getattr*/
7092 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007093 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007094 0, /*tp_repr*/
7095 0, /*tp_as_number*/
7096 0, /*tp_as_sequence*/
7097 0, /*tp_as_mapping*/
7098 0, /*tp_hash*/
7099 0, /*tp_call*/
7100 0, /*tp_str*/
7101 0, /*tp_getattro*/
7102 0, /*tp_setattro*/
7103 0, /*tp_as_buffer*/
7104 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7105 0, /*tp_doc*/
7106 0, /*tp_traverse*/
7107 0, /*tp_clear*/
7108 0, /*tp_richcompare*/
7109 0, /*tp_weaklistoffset*/
7110 0, /*tp_iter*/
7111 0, /*tp_iternext*/
7112 encoding_map_methods, /*tp_methods*/
7113 0, /*tp_members*/
7114 0, /*tp_getset*/
7115 0, /*tp_base*/
7116 0, /*tp_dict*/
7117 0, /*tp_descr_get*/
7118 0, /*tp_descr_set*/
7119 0, /*tp_dictoffset*/
7120 0, /*tp_init*/
7121 0, /*tp_alloc*/
7122 0, /*tp_new*/
7123 0, /*tp_free*/
7124 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007125};
7126
7127PyObject*
7128PyUnicode_BuildEncodingMap(PyObject* string)
7129{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007130 PyObject *result;
7131 struct encoding_map *mresult;
7132 int i;
7133 int need_dict = 0;
7134 unsigned char level1[32];
7135 unsigned char level2[512];
7136 unsigned char *mlevel1, *mlevel2, *mlevel3;
7137 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007138 int kind;
7139 void *data;
7140 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007141
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007142 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007143 PyErr_BadArgument();
7144 return NULL;
7145 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007146 kind = PyUnicode_KIND(string);
7147 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007148 memset(level1, 0xFF, sizeof level1);
7149 memset(level2, 0xFF, sizeof level2);
7150
7151 /* If there isn't a one-to-one mapping of NULL to \0,
7152 or if there are non-BMP characters, we need to use
7153 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007154 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007155 need_dict = 1;
7156 for (i = 1; i < 256; i++) {
7157 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007158 ch = PyUnicode_READ(kind, data, i);
7159 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007160 need_dict = 1;
7161 break;
7162 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007163 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007164 /* unmapped character */
7165 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007166 l1 = ch >> 11;
7167 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007168 if (level1[l1] == 0xFF)
7169 level1[l1] = count2++;
7170 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007171 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007172 }
7173
7174 if (count2 >= 0xFF || count3 >= 0xFF)
7175 need_dict = 1;
7176
7177 if (need_dict) {
7178 PyObject *result = PyDict_New();
7179 PyObject *key, *value;
7180 if (!result)
7181 return NULL;
7182 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007183 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007184 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007185 if (!key || !value)
7186 goto failed1;
7187 if (PyDict_SetItem(result, key, value) == -1)
7188 goto failed1;
7189 Py_DECREF(key);
7190 Py_DECREF(value);
7191 }
7192 return result;
7193 failed1:
7194 Py_XDECREF(key);
7195 Py_XDECREF(value);
7196 Py_DECREF(result);
7197 return NULL;
7198 }
7199
7200 /* Create a three-level trie */
7201 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7202 16*count2 + 128*count3 - 1);
7203 if (!result)
7204 return PyErr_NoMemory();
7205 PyObject_Init(result, &EncodingMapType);
7206 mresult = (struct encoding_map*)result;
7207 mresult->count2 = count2;
7208 mresult->count3 = count3;
7209 mlevel1 = mresult->level1;
7210 mlevel2 = mresult->level23;
7211 mlevel3 = mresult->level23 + 16*count2;
7212 memcpy(mlevel1, level1, 32);
7213 memset(mlevel2, 0xFF, 16*count2);
7214 memset(mlevel3, 0, 128*count3);
7215 count3 = 0;
7216 for (i = 1; i < 256; i++) {
7217 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007218 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007219 /* unmapped character */
7220 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007221 o1 = PyUnicode_READ(kind, data, i)>>11;
7222 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007223 i2 = 16*mlevel1[o1] + o2;
7224 if (mlevel2[i2] == 0xFF)
7225 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007226 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007227 i3 = 128*mlevel2[i2] + o3;
7228 mlevel3[i3] = i;
7229 }
7230 return result;
7231}
7232
7233static int
7234encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
7235{
7236 struct encoding_map *map = (struct encoding_map*)mapping;
7237 int l1 = c>>11;
7238 int l2 = (c>>7) & 0xF;
7239 int l3 = c & 0x7F;
7240 int i;
7241
7242#ifdef Py_UNICODE_WIDE
7243 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007244 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007245 }
7246#endif
7247 if (c == 0)
7248 return 0;
7249 /* level 1*/
7250 i = map->level1[l1];
7251 if (i == 0xFF) {
7252 return -1;
7253 }
7254 /* level 2*/
7255 i = map->level23[16*i+l2];
7256 if (i == 0xFF) {
7257 return -1;
7258 }
7259 /* level 3 */
7260 i = map->level23[16*map->count2 + 128*i + l3];
7261 if (i == 0) {
7262 return -1;
7263 }
7264 return i;
7265}
7266
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007267/* Lookup the character ch in the mapping. If the character
7268 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007269 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007270static PyObject *
7271charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007272{
Christian Heimes217cfd12007-12-02 14:31:20 +00007273 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007274 PyObject *x;
7275
7276 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007277 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007278 x = PyObject_GetItem(mapping, w);
7279 Py_DECREF(w);
7280 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007281 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7282 /* No mapping found means: mapping is undefined. */
7283 PyErr_Clear();
7284 x = Py_None;
7285 Py_INCREF(x);
7286 return x;
7287 } else
7288 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007289 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007290 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007291 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007292 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007293 long value = PyLong_AS_LONG(x);
7294 if (value < 0 || value > 255) {
7295 PyErr_SetString(PyExc_TypeError,
7296 "character mapping must be in range(256)");
7297 Py_DECREF(x);
7298 return NULL;
7299 }
7300 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007301 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007302 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007303 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007304 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007305 /* wrong return value */
7306 PyErr_Format(PyExc_TypeError,
7307 "character mapping must return integer, bytes or None, not %.400s",
7308 x->ob_type->tp_name);
7309 Py_DECREF(x);
7310 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007311 }
7312}
7313
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007314static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00007315charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007316{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007317 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7318 /* exponentially overallocate to minimize reallocations */
7319 if (requiredsize < 2*outsize)
7320 requiredsize = 2*outsize;
7321 if (_PyBytes_Resize(outobj, requiredsize))
7322 return -1;
7323 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007324}
7325
/* Outcome of encoding one character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,
    enc_FAILED,
    enc_EXCEPTION
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007329/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00007330 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007331 space is available. Return a new reference to the object that
7332 was put in the output buffer, or Py_None, if the mapping was undefined
7333 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00007334 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007335static charmapencode_result
7336charmapencode_output(Py_UNICODE c, PyObject *mapping,
7337 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007338{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007339 PyObject *rep;
7340 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00007341 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007342
Christian Heimes90aa7642007-12-19 02:45:37 +00007343 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007344 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007345 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007346 if (res == -1)
7347 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00007348 if (outsize<requiredsize)
7349 if (charmapencode_resize(outobj, outpos, requiredsize))
7350 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00007351 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007352 outstart[(*outpos)++] = (char)res;
7353 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007354 }
7355
7356 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007357 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007358 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007359 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007360 Py_DECREF(rep);
7361 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007362 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007363 if (PyLong_Check(rep)) {
7364 Py_ssize_t requiredsize = *outpos+1;
7365 if (outsize<requiredsize)
7366 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7367 Py_DECREF(rep);
7368 return enc_EXCEPTION;
7369 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007370 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007371 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007372 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007373 else {
7374 const char *repchars = PyBytes_AS_STRING(rep);
7375 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
7376 Py_ssize_t requiredsize = *outpos+repsize;
7377 if (outsize<requiredsize)
7378 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7379 Py_DECREF(rep);
7380 return enc_EXCEPTION;
7381 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007382 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007383 memcpy(outstart + *outpos, repchars, repsize);
7384 *outpos += repsize;
7385 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007386 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007387 Py_DECREF(rep);
7388 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007389}
7390
7391/* handle an error in PyUnicode_EncodeCharmap
7392 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007393static int
7394charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00007395 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007396 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00007397 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00007398 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007399{
7400 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007401 Py_ssize_t repsize;
7402 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007403 Py_UNICODE *uni2;
7404 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007405 Py_ssize_t collstartpos = *inpos;
7406 Py_ssize_t collendpos = *inpos+1;
7407 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007408 char *encoding = "charmap";
7409 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007410 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007411
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007412 /* find all unencodable characters */
7413 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007414 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00007415 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007416 int res = encoding_map_lookup(p[collendpos], mapping);
7417 if (res != -1)
7418 break;
7419 ++collendpos;
7420 continue;
7421 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007422
Benjamin Peterson29060642009-01-31 22:14:21 +00007423 rep = charmapencode_lookup(p[collendpos], mapping);
7424 if (rep==NULL)
7425 return -1;
7426 else if (rep!=Py_None) {
7427 Py_DECREF(rep);
7428 break;
7429 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007430 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00007431 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007432 }
7433 /* cache callback name lookup
7434 * (if not done yet, i.e. it's the first error) */
7435 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007436 if ((errors==NULL) || (!strcmp(errors, "strict")))
7437 *known_errorHandler = 1;
7438 else if (!strcmp(errors, "replace"))
7439 *known_errorHandler = 2;
7440 else if (!strcmp(errors, "ignore"))
7441 *known_errorHandler = 3;
7442 else if (!strcmp(errors, "xmlcharrefreplace"))
7443 *known_errorHandler = 4;
7444 else
7445 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007446 }
7447 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007448 case 1: /* strict */
7449 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7450 return -1;
7451 case 2: /* replace */
7452 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007453 x = charmapencode_output('?', mapping, res, respos);
7454 if (x==enc_EXCEPTION) {
7455 return -1;
7456 }
7457 else if (x==enc_FAILED) {
7458 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7459 return -1;
7460 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007461 }
7462 /* fall through */
7463 case 3: /* ignore */
7464 *inpos = collendpos;
7465 break;
7466 case 4: /* xmlcharrefreplace */
7467 /* generate replacement (temporarily (mis)uses p) */
7468 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007469 char buffer[2+29+1+1];
7470 char *cp;
7471 sprintf(buffer, "&#%d;", (int)p[collpos]);
7472 for (cp = buffer; *cp; ++cp) {
7473 x = charmapencode_output(*cp, mapping, res, respos);
7474 if (x==enc_EXCEPTION)
7475 return -1;
7476 else if (x==enc_FAILED) {
7477 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7478 return -1;
7479 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007480 }
7481 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007482 *inpos = collendpos;
7483 break;
7484 default:
7485 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00007486 encoding, reason, p, size, exceptionObject,
7487 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007488 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007489 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00007490 if (PyBytes_Check(repunicode)) {
7491 /* Directly copy bytes result to output. */
7492 Py_ssize_t outsize = PyBytes_Size(*res);
7493 Py_ssize_t requiredsize;
7494 repsize = PyBytes_Size(repunicode);
7495 requiredsize = *respos + repsize;
7496 if (requiredsize > outsize)
7497 /* Make room for all additional bytes. */
7498 if (charmapencode_resize(res, respos, requiredsize)) {
7499 Py_DECREF(repunicode);
7500 return -1;
7501 }
7502 memcpy(PyBytes_AsString(*res) + *respos,
7503 PyBytes_AsString(repunicode), repsize);
7504 *respos += repsize;
7505 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007506 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00007507 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007508 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007509 /* generate replacement */
7510 repsize = PyUnicode_GET_SIZE(repunicode);
7511 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007512 x = charmapencode_output(*uni2, mapping, res, respos);
7513 if (x==enc_EXCEPTION) {
7514 return -1;
7515 }
7516 else if (x==enc_FAILED) {
7517 Py_DECREF(repunicode);
7518 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7519 return -1;
7520 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007521 }
7522 *inpos = newpos;
7523 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007524 }
7525 return 0;
7526}
7527
Alexander Belopolsky40018472011-02-26 01:02:56 +00007528PyObject *
7529PyUnicode_EncodeCharmap(const Py_UNICODE *p,
7530 Py_ssize_t size,
7531 PyObject *mapping,
7532 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007533{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007534 /* output object */
7535 PyObject *res = NULL;
7536 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007537 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007538 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007539 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007540 PyObject *errorHandler = NULL;
7541 PyObject *exc = NULL;
7542 /* the following variable is used for caching string comparisons
7543 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7544 * 3=ignore, 4=xmlcharrefreplace */
7545 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007546
7547 /* Default to Latin-1 */
7548 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007549 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007550
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007551 /* allocate enough for a simple encoding without
7552 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00007553 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007554 if (res == NULL)
7555 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00007556 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007557 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007558
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007559 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007560 /* try to encode it */
7561 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
7562 if (x==enc_EXCEPTION) /* error */
7563 goto onError;
7564 if (x==enc_FAILED) { /* unencodable character */
7565 if (charmap_encoding_error(p, size, &inpos, mapping,
7566 &exc,
7567 &known_errorHandler, &errorHandler, errors,
7568 &res, &respos)) {
7569 goto onError;
7570 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007571 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007572 else
7573 /* done with this character => adjust input position */
7574 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007575 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007576
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007577 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00007578 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007579 if (_PyBytes_Resize(&res, respos) < 0)
7580 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00007581
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007582 Py_XDECREF(exc);
7583 Py_XDECREF(errorHandler);
7584 return res;
7585
Benjamin Peterson29060642009-01-31 22:14:21 +00007586 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007587 Py_XDECREF(res);
7588 Py_XDECREF(exc);
7589 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007590 return NULL;
7591}
7592
Alexander Belopolsky40018472011-02-26 01:02:56 +00007593PyObject *
7594PyUnicode_AsCharmapString(PyObject *unicode,
7595 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007596{
7597 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007598 PyErr_BadArgument();
7599 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007600 }
7601 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00007602 PyUnicode_GET_SIZE(unicode),
7603 mapping,
7604 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007605}
7606
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007607/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007608static void
7609make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007610 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007611 Py_ssize_t startpos, Py_ssize_t endpos,
7612 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007613{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007614 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007615 *exceptionObject = _PyUnicodeTranslateError_Create(
7616 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007617 }
7618 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007619 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
7620 goto onError;
7621 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
7622 goto onError;
7623 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
7624 goto onError;
7625 return;
7626 onError:
7627 Py_DECREF(*exceptionObject);
7628 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007629 }
7630}
7631
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007632/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007633static void
7634raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007635 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007636 Py_ssize_t startpos, Py_ssize_t endpos,
7637 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007638{
7639 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007640 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007641 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007642 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007643}
7644
7645/* error handling callback helper:
7646 build arguments, call the callback and check the arguments,
7647 put the result into newpos and return the replacement string, which
7648 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007649static PyObject *
7650unicode_translate_call_errorhandler(const char *errors,
7651 PyObject **errorHandler,
7652 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007653 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007654 Py_ssize_t startpos, Py_ssize_t endpos,
7655 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007656{
Benjamin Peterson142957c2008-07-04 19:55:29 +00007657 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007658
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007659 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007660 PyObject *restuple;
7661 PyObject *resunicode;
7662
7663 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007664 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007665 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007666 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007667 }
7668
7669 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007670 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007671 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007672 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007673
7674 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00007675 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007676 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007677 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007678 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00007679 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00007680 Py_DECREF(restuple);
7681 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007682 }
7683 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00007684 &resunicode, &i_newpos)) {
7685 Py_DECREF(restuple);
7686 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007687 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007688 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007689 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007690 else
7691 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007692 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007693 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7694 Py_DECREF(restuple);
7695 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00007696 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007697 Py_INCREF(resunicode);
7698 Py_DECREF(restuple);
7699 return resunicode;
7700}
7701
7702/* Lookup the character ch in the mapping and put the result in result,
7703 which must be decrefed by the caller.
7704 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007705static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007706charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007707{
Christian Heimes217cfd12007-12-02 14:31:20 +00007708 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007709 PyObject *x;
7710
7711 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007712 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007713 x = PyObject_GetItem(mapping, w);
7714 Py_DECREF(w);
7715 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007716 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7717 /* No mapping found means: use 1:1 mapping. */
7718 PyErr_Clear();
7719 *result = NULL;
7720 return 0;
7721 } else
7722 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007723 }
7724 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007725 *result = x;
7726 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007727 }
Christian Heimes217cfd12007-12-02 14:31:20 +00007728 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007729 long value = PyLong_AS_LONG(x);
7730 long max = PyUnicode_GetMax();
7731 if (value < 0 || value > max) {
7732 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00007733 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00007734 Py_DECREF(x);
7735 return -1;
7736 }
7737 *result = x;
7738 return 0;
7739 }
7740 else if (PyUnicode_Check(x)) {
7741 *result = x;
7742 return 0;
7743 }
7744 else {
7745 /* wrong return value */
7746 PyErr_SetString(PyExc_TypeError,
7747 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007748 Py_DECREF(x);
7749 return -1;
7750 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007751}
7752/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00007753 if not reallocate and adjust various state variables.
7754 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007755static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007756charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00007757 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007758{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007759 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00007760 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007761 /* exponentially overallocate to minimize reallocations */
7762 if (requiredsize < 2 * oldsize)
7763 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007764 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
7765 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007766 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007767 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007768 }
7769 return 0;
7770}
7771/* lookup the character, put the result in the output string and adjust
7772 various state variables. Return a new reference to the object that
7773 was put in the output buffer in *result, or Py_None, if the mapping was
7774 undefined (in which case no character was written).
7775 The called must decref result.
7776 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007777static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007778charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
7779 PyObject *mapping, Py_UCS4 **output,
7780 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007781 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007782{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007783 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
7784 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00007785 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007786 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007787 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007788 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007789 }
7790 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007791 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00007792 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007793 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007794 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007795 }
7796 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007797 Py_ssize_t repsize;
7798 if (PyUnicode_READY(*res) == -1)
7799 return -1;
7800 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00007801 if (repsize==1) {
7802 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007803 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00007804 }
7805 else if (repsize!=0) {
7806 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007807 Py_ssize_t requiredsize = *opos +
7808 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00007809 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007810 Py_ssize_t i;
7811 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007812 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007813 for(i = 0; i < repsize; i++)
7814 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00007815 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007816 }
7817 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007818 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007819 return 0;
7820}
7821
Alexander Belopolsky40018472011-02-26 01:02:56 +00007822PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007823_PyUnicode_TranslateCharmap(PyObject *input,
7824 PyObject *mapping,
7825 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007826{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007827 /* input object */
7828 char *idata;
7829 Py_ssize_t size, i;
7830 int kind;
7831 /* output buffer */
7832 Py_UCS4 *output = NULL;
7833 Py_ssize_t osize;
7834 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007835 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007836 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007837 char *reason = "character maps to <undefined>";
7838 PyObject *errorHandler = NULL;
7839 PyObject *exc = NULL;
7840 /* the following variable is used for caching string comparisons
7841 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7842 * 3=ignore, 4=xmlcharrefreplace */
7843 int known_errorHandler = -1;
7844
Guido van Rossumd57fd912000-03-10 22:53:23 +00007845 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007846 PyErr_BadArgument();
7847 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007848 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007849
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007850 if (PyUnicode_READY(input) == -1)
7851 return NULL;
7852 idata = (char*)PyUnicode_DATA(input);
7853 kind = PyUnicode_KIND(input);
7854 size = PyUnicode_GET_LENGTH(input);
7855 i = 0;
7856
7857 if (size == 0) {
7858 Py_INCREF(input);
7859 return input;
7860 }
7861
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007862 /* allocate enough for a simple 1:1 translation without
7863 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007864 osize = size;
7865 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
7866 opos = 0;
7867 if (output == NULL) {
7868 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00007869 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007870 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007871
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007872 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007873 /* try to encode it */
7874 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007875 if (charmaptranslate_output(input, i, mapping,
7876 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007877 Py_XDECREF(x);
7878 goto onError;
7879 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007880 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00007881 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007882 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00007883 else { /* untranslatable character */
7884 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
7885 Py_ssize_t repsize;
7886 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007887 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00007888 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007889 Py_ssize_t collstart = i;
7890 Py_ssize_t collend = i+1;
7891 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007892
Benjamin Peterson29060642009-01-31 22:14:21 +00007893 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007894 while (collend < size) {
7895 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007896 goto onError;
7897 Py_XDECREF(x);
7898 if (x!=Py_None)
7899 break;
7900 ++collend;
7901 }
7902 /* cache callback name lookup
7903 * (if not done yet, i.e. it's the first error) */
7904 if (known_errorHandler==-1) {
7905 if ((errors==NULL) || (!strcmp(errors, "strict")))
7906 known_errorHandler = 1;
7907 else if (!strcmp(errors, "replace"))
7908 known_errorHandler = 2;
7909 else if (!strcmp(errors, "ignore"))
7910 known_errorHandler = 3;
7911 else if (!strcmp(errors, "xmlcharrefreplace"))
7912 known_errorHandler = 4;
7913 else
7914 known_errorHandler = 0;
7915 }
7916 switch (known_errorHandler) {
7917 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007918 raise_translate_exception(&exc, input, collstart,
7919 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007920 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007921 case 2: /* replace */
7922 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007923 for (coll = collstart; coll<collend; coll++)
7924 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00007925 /* fall through */
7926 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007927 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007928 break;
7929 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007930 /* generate replacement (temporarily (mis)uses i) */
7931 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007932 char buffer[2+29+1+1];
7933 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007934 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
7935 if (charmaptranslate_makespace(&output, &osize,
7936 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00007937 goto onError;
7938 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007939 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00007940 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007941 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007942 break;
7943 default:
7944 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007945 reason, input, &exc,
7946 collstart, collend, &newpos);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02007947 if (repunicode == NULL || _PyUnicode_READY_REPLACE(&repunicode))
Benjamin Peterson29060642009-01-31 22:14:21 +00007948 goto onError;
7949 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007950 repsize = PyUnicode_GET_LENGTH(repunicode);
7951 if (charmaptranslate_makespace(&output, &osize,
7952 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007953 Py_DECREF(repunicode);
7954 goto onError;
7955 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007956 for (uni2 = 0; repsize-->0; ++uni2)
7957 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
7958 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00007959 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007960 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007961 }
7962 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007963 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
7964 if (!res)
7965 goto onError;
7966 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007967 Py_XDECREF(exc);
7968 Py_XDECREF(errorHandler);
7969 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007970
Benjamin Peterson29060642009-01-31 22:14:21 +00007971 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007972 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007973 Py_XDECREF(exc);
7974 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007975 return NULL;
7976}
7977
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007978/* Deprecated. Use PyUnicode_Translate instead. */
7979PyObject *
7980PyUnicode_TranslateCharmap(const Py_UNICODE *p,
7981 Py_ssize_t size,
7982 PyObject *mapping,
7983 const char *errors)
7984{
7985 PyObject *unicode = PyUnicode_FromUnicode(p, size);
7986 if (!unicode)
7987 return NULL;
7988 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
7989}
7990
Alexander Belopolsky40018472011-02-26 01:02:56 +00007991PyObject *
7992PyUnicode_Translate(PyObject *str,
7993 PyObject *mapping,
7994 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007995{
7996 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007997
Guido van Rossumd57fd912000-03-10 22:53:23 +00007998 str = PyUnicode_FromObject(str);
7999 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008000 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008001 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008002 Py_DECREF(str);
8003 return result;
Tim Petersced69f82003-09-16 20:30:58 +00008004
Benjamin Peterson29060642009-01-31 22:14:21 +00008005 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008006 Py_XDECREF(str);
8007 return NULL;
8008}
Tim Petersced69f82003-09-16 20:30:58 +00008009
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008010static Py_UCS4
8011fix_decimal_and_space_to_ascii(PyUnicodeObject *self)
8012{
8013 /* No need to call PyUnicode_READY(self) because this function is only
8014 called as a callback from fixup() which does it already. */
8015 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8016 const int kind = PyUnicode_KIND(self);
8017 void *data = PyUnicode_DATA(self);
8018 Py_UCS4 maxchar = 0, ch, fixed;
8019 Py_ssize_t i;
8020
8021 for (i = 0; i < len; ++i) {
8022 ch = PyUnicode_READ(kind, data, i);
8023 fixed = 0;
8024 if (ch > 127) {
8025 if (Py_UNICODE_ISSPACE(ch))
8026 fixed = ' ';
8027 else {
8028 const int decimal = Py_UNICODE_TODECIMAL(ch);
8029 if (decimal >= 0)
8030 fixed = '0' + decimal;
8031 }
8032 if (fixed != 0) {
8033 if (fixed > maxchar)
8034 maxchar = fixed;
8035 PyUnicode_WRITE(kind, data, i, fixed);
8036 }
8037 else if (ch > maxchar)
8038 maxchar = ch;
8039 }
8040 else if (ch > maxchar)
8041 maxchar = ch;
8042 }
8043
8044 return maxchar;
8045}
8046
8047PyObject *
8048_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8049{
8050 if (!PyUnicode_Check(unicode)) {
8051 PyErr_BadInternalCall();
8052 return NULL;
8053 }
8054 if (PyUnicode_READY(unicode) == -1)
8055 return NULL;
8056 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8057 /* If the string is already ASCII, just return the same string */
8058 Py_INCREF(unicode);
8059 return unicode;
8060 }
8061 return fixup((PyUnicodeObject *)unicode, fix_decimal_and_space_to_ascii);
8062}
8063
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008064PyObject *
8065PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8066 Py_ssize_t length)
8067{
8068 PyObject *result;
8069 Py_UNICODE *p; /* write pointer into result */
8070 Py_ssize_t i;
8071 /* Copy to a new string */
8072 result = (PyObject *)_PyUnicode_New(length);
8073 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
8074 if (result == NULL)
8075 return result;
8076 p = PyUnicode_AS_UNICODE(result);
8077 /* Iterate over code points */
8078 for (i = 0; i < length; i++) {
8079 Py_UNICODE ch =s[i];
8080 if (ch > 127) {
8081 int decimal = Py_UNICODE_TODECIMAL(ch);
8082 if (decimal >= 0)
8083 p[i] = '0' + decimal;
8084 }
8085 }
Victor Stinner17efeed2011-10-04 20:05:46 +02008086#ifndef DONT_MAKE_RESULT_READY
8087 if (_PyUnicode_READY_REPLACE(&result)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008088 Py_DECREF(result);
8089 return NULL;
8090 }
Victor Stinner17efeed2011-10-04 20:05:46 +02008091#endif
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008092 return result;
8093}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008094/* --- Decimal Encoder ---------------------------------------------------- */
8095
Alexander Belopolsky40018472011-02-26 01:02:56 +00008096int
8097PyUnicode_EncodeDecimal(Py_UNICODE *s,
8098 Py_ssize_t length,
8099 char *output,
8100 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008101{
8102 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008103 PyObject *errorHandler = NULL;
8104 PyObject *exc = NULL;
8105 const char *encoding = "decimal";
8106 const char *reason = "invalid decimal Unicode string";
8107 /* the following variable is used for caching string comparisons
8108 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
8109 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008110
8111 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008112 PyErr_BadArgument();
8113 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008114 }
8115
8116 p = s;
8117 end = s + length;
8118 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008119 register Py_UNICODE ch = *p;
8120 int decimal;
8121 PyObject *repunicode;
8122 Py_ssize_t repsize;
8123 Py_ssize_t newpos;
8124 Py_UNICODE *uni2;
8125 Py_UNICODE *collstart;
8126 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00008127
Benjamin Peterson29060642009-01-31 22:14:21 +00008128 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008129 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00008130 ++p;
8131 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008132 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008133 decimal = Py_UNICODE_TODECIMAL(ch);
8134 if (decimal >= 0) {
8135 *output++ = '0' + decimal;
8136 ++p;
8137 continue;
8138 }
8139 if (0 < ch && ch < 256) {
8140 *output++ = (char)ch;
8141 ++p;
8142 continue;
8143 }
8144 /* All other characters are considered unencodable */
8145 collstart = p;
8146 collend = p+1;
8147 while (collend < end) {
8148 if ((0 < *collend && *collend < 256) ||
8149 !Py_UNICODE_ISSPACE(*collend) ||
8150 Py_UNICODE_TODECIMAL(*collend))
8151 break;
8152 }
8153 /* cache callback name lookup
8154 * (if not done yet, i.e. it's the first error) */
8155 if (known_errorHandler==-1) {
8156 if ((errors==NULL) || (!strcmp(errors, "strict")))
8157 known_errorHandler = 1;
8158 else if (!strcmp(errors, "replace"))
8159 known_errorHandler = 2;
8160 else if (!strcmp(errors, "ignore"))
8161 known_errorHandler = 3;
8162 else if (!strcmp(errors, "xmlcharrefreplace"))
8163 known_errorHandler = 4;
8164 else
8165 known_errorHandler = 0;
8166 }
8167 switch (known_errorHandler) {
8168 case 1: /* strict */
8169 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
8170 goto onError;
8171 case 2: /* replace */
8172 for (p = collstart; p < collend; ++p)
8173 *output++ = '?';
8174 /* fall through */
8175 case 3: /* ignore */
8176 p = collend;
8177 break;
8178 case 4: /* xmlcharrefreplace */
8179 /* generate replacement (temporarily (mis)uses p) */
8180 for (p = collstart; p < collend; ++p)
8181 output += sprintf(output, "&#%d;", (int)*p);
8182 p = collend;
8183 break;
8184 default:
8185 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
8186 encoding, reason, s, length, &exc,
8187 collstart-s, collend-s, &newpos);
8188 if (repunicode == NULL)
8189 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008190 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00008191 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008192 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
8193 Py_DECREF(repunicode);
8194 goto onError;
8195 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008196 /* generate replacement */
8197 repsize = PyUnicode_GET_SIZE(repunicode);
8198 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
8199 Py_UNICODE ch = *uni2;
8200 if (Py_UNICODE_ISSPACE(ch))
8201 *output++ = ' ';
8202 else {
8203 decimal = Py_UNICODE_TODECIMAL(ch);
8204 if (decimal >= 0)
8205 *output++ = '0' + decimal;
8206 else if (0 < ch && ch < 256)
8207 *output++ = (char)ch;
8208 else {
8209 Py_DECREF(repunicode);
8210 raise_encode_exception(&exc, encoding,
8211 s, length, collstart-s, collend-s, reason);
8212 goto onError;
8213 }
8214 }
8215 }
8216 p = s + newpos;
8217 Py_DECREF(repunicode);
8218 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00008219 }
8220 /* 0-terminate the output string */
8221 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008222 Py_XDECREF(exc);
8223 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008224 return 0;
8225
Benjamin Peterson29060642009-01-31 22:14:21 +00008226 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008227 Py_XDECREF(exc);
8228 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008229 return -1;
8230}
8231
Guido van Rossumd57fd912000-03-10 22:53:23 +00008232/* --- Helpers ------------------------------------------------------------ */
8233
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008234#include "stringlib/ucs1lib.h"
8235#include "stringlib/fastsearch.h"
8236#include "stringlib/partition.h"
8237#include "stringlib/split.h"
8238#include "stringlib/count.h"
8239#include "stringlib/find.h"
8240#include "stringlib/localeutil.h"
8241#include "stringlib/undef.h"
8242
8243#include "stringlib/ucs2lib.h"
8244#include "stringlib/fastsearch.h"
8245#include "stringlib/partition.h"
8246#include "stringlib/split.h"
8247#include "stringlib/count.h"
8248#include "stringlib/find.h"
8249#include "stringlib/localeutil.h"
8250#include "stringlib/undef.h"
8251
8252#include "stringlib/ucs4lib.h"
8253#include "stringlib/fastsearch.h"
8254#include "stringlib/partition.h"
8255#include "stringlib/split.h"
8256#include "stringlib/count.h"
8257#include "stringlib/find.h"
8258#include "stringlib/localeutil.h"
8259#include "stringlib/undef.h"
8260
8261static Py_ssize_t
8262any_find_slice(Py_ssize_t Py_LOCAL_CALLBACK(ucs1)(const Py_UCS1*, Py_ssize_t,
8263 const Py_UCS1*, Py_ssize_t,
8264 Py_ssize_t, Py_ssize_t),
8265 Py_ssize_t Py_LOCAL_CALLBACK(ucs2)(const Py_UCS2*, Py_ssize_t,
8266 const Py_UCS2*, Py_ssize_t,
8267 Py_ssize_t, Py_ssize_t),
8268 Py_ssize_t Py_LOCAL_CALLBACK(ucs4)(const Py_UCS4*, Py_ssize_t,
8269 const Py_UCS4*, Py_ssize_t,
8270 Py_ssize_t, Py_ssize_t),
8271 PyObject* s1, PyObject* s2,
8272 Py_ssize_t start,
8273 Py_ssize_t end)
8274{
8275 int kind1, kind2, kind;
8276 void *buf1, *buf2;
8277 Py_ssize_t len1, len2, result;
8278
8279 kind1 = PyUnicode_KIND(s1);
8280 kind2 = PyUnicode_KIND(s2);
8281 kind = kind1 > kind2 ? kind1 : kind2;
8282 buf1 = PyUnicode_DATA(s1);
8283 buf2 = PyUnicode_DATA(s2);
8284 if (kind1 != kind)
8285 buf1 = _PyUnicode_AsKind(s1, kind);
8286 if (!buf1)
8287 return -2;
8288 if (kind2 != kind)
8289 buf2 = _PyUnicode_AsKind(s2, kind);
8290 if (!buf2) {
8291 if (kind1 != kind) PyMem_Free(buf1);
8292 return -2;
8293 }
8294 len1 = PyUnicode_GET_LENGTH(s1);
8295 len2 = PyUnicode_GET_LENGTH(s2);
8296
8297 switch(kind) {
8298 case PyUnicode_1BYTE_KIND:
8299 result = ucs1(buf1, len1, buf2, len2, start, end);
8300 break;
8301 case PyUnicode_2BYTE_KIND:
8302 result = ucs2(buf1, len1, buf2, len2, start, end);
8303 break;
8304 case PyUnicode_4BYTE_KIND:
8305 result = ucs4(buf1, len1, buf2, len2, start, end);
8306 break;
8307 default:
8308 assert(0); result = -2;
8309 }
8310
8311 if (kind1 != kind)
8312 PyMem_Free(buf1);
8313 if (kind2 != kind)
8314 PyMem_Free(buf2);
8315
8316 return result;
8317}
8318
8319Py_ssize_t
8320_PyUnicode_InsertThousandsGrouping(int kind, void *data,
8321 Py_ssize_t n_buffer,
8322 void *digits, Py_ssize_t n_digits,
8323 Py_ssize_t min_width,
8324 const char *grouping,
8325 const char *thousands_sep)
8326{
8327 switch(kind) {
8328 case PyUnicode_1BYTE_KIND:
8329 return _PyUnicode_ucs1_InsertThousandsGrouping(
8330 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8331 min_width, grouping, thousands_sep);
8332 case PyUnicode_2BYTE_KIND:
8333 return _PyUnicode_ucs2_InsertThousandsGrouping(
8334 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
8335 min_width, grouping, thousands_sep);
8336 case PyUnicode_4BYTE_KIND:
8337 return _PyUnicode_ucs4_InsertThousandsGrouping(
8338 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
8339 min_width, grouping, thousands_sep);
8340 }
8341 assert(0);
8342 return -1;
8343}
8344
8345
Eric Smith8c663262007-08-25 02:26:07 +00008346#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00008347#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008348
Thomas Wouters477c8d52006-05-27 19:21:47 +00008349#include "stringlib/count.h"
8350#include "stringlib/find.h"
Eric Smith5807c412008-05-11 21:00:57 +00008351
/* Helper macro to fix up start/end slice values against a length len.

   end is clamped to len when larger; a negative end has len added and is
   then clamped to 0.  A negative start has len added and is then clamped
   to 0.  start and end must be modifiable lvalues; all three arguments
   are evaluated more than once.

   The body is wrapped in do { } while (0) so the macro behaves as a single
   statement (safe inside if/else), and every argument is parenthesized so
   that expression arguments keep their meaning. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00008366
Alexander Belopolsky40018472011-02-26 01:02:56 +00008367Py_ssize_t
8368PyUnicode_Count(PyObject *str,
8369 PyObject *substr,
8370 Py_ssize_t start,
8371 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008372{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008373 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008374 PyUnicodeObject* str_obj;
8375 PyUnicodeObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008376 int kind1, kind2, kind;
8377 void *buf1 = NULL, *buf2 = NULL;
8378 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00008379
Thomas Wouters477c8d52006-05-27 19:21:47 +00008380 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008381 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008382 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008383 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
Victor Stinnere9a29352011-10-01 02:14:59 +02008384 if (!sub_obj || PyUnicode_READY(sub_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008385 Py_DECREF(str_obj);
8386 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008387 }
Tim Petersced69f82003-09-16 20:30:58 +00008388
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008389 kind1 = PyUnicode_KIND(str_obj);
8390 kind2 = PyUnicode_KIND(sub_obj);
8391 kind = kind1 > kind2 ? kind1 : kind2;
8392 buf1 = PyUnicode_DATA(str_obj);
8393 if (kind1 != kind)
8394 buf1 = _PyUnicode_AsKind((PyObject*)str_obj, kind);
8395 if (!buf1)
8396 goto onError;
8397 buf2 = PyUnicode_DATA(sub_obj);
8398 if (kind2 != kind)
8399 buf2 = _PyUnicode_AsKind((PyObject*)sub_obj, kind);
8400 if (!buf2)
8401 goto onError;
8402 len1 = PyUnicode_GET_LENGTH(str_obj);
8403 len2 = PyUnicode_GET_LENGTH(sub_obj);
8404
8405 ADJUST_INDICES(start, end, len1);
8406 switch(kind) {
8407 case PyUnicode_1BYTE_KIND:
8408 result = ucs1lib_count(
8409 ((Py_UCS1*)buf1) + start, end - start,
8410 buf2, len2, PY_SSIZE_T_MAX
8411 );
8412 break;
8413 case PyUnicode_2BYTE_KIND:
8414 result = ucs2lib_count(
8415 ((Py_UCS2*)buf1) + start, end - start,
8416 buf2, len2, PY_SSIZE_T_MAX
8417 );
8418 break;
8419 case PyUnicode_4BYTE_KIND:
8420 result = ucs4lib_count(
8421 ((Py_UCS4*)buf1) + start, end - start,
8422 buf2, len2, PY_SSIZE_T_MAX
8423 );
8424 break;
8425 default:
8426 assert(0); result = 0;
8427 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00008428
8429 Py_DECREF(sub_obj);
8430 Py_DECREF(str_obj);
8431
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008432 if (kind1 != kind)
8433 PyMem_Free(buf1);
8434 if (kind2 != kind)
8435 PyMem_Free(buf2);
8436
Guido van Rossumd57fd912000-03-10 22:53:23 +00008437 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008438 onError:
8439 Py_DECREF(sub_obj);
8440 Py_DECREF(str_obj);
8441 if (kind1 != kind && buf1)
8442 PyMem_Free(buf1);
8443 if (kind2 != kind && buf2)
8444 PyMem_Free(buf2);
8445 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008446}
8447
Alexander Belopolsky40018472011-02-26 01:02:56 +00008448Py_ssize_t
8449PyUnicode_Find(PyObject *str,
8450 PyObject *sub,
8451 Py_ssize_t start,
8452 Py_ssize_t end,
8453 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008454{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008455 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008456
Guido van Rossumd57fd912000-03-10 22:53:23 +00008457 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008458 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008459 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008460 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008461 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008462 Py_DECREF(str);
8463 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008464 }
Tim Petersced69f82003-09-16 20:30:58 +00008465
Thomas Wouters477c8d52006-05-27 19:21:47 +00008466 if (direction > 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008467 result = any_find_slice(
8468 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
8469 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008470 );
8471 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008472 result = any_find_slice(
8473 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
8474 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008475 );
8476
Guido van Rossumd57fd912000-03-10 22:53:23 +00008477 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008478 Py_DECREF(sub);
8479
Guido van Rossumd57fd912000-03-10 22:53:23 +00008480 return result;
8481}
8482
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008483Py_ssize_t
8484PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
8485 Py_ssize_t start, Py_ssize_t end,
8486 int direction)
8487{
8488 char *result;
8489 int kind;
8490 if (PyUnicode_READY(str) == -1)
8491 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02008492 if (start < 0 || end < 0) {
8493 PyErr_SetString(PyExc_IndexError, "string index out of range");
8494 return -2;
8495 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008496 if (end > PyUnicode_GET_LENGTH(str))
8497 end = PyUnicode_GET_LENGTH(str);
8498 kind = PyUnicode_KIND(str);
8499 result = findchar(PyUnicode_1BYTE_DATA(str)
8500 + PyUnicode_KIND_SIZE(kind, start),
8501 kind,
8502 end-start, ch, direction);
8503 if (!result)
8504 return -1;
8505 return (result-(char*)PyUnicode_DATA(str)) >> (kind-1);
8506}
8507
Alexander Belopolsky40018472011-02-26 01:02:56 +00008508static int
8509tailmatch(PyUnicodeObject *self,
8510 PyUnicodeObject *substring,
8511 Py_ssize_t start,
8512 Py_ssize_t end,
8513 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008514{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008515 int kind_self;
8516 int kind_sub;
8517 void *data_self;
8518 void *data_sub;
8519 Py_ssize_t offset;
8520 Py_ssize_t i;
8521 Py_ssize_t end_sub;
8522
8523 if (PyUnicode_READY(self) == -1 ||
8524 PyUnicode_READY(substring) == -1)
8525 return 0;
8526
8527 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008528 return 1;
8529
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008530 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
8531 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008532 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00008533 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008534
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008535 kind_self = PyUnicode_KIND(self);
8536 data_self = PyUnicode_DATA(self);
8537 kind_sub = PyUnicode_KIND(substring);
8538 data_sub = PyUnicode_DATA(substring);
8539 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
8540
8541 if (direction > 0)
8542 offset = end;
8543 else
8544 offset = start;
8545
8546 if (PyUnicode_READ(kind_self, data_self, offset) ==
8547 PyUnicode_READ(kind_sub, data_sub, 0) &&
8548 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
8549 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
8550 /* If both are of the same kind, memcmp is sufficient */
8551 if (kind_self == kind_sub) {
8552 return ! memcmp((char *)data_self +
8553 (offset * PyUnicode_CHARACTER_SIZE(substring)),
8554 data_sub,
8555 PyUnicode_GET_LENGTH(substring) *
8556 PyUnicode_CHARACTER_SIZE(substring));
8557 }
8558 /* otherwise we have to compare each character by first accesing it */
8559 else {
8560 /* We do not need to compare 0 and len(substring)-1 because
8561 the if statement above ensured already that they are equal
8562 when we end up here. */
8563 // TODO: honor direction and do a forward or backwards search
8564 for (i = 1; i < end_sub; ++i) {
8565 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
8566 PyUnicode_READ(kind_sub, data_sub, i))
8567 return 0;
8568 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008569 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008570 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008571 }
8572
8573 return 0;
8574}
8575
Alexander Belopolsky40018472011-02-26 01:02:56 +00008576Py_ssize_t
8577PyUnicode_Tailmatch(PyObject *str,
8578 PyObject *substr,
8579 Py_ssize_t start,
8580 Py_ssize_t end,
8581 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008582{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008583 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008584
Guido van Rossumd57fd912000-03-10 22:53:23 +00008585 str = PyUnicode_FromObject(str);
8586 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008587 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008588 substr = PyUnicode_FromObject(substr);
8589 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008590 Py_DECREF(str);
8591 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008592 }
Tim Petersced69f82003-09-16 20:30:58 +00008593
Guido van Rossumd57fd912000-03-10 22:53:23 +00008594 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00008595 (PyUnicodeObject *)substr,
8596 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008597 Py_DECREF(str);
8598 Py_DECREF(substr);
8599 return result;
8600}
8601
Guido van Rossumd57fd912000-03-10 22:53:23 +00008602/* Apply fixfct filter to the Unicode object self and return a
8603 reference to the modified object */
8604
Alexander Belopolsky40018472011-02-26 01:02:56 +00008605static PyObject *
8606fixup(PyUnicodeObject *self,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008607 Py_UCS4 (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008608{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008609 PyObject *u;
8610 Py_UCS4 maxchar_old, maxchar_new = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008611
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008612 if (PyUnicode_READY(self) == -1)
8613 return NULL;
8614 maxchar_old = PyUnicode_MAX_CHAR_VALUE(self);
8615 u = PyUnicode_New(PyUnicode_GET_LENGTH(self),
8616 maxchar_old);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008617 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008618 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008619
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008620 Py_MEMCPY(PyUnicode_1BYTE_DATA(u), PyUnicode_1BYTE_DATA(self),
8621 PyUnicode_GET_LENGTH(u) * PyUnicode_CHARACTER_SIZE(u));
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008622
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008623 /* fix functions return the new maximum character in a string,
8624 if the kind of the resulting unicode object does not change,
8625 everything is fine. Otherwise we need to change the string kind
8626 and re-run the fix function. */
8627 maxchar_new = fixfct((PyUnicodeObject*)u);
8628 if (maxchar_new == 0)
8629 /* do nothing, keep maxchar_new at 0 which means no changes. */;
8630 else if (maxchar_new <= 127)
8631 maxchar_new = 127;
8632 else if (maxchar_new <= 255)
8633 maxchar_new = 255;
8634 else if (maxchar_new <= 65535)
8635 maxchar_new = 65535;
8636 else
8637 maxchar_new = 1114111; /* 0x10ffff */
8638
8639 if (!maxchar_new && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008640 /* fixfct should return TRUE if it modified the buffer. If
8641 FALSE, return a reference to the original buffer instead
8642 (to save space, not time) */
8643 Py_INCREF(self);
8644 Py_DECREF(u);
8645 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008646 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008647 else if (maxchar_new == maxchar_old) {
8648 return u;
8649 }
8650 else {
8651 /* In case the maximum character changed, we need to
8652 convert the string to the new category. */
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008653 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008654 if (v == NULL) {
8655 Py_DECREF(u);
8656 return NULL;
8657 }
8658 if (maxchar_new > maxchar_old) {
8659 /* If the maxchar increased so that the kind changed, not all
8660 characters are representable anymore and we need to fix the
8661 string again. This only happens in very few cases. */
Victor Stinner157f83f2011-09-28 21:41:31 +02008662 if (PyUnicode_CopyCharacters(v, 0,
8663 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008664 PyUnicode_GET_LENGTH(self)) < 0)
8665 {
8666 Py_DECREF(u);
8667 return NULL;
8668 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008669 maxchar_old = fixfct((PyUnicodeObject*)v);
8670 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
8671 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008672 else {
Victor Stinner157f83f2011-09-28 21:41:31 +02008673 if (PyUnicode_CopyCharacters(v, 0,
8674 u, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008675 PyUnicode_GET_LENGTH(self)) < 0)
8676 {
8677 Py_DECREF(u);
8678 return NULL;
8679 }
8680 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008681
8682 Py_DECREF(u);
8683 return v;
8684 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008685}
8686
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008687static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008688fixupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008689{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008690 /* No need to call PyUnicode_READY(self) because this function is only
8691 called as a callback from fixup() which does it already. */
8692 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8693 const int kind = PyUnicode_KIND(self);
8694 void *data = PyUnicode_DATA(self);
8695 int touched = 0;
8696 Py_UCS4 maxchar = 0;
8697 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008698
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008699 for (i = 0; i < len; ++i) {
8700 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8701 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
8702 if (up != ch) {
8703 if (up > maxchar)
8704 maxchar = up;
8705 PyUnicode_WRITE(kind, data, i, up);
8706 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008707 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008708 else if (ch > maxchar)
8709 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008710 }
8711
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008712 if (touched)
8713 return maxchar;
8714 else
8715 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008716}
8717
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008718static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008719fixlower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008720{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008721 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8722 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8723 const int kind = PyUnicode_KIND(self);
8724 void *data = PyUnicode_DATA(self);
8725 int touched = 0;
8726 Py_UCS4 maxchar = 0;
8727 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008728
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008729 for(i = 0; i < len; ++i) {
8730 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8731 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8732 if (lo != ch) {
8733 if (lo > maxchar)
8734 maxchar = lo;
8735 PyUnicode_WRITE(kind, data, i, lo);
8736 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008737 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008738 else if (ch > maxchar)
8739 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008740 }
8741
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008742 if (touched)
8743 return maxchar;
8744 else
8745 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008746}
8747
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008748static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008749fixswapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008750{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008751 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8752 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8753 const int kind = PyUnicode_KIND(self);
8754 void *data = PyUnicode_DATA(self);
8755 int touched = 0;
8756 Py_UCS4 maxchar = 0;
8757 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008758
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008759 for(i = 0; i < len; ++i) {
8760 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8761 Py_UCS4 nu = 0;
8762
8763 if (Py_UNICODE_ISUPPER(ch))
8764 nu = Py_UNICODE_TOLOWER(ch);
8765 else if (Py_UNICODE_ISLOWER(ch))
8766 nu = Py_UNICODE_TOUPPER(ch);
8767
8768 if (nu != 0) {
8769 if (nu > maxchar)
8770 maxchar = nu;
8771 PyUnicode_WRITE(kind, data, i, nu);
8772 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008773 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008774 else if (ch > maxchar)
8775 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008776 }
8777
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008778 if (touched)
8779 return maxchar;
8780 else
8781 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008782}
8783
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008784static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008785fixcapitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008786{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008787 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8788 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8789 const int kind = PyUnicode_KIND(self);
8790 void *data = PyUnicode_DATA(self);
8791 int touched = 0;
8792 Py_UCS4 maxchar = 0;
8793 Py_ssize_t i = 0;
8794 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00008795
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008796 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008797 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008798
8799 ch = PyUnicode_READ(kind, data, i);
8800 if (!Py_UNICODE_ISUPPER(ch)) {
8801 maxchar = Py_UNICODE_TOUPPER(ch);
8802 PyUnicode_WRITE(kind, data, i, maxchar);
8803 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008804 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008805 ++i;
8806 for(; i < len; ++i) {
8807 ch = PyUnicode_READ(kind, data, i);
8808 if (!Py_UNICODE_ISLOWER(ch)) {
8809 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8810 if (lo > maxchar)
8811 maxchar = lo;
8812 PyUnicode_WRITE(kind, data, i, lo);
8813 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008814 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008815 else if (ch > maxchar)
8816 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008817 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008818
8819 if (touched)
8820 return maxchar;
8821 else
8822 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008823}
8824
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008825static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008826fixtitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008827{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008828 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8829 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8830 const int kind = PyUnicode_KIND(self);
8831 void *data = PyUnicode_DATA(self);
8832 Py_UCS4 maxchar = 0;
8833 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008834 int previous_is_cased;
8835
8836 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008837 if (len == 1) {
8838 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8839 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
8840 if (ti != ch) {
8841 PyUnicode_WRITE(kind, data, i, ti);
8842 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00008843 }
8844 else
8845 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008846 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008847 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008848 for(; i < len; ++i) {
8849 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8850 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00008851
Benjamin Peterson29060642009-01-31 22:14:21 +00008852 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008853 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00008854 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008855 nu = Py_UNICODE_TOTITLE(ch);
8856
8857 if (nu > maxchar)
8858 maxchar = nu;
8859 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00008860
Benjamin Peterson29060642009-01-31 22:14:21 +00008861 if (Py_UNICODE_ISLOWER(ch) ||
8862 Py_UNICODE_ISUPPER(ch) ||
8863 Py_UNICODE_ISTITLE(ch))
8864 previous_is_cased = 1;
8865 else
8866 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008867 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008868 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008869}
8870
Tim Peters8ce9f162004-08-27 01:49:32 +00008871PyObject *
8872PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008873{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008874 PyObject *sep = NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008875 Py_ssize_t seplen = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008876 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00008877 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008878 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
8879 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00008880 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008881 Py_ssize_t sz, i, res_offset;
8882 Py_UCS4 maxchar = 0;
8883 Py_UCS4 item_maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008884
Tim Peters05eba1f2004-08-27 21:32:02 +00008885 fseq = PySequence_Fast(seq, "");
8886 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008887 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00008888 }
8889
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008890 /* NOTE: the following code can't call back into Python code,
8891 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00008892 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008893
Tim Peters05eba1f2004-08-27 21:32:02 +00008894 seqlen = PySequence_Fast_GET_SIZE(fseq);
8895 /* If empty sequence, return u"". */
8896 if (seqlen == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008897 res = PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008898 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00008899 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008900 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00008901 /* If singleton sequence with an exact Unicode, return that. */
8902 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008903 item = items[0];
8904 if (PyUnicode_CheckExact(item)) {
8905 Py_INCREF(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008906 res = item;
Benjamin Peterson29060642009-01-31 22:14:21 +00008907 goto Done;
8908 }
Tim Peters8ce9f162004-08-27 01:49:32 +00008909 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008910 else {
8911 /* Set up sep and seplen */
8912 if (separator == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008913 /* fall back to a blank space separator */
8914 sep = PyUnicode_FromOrdinal(' ');
Victor Stinnere9a29352011-10-01 02:14:59 +02008915 if (!sep)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008916 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00008917 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008918 else {
8919 if (!PyUnicode_Check(separator)) {
8920 PyErr_Format(PyExc_TypeError,
8921 "separator: expected str instance,"
8922 " %.80s found",
8923 Py_TYPE(separator)->tp_name);
8924 goto onError;
8925 }
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02008926 if (PyUnicode_READY(separator))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008927 goto onError;
8928 sep = separator;
8929 seplen = PyUnicode_GET_LENGTH(separator);
8930 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
8931 /* inc refcount to keep this code path symetric with the
8932 above case of a blank separator */
8933 Py_INCREF(sep);
Tim Peters05eba1f2004-08-27 21:32:02 +00008934 }
8935 }
8936
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008937 /* There are at least two things to join, or else we have a subclass
8938 * of str in the sequence.
8939 * Do a pre-pass to figure out the total amount of space we'll
8940 * need (sz), and see whether all argument are strings.
8941 */
8942 sz = 0;
8943 for (i = 0; i < seqlen; i++) {
8944 const Py_ssize_t old_sz = sz;
8945 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00008946 if (!PyUnicode_Check(item)) {
8947 PyErr_Format(PyExc_TypeError,
8948 "sequence item %zd: expected str instance,"
8949 " %.80s found",
8950 i, Py_TYPE(item)->tp_name);
8951 goto onError;
8952 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008953 if (PyUnicode_READY(item) == -1)
8954 goto onError;
8955 sz += PyUnicode_GET_LENGTH(item);
8956 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
8957 if (item_maxchar > maxchar)
8958 maxchar = item_maxchar;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008959 if (i != 0)
8960 sz += seplen;
8961 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
8962 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008963 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008964 goto onError;
8965 }
8966 }
Tim Petersced69f82003-09-16 20:30:58 +00008967
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008968 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008969 if (res == NULL)
8970 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00008971
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008972 /* Catenate everything. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008973 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinner9ce5a832011-10-03 23:36:02 +02008974 Py_ssize_t itemlen, copied;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008975 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00008976 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02008977 if (i && seplen != 0) {
8978 copied = PyUnicode_CopyCharacters(res, res_offset,
8979 sep, 0, seplen);
8980 if (copied < 0)
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008981 goto onError;
Victor Stinner9ce5a832011-10-03 23:36:02 +02008982#ifdef Py_DEBUG
8983 res_offset += copied;
8984#else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008985 res_offset += seplen;
Victor Stinner9ce5a832011-10-03 23:36:02 +02008986#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00008987 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02008988 itemlen = PyUnicode_GET_LENGTH(item);
8989 if (itemlen != 0) {
8990 copied = PyUnicode_CopyCharacters(res, res_offset,
8991 item, 0, itemlen);
8992 if (copied < 0)
8993 goto onError;
8994#ifdef Py_DEBUG
8995 res_offset += copied;
8996#else
8997 res_offset += itemlen;
8998#endif
8999 }
Tim Peters05eba1f2004-08-27 21:32:02 +00009000 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009001 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00009002
Benjamin Peterson29060642009-01-31 22:14:21 +00009003 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00009004 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009005 Py_XDECREF(sep);
9006 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009007
Benjamin Peterson29060642009-01-31 22:14:21 +00009008 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009009 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009010 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009011 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009012 return NULL;
9013}
9014
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009015#define FILL(kind, data, value, start, length) \
9016 do { \
9017 Py_ssize_t i_ = 0; \
9018 assert(kind != PyUnicode_WCHAR_KIND); \
9019 switch ((kind)) { \
9020 case PyUnicode_1BYTE_KIND: { \
9021 unsigned char * to_ = (unsigned char *)((data)) + (start); \
9022 memset(to_, (unsigned char)value, length); \
9023 break; \
9024 } \
9025 case PyUnicode_2BYTE_KIND: { \
9026 Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
9027 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
9028 break; \
9029 } \
9030 default: { \
9031 Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
9032 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
9033 break; \
9034 } \
9035 } \
9036 } while (0)
9037
Alexander Belopolsky40018472011-02-26 01:02:56 +00009038static PyUnicodeObject *
9039pad(PyUnicodeObject *self,
9040 Py_ssize_t left,
9041 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009042 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009043{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009044 PyObject *u;
9045 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009046 int kind;
9047 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009048
9049 if (left < 0)
9050 left = 0;
9051 if (right < 0)
9052 right = 0;
9053
Tim Peters7a29bd52001-09-12 03:03:31 +00009054 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009055 Py_INCREF(self);
9056 return self;
9057 }
9058
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009059 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9060 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009061 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9062 return NULL;
9063 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009064 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9065 if (fill > maxchar)
9066 maxchar = fill;
9067 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009068 if (!u)
9069 return NULL;
9070
9071 kind = PyUnicode_KIND(u);
9072 data = PyUnicode_DATA(u);
9073 if (left)
9074 FILL(kind, data, fill, 0, left);
9075 if (right)
9076 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinner157f83f2011-09-28 21:41:31 +02009077 if (PyUnicode_CopyCharacters(u, left,
9078 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009079 _PyUnicode_LENGTH(self)) < 0)
9080 {
9081 Py_DECREF(u);
9082 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009083 }
9084
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009085 return (PyUnicodeObject*)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009086}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009087#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00009088
Alexander Belopolsky40018472011-02-26 01:02:56 +00009089PyObject *
9090PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009091{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009092 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009093
9094 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009095 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009096 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009097
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009098 switch(PyUnicode_KIND(string)) {
9099 case PyUnicode_1BYTE_KIND:
9100 list = ucs1lib_splitlines(
9101 (PyObject*) string, PyUnicode_1BYTE_DATA(string),
9102 PyUnicode_GET_LENGTH(string), keepends);
9103 break;
9104 case PyUnicode_2BYTE_KIND:
9105 list = ucs2lib_splitlines(
9106 (PyObject*) string, PyUnicode_2BYTE_DATA(string),
9107 PyUnicode_GET_LENGTH(string), keepends);
9108 break;
9109 case PyUnicode_4BYTE_KIND:
9110 list = ucs4lib_splitlines(
9111 (PyObject*) string, PyUnicode_4BYTE_DATA(string),
9112 PyUnicode_GET_LENGTH(string), keepends);
9113 break;
9114 default:
9115 assert(0);
9116 list = 0;
9117 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009118 Py_DECREF(string);
9119 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009120}
9121
Alexander Belopolsky40018472011-02-26 01:02:56 +00009122static PyObject *
9123split(PyUnicodeObject *self,
9124 PyUnicodeObject *substring,
9125 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009126{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009127 int kind1, kind2, kind;
9128 void *buf1, *buf2;
9129 Py_ssize_t len1, len2;
9130 PyObject* out;
9131
Guido van Rossumd57fd912000-03-10 22:53:23 +00009132 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009133 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009134
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009135 if (PyUnicode_READY(self) == -1)
9136 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009137
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009138 if (substring == NULL)
9139 switch(PyUnicode_KIND(self)) {
9140 case PyUnicode_1BYTE_KIND:
9141 return ucs1lib_split_whitespace(
9142 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9143 PyUnicode_GET_LENGTH(self), maxcount
9144 );
9145 case PyUnicode_2BYTE_KIND:
9146 return ucs2lib_split_whitespace(
9147 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
9148 PyUnicode_GET_LENGTH(self), maxcount
9149 );
9150 case PyUnicode_4BYTE_KIND:
9151 return ucs4lib_split_whitespace(
9152 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
9153 PyUnicode_GET_LENGTH(self), maxcount
9154 );
9155 default:
9156 assert(0);
9157 return NULL;
9158 }
9159
9160 if (PyUnicode_READY(substring) == -1)
9161 return NULL;
9162
9163 kind1 = PyUnicode_KIND(self);
9164 kind2 = PyUnicode_KIND(substring);
9165 kind = kind1 > kind2 ? kind1 : kind2;
9166 buf1 = PyUnicode_DATA(self);
9167 buf2 = PyUnicode_DATA(substring);
9168 if (kind1 != kind)
9169 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9170 if (!buf1)
9171 return NULL;
9172 if (kind2 != kind)
9173 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9174 if (!buf2) {
9175 if (kind1 != kind) PyMem_Free(buf1);
9176 return NULL;
9177 }
9178 len1 = PyUnicode_GET_LENGTH(self);
9179 len2 = PyUnicode_GET_LENGTH(substring);
9180
9181 switch(kind) {
9182 case PyUnicode_1BYTE_KIND:
9183 out = ucs1lib_split(
9184 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9185 break;
9186 case PyUnicode_2BYTE_KIND:
9187 out = ucs2lib_split(
9188 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9189 break;
9190 case PyUnicode_4BYTE_KIND:
9191 out = ucs4lib_split(
9192 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9193 break;
9194 default:
9195 out = NULL;
9196 }
9197 if (kind1 != kind)
9198 PyMem_Free(buf1);
9199 if (kind2 != kind)
9200 PyMem_Free(buf2);
9201 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009202}
9203
Alexander Belopolsky40018472011-02-26 01:02:56 +00009204static PyObject *
9205rsplit(PyUnicodeObject *self,
9206 PyUnicodeObject *substring,
9207 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009208{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009209 int kind1, kind2, kind;
9210 void *buf1, *buf2;
9211 Py_ssize_t len1, len2;
9212 PyObject* out;
9213
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009214 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009215 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009216
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009217 if (PyUnicode_READY(self) == -1)
9218 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009219
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009220 if (substring == NULL)
9221 switch(PyUnicode_KIND(self)) {
9222 case PyUnicode_1BYTE_KIND:
9223 return ucs1lib_rsplit_whitespace(
9224 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9225 PyUnicode_GET_LENGTH(self), maxcount
9226 );
9227 case PyUnicode_2BYTE_KIND:
9228 return ucs2lib_rsplit_whitespace(
9229 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
9230 PyUnicode_GET_LENGTH(self), maxcount
9231 );
9232 case PyUnicode_4BYTE_KIND:
9233 return ucs4lib_rsplit_whitespace(
9234 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
9235 PyUnicode_GET_LENGTH(self), maxcount
9236 );
9237 default:
9238 assert(0);
9239 return NULL;
9240 }
9241
9242 if (PyUnicode_READY(substring) == -1)
9243 return NULL;
9244
9245 kind1 = PyUnicode_KIND(self);
9246 kind2 = PyUnicode_KIND(substring);
9247 kind = kind1 > kind2 ? kind1 : kind2;
9248 buf1 = PyUnicode_DATA(self);
9249 buf2 = PyUnicode_DATA(substring);
9250 if (kind1 != kind)
9251 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9252 if (!buf1)
9253 return NULL;
9254 if (kind2 != kind)
9255 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9256 if (!buf2) {
9257 if (kind1 != kind) PyMem_Free(buf1);
9258 return NULL;
9259 }
9260 len1 = PyUnicode_GET_LENGTH(self);
9261 len2 = PyUnicode_GET_LENGTH(substring);
9262
9263 switch(kind) {
9264 case PyUnicode_1BYTE_KIND:
9265 out = ucs1lib_rsplit(
9266 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9267 break;
9268 case PyUnicode_2BYTE_KIND:
9269 out = ucs2lib_rsplit(
9270 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9271 break;
9272 case PyUnicode_4BYTE_KIND:
9273 out = ucs4lib_rsplit(
9274 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9275 break;
9276 default:
9277 out = NULL;
9278 }
9279 if (kind1 != kind)
9280 PyMem_Free(buf1);
9281 if (kind2 != kind)
9282 PyMem_Free(buf2);
9283 return out;
9284}
9285
9286static Py_ssize_t
9287anylib_find(int kind, void *buf1, Py_ssize_t len1,
9288 void *buf2, Py_ssize_t len2, Py_ssize_t offset)
9289{
9290 switch(kind) {
9291 case PyUnicode_1BYTE_KIND:
9292 return ucs1lib_find(buf1, len1, buf2, len2, offset);
9293 case PyUnicode_2BYTE_KIND:
9294 return ucs2lib_find(buf1, len1, buf2, len2, offset);
9295 case PyUnicode_4BYTE_KIND:
9296 return ucs4lib_find(buf1, len1, buf2, len2, offset);
9297 }
9298 assert(0);
9299 return -1;
9300}
9301
9302static Py_ssize_t
9303anylib_count(int kind, void* sbuf, Py_ssize_t slen,
9304 void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
9305{
9306 switch(kind) {
9307 case PyUnicode_1BYTE_KIND:
9308 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
9309 case PyUnicode_2BYTE_KIND:
9310 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
9311 case PyUnicode_4BYTE_KIND:
9312 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
9313 }
9314 assert(0);
9315 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009316}
9317
Alexander Belopolsky40018472011-02-26 01:02:56 +00009318static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009319replace(PyObject *self, PyObject *str1,
9320 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009321{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009322 PyObject *u;
9323 char *sbuf = PyUnicode_DATA(self);
9324 char *buf1 = PyUnicode_DATA(str1);
9325 char *buf2 = PyUnicode_DATA(str2);
9326 int srelease = 0, release1 = 0, release2 = 0;
9327 int skind = PyUnicode_KIND(self);
9328 int kind1 = PyUnicode_KIND(str1);
9329 int kind2 = PyUnicode_KIND(str2);
9330 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
9331 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
9332 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009333
9334 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009335 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009336 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009337 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009338
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009339 if (skind < kind1)
9340 /* substring too wide to be present */
9341 goto nothing;
9342
9343 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00009344 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009345 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009346 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009347 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009348 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009349 /* replace characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009350 Py_UCS4 u1, u2, maxchar;
9351 int mayshrink, rkind;
9352 u1 = PyUnicode_READ_CHAR(str1, 0);
9353 if (!findchar(sbuf, PyUnicode_KIND(self),
9354 slen, u1, 1))
Thomas Wouters477c8d52006-05-27 19:21:47 +00009355 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009356 u2 = PyUnicode_READ_CHAR(str2, 0);
9357 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9358 /* Replacing u1 with u2 may cause a maxchar reduction in the
9359 result string. */
9360 mayshrink = maxchar > 127;
9361 if (u2 > maxchar) {
9362 maxchar = u2;
9363 mayshrink = 0;
9364 }
9365 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009366 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009367 goto error;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009368 if (PyUnicode_CopyCharacters(u, 0,
9369 (PyObject*)self, 0, slen) < 0)
9370 {
9371 Py_DECREF(u);
9372 return NULL;
9373 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009374 rkind = PyUnicode_KIND(u);
9375 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
9376 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009377 if (--maxcount < 0)
9378 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009379 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009380 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009381 if (mayshrink) {
9382 PyObject *tmp = u;
9383 u = PyUnicode_FromKindAndData(rkind, PyUnicode_DATA(tmp),
9384 PyUnicode_GET_LENGTH(tmp));
9385 Py_DECREF(tmp);
9386 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009387 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009388 int rkind = skind;
9389 char *res;
9390 if (kind1 < rkind) {
9391 /* widen substring */
9392 buf1 = _PyUnicode_AsKind(str1, rkind);
9393 if (!buf1) goto error;
9394 release1 = 1;
9395 }
9396 i = anylib_find(rkind, sbuf, slen, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009397 if (i < 0)
9398 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009399 if (rkind > kind2) {
9400 /* widen replacement */
9401 buf2 = _PyUnicode_AsKind(str2, rkind);
9402 if (!buf2) goto error;
9403 release2 = 1;
9404 }
9405 else if (rkind < kind2) {
9406 /* widen self and buf1 */
9407 rkind = kind2;
9408 if (release1) PyMem_Free(buf1);
9409 sbuf = _PyUnicode_AsKind(self, rkind);
9410 if (!sbuf) goto error;
9411 srelease = 1;
9412 buf1 = _PyUnicode_AsKind(str1, rkind);
9413 if (!buf1) goto error;
9414 release1 = 1;
9415 }
9416 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, slen));
9417 if (!res) {
9418 PyErr_NoMemory();
9419 goto error;
9420 }
9421 memcpy(res, sbuf, PyUnicode_KIND_SIZE(rkind, slen));
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009422 /* change everything in-place, starting with this one */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009423 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
9424 buf2,
9425 PyUnicode_KIND_SIZE(rkind, len2));
9426 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009427
9428 while ( --maxcount > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009429 i = anylib_find(rkind, sbuf+PyUnicode_KIND_SIZE(rkind, i),
9430 slen-i,
9431 buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009432 if (i == -1)
9433 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009434 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
9435 buf2,
9436 PyUnicode_KIND_SIZE(rkind, len2));
9437 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009438 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009439
9440 u = PyUnicode_FromKindAndData(rkind, res, slen);
9441 PyMem_Free(res);
9442 if (!u) goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009443 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009444 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009445
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009446 Py_ssize_t n, i, j, ires;
9447 Py_ssize_t product, new_size;
9448 int rkind = skind;
9449 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009450
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009451 if (kind1 < rkind) {
9452 buf1 = _PyUnicode_AsKind(str1, rkind);
9453 if (!buf1) goto error;
9454 release1 = 1;
9455 }
9456 n = anylib_count(rkind, sbuf, slen, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009457 if (n == 0)
9458 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009459 if (kind2 < rkind) {
9460 buf2 = _PyUnicode_AsKind(str2, rkind);
9461 if (!buf2) goto error;
9462 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009463 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009464 else if (kind2 > rkind) {
9465 rkind = kind2;
9466 sbuf = _PyUnicode_AsKind(self, rkind);
9467 if (!sbuf) goto error;
9468 srelease = 1;
9469 if (release1) PyMem_Free(buf1);
9470 buf1 = _PyUnicode_AsKind(str1, rkind);
9471 if (!buf1) goto error;
9472 release1 = 1;
9473 }
9474 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
9475 PyUnicode_GET_LENGTH(str1))); */
9476 product = n * (len2-len1);
9477 if ((product / (len2-len1)) != n) {
9478 PyErr_SetString(PyExc_OverflowError,
9479 "replace string is too long");
9480 goto error;
9481 }
9482 new_size = slen + product;
9483 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
9484 PyErr_SetString(PyExc_OverflowError,
9485 "replace string is too long");
9486 goto error;
9487 }
9488 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, new_size));
9489 if (!res)
9490 goto error;
9491 ires = i = 0;
9492 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009493 while (n-- > 0) {
9494 /* look for next match */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009495 j = anylib_find(rkind,
9496 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9497 slen-i, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009498 if (j == -1)
9499 break;
9500 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009501 /* copy unchanged part [i:j] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009502 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9503 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9504 PyUnicode_KIND_SIZE(rkind, j-i));
9505 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009506 }
9507 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009508 if (len2 > 0) {
9509 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9510 buf2,
9511 PyUnicode_KIND_SIZE(rkind, len2));
9512 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009513 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009514 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009515 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009516 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009517 /* copy tail [i:] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009518 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9519 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9520 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009521 } else {
9522 /* interleave */
9523 while (n > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009524 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9525 buf2,
9526 PyUnicode_KIND_SIZE(rkind, len2));
9527 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009528 if (--n <= 0)
9529 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009530 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9531 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9532 PyUnicode_KIND_SIZE(rkind, 1));
9533 ires++;
9534 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009535 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009536 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9537 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9538 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009539 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009540 u = PyUnicode_FromKindAndData(rkind, res, new_size);
Martin v. Löwis0b1d3482011-10-01 16:35:40 +02009541 PyMem_Free(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009542 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009543 if (srelease)
9544 PyMem_FREE(sbuf);
9545 if (release1)
9546 PyMem_FREE(buf1);
9547 if (release2)
9548 PyMem_FREE(buf2);
9549 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009550
Benjamin Peterson29060642009-01-31 22:14:21 +00009551 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00009552 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009553 if (srelease)
9554 PyMem_FREE(sbuf);
9555 if (release1)
9556 PyMem_FREE(buf1);
9557 if (release2)
9558 PyMem_FREE(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009559 if (PyUnicode_CheckExact(self)) {
9560 Py_INCREF(self);
9561 return (PyObject *) self;
9562 }
Victor Stinner034f6cf2011-09-30 02:26:44 +02009563 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009564 error:
9565 if (srelease && sbuf)
9566 PyMem_FREE(sbuf);
9567 if (release1 && buf1)
9568 PyMem_FREE(buf1);
9569 if (release2 && buf2)
9570 PyMem_FREE(buf2);
9571 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009572}
9573
9574/* --- Unicode Object Methods --------------------------------------------- */
9575
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009576PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009577 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009578\n\
9579Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009580characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009581
9582static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009583unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009584{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009585 return fixup(self, fixtitle);
9586}
9587
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009588PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009589 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009590\n\
9591Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00009592have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009593
9594static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009595unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009596{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009597 return fixup(self, fixcapitalize);
9598}
9599
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled (compiled out by the surrounding #if 0).
 *
 * Splits self on whitespace, replaces every word in the list by its
 * fixcapitalize'd version and joins the words with the default separator
 * of PyUnicode_Join().  Returns NULL if splitting or any fixup() fails.
 */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *capitalized = fixup(
            (PyUnicodeObject *)PyList_GET_ITEM(words, idx), fixcapitalize);

        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        /* Release the old word before the slot is overwritten. */
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
9637
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009638/* Argument converter. Coerces to a single unicode character */
9639
9640static int
9641convert_uc(PyObject *obj, void *addr)
9642{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009643 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009644 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009645
Benjamin Peterson14339b62009-01-31 16:36:08 +00009646 uniobj = PyUnicode_FromObject(obj);
9647 if (uniobj == NULL) {
9648 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009649 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009650 return 0;
9651 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009652 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009653 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009654 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009655 Py_DECREF(uniobj);
9656 return 0;
9657 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009658 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009659 Py_DECREF(uniobj);
9660 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009661}
9662
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009663PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009664 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009665\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009666Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009667done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009668
9669static PyObject *
9670unicode_center(PyUnicodeObject *self, PyObject *args)
9671{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009672 Py_ssize_t marg, left;
9673 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009674 Py_UCS4 fillchar = ' ';
9675
Victor Stinnere9a29352011-10-01 02:14:59 +02009676 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009677 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009678
Victor Stinnere9a29352011-10-01 02:14:59 +02009679 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009680 return NULL;
9681
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009682 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009683 Py_INCREF(self);
9684 return (PyObject*) self;
9685 }
9686
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009687 marg = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009688 left = marg / 2 + (marg & width & 1);
9689
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009690 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009691}
9692
Marc-André Lemburge5034372000-08-08 08:04:29 +00009693#if 0
9694
9695/* This code should go into some future Unicode collation support
9696 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00009697 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00009698
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009699/* speedy UTF-16 code point order comparison */
9700/* gleaned from: */
9701/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
9702
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009703static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009704{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009705 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00009706 0, 0, 0, 0, 0, 0, 0, 0,
9707 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009708 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009709};
9710
Guido van Rossumd57fd912000-03-10 22:53:23 +00009711static int
9712unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9713{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009714 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009715
Guido van Rossumd57fd912000-03-10 22:53:23 +00009716 Py_UNICODE *s1 = str1->str;
9717 Py_UNICODE *s2 = str2->str;
9718
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009719 len1 = str1->_base._base.length;
9720 len2 = str2->_base._base.length;
Tim Petersced69f82003-09-16 20:30:58 +00009721
Guido van Rossumd57fd912000-03-10 22:53:23 +00009722 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00009723 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009724
9725 c1 = *s1++;
9726 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00009727
Benjamin Peterson29060642009-01-31 22:14:21 +00009728 if (c1 > (1<<11) * 26)
9729 c1 += utf16Fixup[c1>>11];
9730 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009731 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009732 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00009733
9734 if (c1 != c2)
9735 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00009736
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009737 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009738 }
9739
9740 return (len1 < len2) ? -1 : (len1 != len2);
9741}
9742
Marc-André Lemburge5034372000-08-08 08:04:29 +00009743#else
9744
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009745/* This function assumes that str1 and str2 are readied by the caller. */
9746
Marc-André Lemburge5034372000-08-08 08:04:29 +00009747static int
9748unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9749{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009750 int kind1, kind2;
9751 void *data1, *data2;
9752 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009753
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009754 kind1 = PyUnicode_KIND(str1);
9755 kind2 = PyUnicode_KIND(str2);
9756 data1 = PyUnicode_DATA(str1);
9757 data2 = PyUnicode_DATA(str2);
9758 len1 = PyUnicode_GET_LENGTH(str1);
9759 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +00009760
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009761 for (i = 0; i < len1 && i < len2; ++i) {
9762 Py_UCS4 c1, c2;
9763 c1 = PyUnicode_READ(kind1, data1, i);
9764 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +00009765
9766 if (c1 != c2)
9767 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009768 }
9769
9770 return (len1 < len2) ? -1 : (len1 != len2);
9771}
9772
9773#endif
9774
Alexander Belopolsky40018472011-02-26 01:02:56 +00009775int
9776PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009777{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009778 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9779 if (PyUnicode_READY(left) == -1 ||
9780 PyUnicode_READY(right) == -1)
9781 return -1;
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009782 return unicode_compare((PyUnicodeObject *)left,
9783 (PyUnicodeObject *)right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009784 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009785 PyErr_Format(PyExc_TypeError,
9786 "Can't compare %.100s and %.100s",
9787 left->ob_type->tp_name,
9788 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009789 return -1;
9790}
9791
Martin v. Löwis5b222132007-06-10 09:51:05 +00009792int
9793PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
9794{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009795 Py_ssize_t i;
9796 int kind;
9797 void *data;
9798 Py_UCS4 chr;
9799
Victor Stinner910337b2011-10-03 03:20:16 +02009800 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009801 if (PyUnicode_READY(uni) == -1)
9802 return -1;
9803 kind = PyUnicode_KIND(uni);
9804 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +00009805 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009806 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
9807 if (chr != str[i])
9808 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00009809 /* This check keeps Python strings that end in '\0' from comparing equal
9810 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009811 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +00009812 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009813 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00009814 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009815 return 0;
9816}
9817
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009818
Benjamin Peterson29060642009-01-31 22:14:21 +00009819#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00009820 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009821
Alexander Belopolsky40018472011-02-26 01:02:56 +00009822PyObject *
9823PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009824{
9825 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009826
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009827 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9828 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009829 if (PyUnicode_READY(left) == -1 ||
9830 PyUnicode_READY(right) == -1)
9831 return NULL;
9832 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
9833 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009834 if (op == Py_EQ) {
9835 Py_INCREF(Py_False);
9836 return Py_False;
9837 }
9838 if (op == Py_NE) {
9839 Py_INCREF(Py_True);
9840 return Py_True;
9841 }
9842 }
9843 if (left == right)
9844 result = 0;
9845 else
9846 result = unicode_compare((PyUnicodeObject *)left,
9847 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009848
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009849 /* Convert the return value to a Boolean */
9850 switch (op) {
9851 case Py_EQ:
9852 v = TEST_COND(result == 0);
9853 break;
9854 case Py_NE:
9855 v = TEST_COND(result != 0);
9856 break;
9857 case Py_LE:
9858 v = TEST_COND(result <= 0);
9859 break;
9860 case Py_GE:
9861 v = TEST_COND(result >= 0);
9862 break;
9863 case Py_LT:
9864 v = TEST_COND(result == -1);
9865 break;
9866 case Py_GT:
9867 v = TEST_COND(result == 1);
9868 break;
9869 default:
9870 PyErr_BadArgument();
9871 return NULL;
9872 }
9873 Py_INCREF(v);
9874 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009875 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00009876
Brian Curtindfc80e32011-08-10 20:28:54 -05009877 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009878}
9879
Alexander Belopolsky40018472011-02-26 01:02:56 +00009880int
9881PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00009882{
Thomas Wouters477c8d52006-05-27 19:21:47 +00009883 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009884 int kind1, kind2, kind;
9885 void *buf1, *buf2;
9886 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009887 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009888
9889 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00009890 sub = PyUnicode_FromObject(element);
9891 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009892 PyErr_Format(PyExc_TypeError,
9893 "'in <string>' requires string as left operand, not %s",
9894 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009895 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009896 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009897 if (PyUnicode_READY(sub) == -1)
9898 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009899
Thomas Wouters477c8d52006-05-27 19:21:47 +00009900 str = PyUnicode_FromObject(container);
Victor Stinnere9a29352011-10-01 02:14:59 +02009901 if (!str || PyUnicode_READY(str) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009902 Py_DECREF(sub);
9903 return -1;
9904 }
9905
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009906 kind1 = PyUnicode_KIND(str);
9907 kind2 = PyUnicode_KIND(sub);
9908 kind = kind1 > kind2 ? kind1 : kind2;
9909 buf1 = PyUnicode_DATA(str);
9910 buf2 = PyUnicode_DATA(sub);
9911 if (kind1 != kind)
9912 buf1 = _PyUnicode_AsKind((PyObject*)str, kind);
9913 if (!buf1) {
9914 Py_DECREF(sub);
9915 return -1;
9916 }
9917 if (kind2 != kind)
9918 buf2 = _PyUnicode_AsKind((PyObject*)sub, kind);
9919 if (!buf2) {
9920 Py_DECREF(sub);
9921 if (kind1 != kind) PyMem_Free(buf1);
9922 return -1;
9923 }
9924 len1 = PyUnicode_GET_LENGTH(str);
9925 len2 = PyUnicode_GET_LENGTH(sub);
9926
9927 switch(kind) {
9928 case PyUnicode_1BYTE_KIND:
9929 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
9930 break;
9931 case PyUnicode_2BYTE_KIND:
9932 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
9933 break;
9934 case PyUnicode_4BYTE_KIND:
9935 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
9936 break;
9937 default:
9938 result = -1;
9939 assert(0);
9940 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009941
9942 Py_DECREF(str);
9943 Py_DECREF(sub);
9944
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009945 if (kind1 != kind)
9946 PyMem_Free(buf1);
9947 if (kind2 != kind)
9948 PyMem_Free(buf2);
9949
Guido van Rossum403d68b2000-03-13 15:55:09 +00009950 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009951}
9952
Guido van Rossumd57fd912000-03-10 22:53:23 +00009953/* Concat to string or Unicode object giving a new Unicode object. */
9954
Alexander Belopolsky40018472011-02-26 01:02:56 +00009955PyObject *
9956PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009957{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009958 PyObject *u = NULL, *v = NULL, *w;
9959 Py_UCS4 maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009960
9961 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009962 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009963 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009964 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009965 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009966 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009967 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009968
9969 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +02009970 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009971 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009972 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009973 }
Victor Stinnera464fc12011-10-02 20:39:30 +02009974 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009975 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009976 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009977 }
9978
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009979 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinnerff9e50f2011-09-28 22:17:19 +02009980 maxchar = Py_MAX(maxchar, PyUnicode_MAX_CHAR_VALUE(v));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009981
Guido van Rossumd57fd912000-03-10 22:53:23 +00009982 /* Concat the two Unicode strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009983 w = PyUnicode_New(
9984 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
9985 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009986 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009987 goto onError;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009988 if (PyUnicode_CopyCharacters(w, 0, u, 0, PyUnicode_GET_LENGTH(u)) < 0)
9989 goto onError;
Victor Stinner157f83f2011-09-28 21:41:31 +02009990 if (PyUnicode_CopyCharacters(w, PyUnicode_GET_LENGTH(u),
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009991 v, 0,
9992 PyUnicode_GET_LENGTH(v)) < 0)
9993 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009994 Py_DECREF(u);
9995 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009996 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009997
Benjamin Peterson29060642009-01-31 22:14:21 +00009998 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009999 Py_XDECREF(u);
10000 Py_XDECREF(v);
10001 return NULL;
10002}
10003
Victor Stinnerb0923652011-10-04 01:17:31 +020010004static void
10005unicode_append_inplace(PyObject **p_left, PyObject *right)
10006{
10007 Py_ssize_t left_len, right_len, new_len;
10008#ifdef Py_DEBUG
10009 Py_ssize_t copied;
10010#endif
10011
10012 assert(PyUnicode_IS_READY(*p_left));
10013 assert(PyUnicode_IS_READY(right));
10014
10015 left_len = PyUnicode_GET_LENGTH(*p_left);
10016 right_len = PyUnicode_GET_LENGTH(right);
10017 if (left_len > PY_SSIZE_T_MAX - right_len) {
10018 PyErr_SetString(PyExc_OverflowError,
10019 "strings are too large to concat");
10020 goto error;
10021 }
10022 new_len = left_len + right_len;
10023
10024 /* Now we own the last reference to 'left', so we can resize it
10025 * in-place.
10026 */
10027 if (unicode_resize(p_left, new_len) != 0) {
10028 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10029 * deallocated so it cannot be put back into
10030 * 'variable'. The MemoryError is raised when there
10031 * is no value in 'variable', which might (very
10032 * remotely) be a cause of incompatibilities.
10033 */
10034 goto error;
10035 }
10036 /* copy 'right' into the newly allocated area of 'left' */
10037#ifdef Py_DEBUG
10038 copied = PyUnicode_CopyCharacters(*p_left, left_len,
10039 right, 0,
10040 right_len);
10041 assert(0 <= copied);
10042#else
10043 PyUnicode_CopyCharacters(*p_left, left_len, right, 0, right_len);
10044#endif
10045 return;
10046
10047error:
10048 Py_DECREF(*p_left);
10049 *p_left = NULL;
10050}
10051
Walter Dörwald1ab83302007-05-18 17:15:44 +000010052void
Victor Stinner23e56682011-10-03 03:54:37 +020010053PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010054{
Victor Stinner23e56682011-10-03 03:54:37 +020010055 PyObject *left, *res;
10056
10057 if (p_left == NULL) {
10058 if (!PyErr_Occurred())
10059 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010060 return;
10061 }
Victor Stinner23e56682011-10-03 03:54:37 +020010062 left = *p_left;
10063 if (right == NULL || !PyUnicode_Check(left)) {
10064 if (!PyErr_Occurred())
10065 PyErr_BadInternalCall();
10066 goto error;
10067 }
10068
Victor Stinnere1335c72011-10-04 20:53:03 +020010069 if (PyUnicode_READY(left))
10070 goto error;
10071 if (PyUnicode_READY(right))
10072 goto error;
10073
Victor Stinner23e56682011-10-03 03:54:37 +020010074 if (PyUnicode_CheckExact(left) && left != unicode_empty
10075 && PyUnicode_CheckExact(right) && right != unicode_empty
10076 && unicode_resizable(left)
10077 && (_PyUnicode_KIND(right) <= _PyUnicode_KIND(left)
10078 || _PyUnicode_WSTR(left) != NULL))
10079 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010080 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10081 to change the structure size, but characters are stored just after
10082 the structure, and so it requires to move all charactres which is
10083 not so different than duplicating the string. */
10084 if (!(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
Victor Stinner23e56682011-10-03 03:54:37 +020010085 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010086 unicode_append_inplace(p_left, right);
Victor Stinner23e56682011-10-03 03:54:37 +020010087 return;
10088 }
10089 }
10090
10091 res = PyUnicode_Concat(left, right);
10092 if (res == NULL)
10093 goto error;
10094 Py_DECREF(left);
10095 *p_left = res;
10096 return;
10097
10098error:
10099 Py_DECREF(*p_left);
10100 *p_left = NULL;
Walter Dörwald1ab83302007-05-18 17:15:44 +000010101}
10102
10103void
10104PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10105{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010106 PyUnicode_Append(pleft, right);
10107 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010108}
10109
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010110PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010111 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010112\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010113Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010114string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010115interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010116
10117static PyObject *
10118unicode_count(PyUnicodeObject *self, PyObject *args)
10119{
10120 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010121 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010122 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010123 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010124 int kind1, kind2, kind;
10125 void *buf1, *buf2;
10126 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010127
Jesus Ceaac451502011-04-20 17:09:23 +020010128 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10129 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010130 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010131
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010132 kind1 = PyUnicode_KIND(self);
10133 kind2 = PyUnicode_KIND(substring);
10134 kind = kind1 > kind2 ? kind1 : kind2;
10135 buf1 = PyUnicode_DATA(self);
10136 buf2 = PyUnicode_DATA(substring);
10137 if (kind1 != kind)
10138 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
10139 if (!buf1) {
10140 Py_DECREF(substring);
10141 return NULL;
10142 }
10143 if (kind2 != kind)
10144 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
10145 if (!buf2) {
10146 Py_DECREF(substring);
10147 if (kind1 != kind) PyMem_Free(buf1);
10148 return NULL;
10149 }
10150 len1 = PyUnicode_GET_LENGTH(self);
10151 len2 = PyUnicode_GET_LENGTH(substring);
10152
10153 ADJUST_INDICES(start, end, len1);
10154 switch(kind) {
10155 case PyUnicode_1BYTE_KIND:
10156 iresult = ucs1lib_count(
10157 ((Py_UCS1*)buf1) + start, end - start,
10158 buf2, len2, PY_SSIZE_T_MAX
10159 );
10160 break;
10161 case PyUnicode_2BYTE_KIND:
10162 iresult = ucs2lib_count(
10163 ((Py_UCS2*)buf1) + start, end - start,
10164 buf2, len2, PY_SSIZE_T_MAX
10165 );
10166 break;
10167 case PyUnicode_4BYTE_KIND:
10168 iresult = ucs4lib_count(
10169 ((Py_UCS4*)buf1) + start, end - start,
10170 buf2, len2, PY_SSIZE_T_MAX
10171 );
10172 break;
10173 default:
10174 assert(0); iresult = 0;
10175 }
10176
10177 result = PyLong_FromSsize_t(iresult);
10178
10179 if (kind1 != kind)
10180 PyMem_Free(buf1);
10181 if (kind2 != kind)
10182 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010183
10184 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010185
Guido van Rossumd57fd912000-03-10 22:53:23 +000010186 return result;
10187}
10188
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010189PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010190 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010191\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010192Encode S using the codec registered for encoding. Default encoding\n\
10193is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010194handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010195a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10196'xmlcharrefreplace' as well as any other name registered with\n\
10197codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010198
10199static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +000010200unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010201{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010202 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010203 char *encoding = NULL;
10204 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010205
Benjamin Peterson308d6372009-09-18 21:42:35 +000010206 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10207 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010208 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +000010209 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010210}
10211
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010212PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010213 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010214\n\
10215Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010216If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010217
10218static PyObject*
10219unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
10220{
Antoine Pitroue71d5742011-10-04 15:55:09 +020010221 Py_ssize_t i, j, line_pos, src_len, incr;
10222 Py_UCS4 ch;
10223 PyObject *u;
10224 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010225 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010226 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010227 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010228
10229 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010230 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010231
Antoine Pitrou22425222011-10-04 19:10:51 +020010232 if (PyUnicode_READY(self) == -1)
10233 return NULL;
10234
Thomas Wouters7e474022000-07-16 12:04:32 +000010235 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010236 src_len = PyUnicode_GET_LENGTH(self);
10237 i = j = line_pos = 0;
10238 kind = PyUnicode_KIND(self);
10239 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020010240 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010241 for (; i < src_len; i++) {
10242 ch = PyUnicode_READ(kind, src_data, i);
10243 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020010244 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000010245 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010246 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000010247 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010248 goto overflow;
10249 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010250 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010251 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010252 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010253 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000010254 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010255 goto overflow;
10256 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010257 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010258 if (ch == '\n' || ch == '\r')
10259 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010260 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010261 }
Antoine Pitroue19aa382011-10-04 16:04:01 +020010262 if (!found && PyUnicode_CheckExact(self)) {
10263 Py_INCREF((PyObject *) self);
10264 return (PyObject *) self;
10265 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +000010266
Guido van Rossumd57fd912000-03-10 22:53:23 +000010267 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010268 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010269 if (!u)
10270 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010271 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010272
Antoine Pitroue71d5742011-10-04 15:55:09 +020010273 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010274
Antoine Pitroue71d5742011-10-04 15:55:09 +020010275 for (; i < src_len; i++) {
10276 ch = PyUnicode_READ(kind, src_data, i);
10277 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010278 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010279 incr = tabsize - (line_pos % tabsize);
10280 line_pos += incr;
10281 while (incr--) {
10282 PyUnicode_WRITE(kind, dest_data, j, ' ');
10283 j++;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010284 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010285 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010286 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010287 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010288 line_pos++;
10289 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010290 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010291 if (ch == '\n' || ch == '\r')
10292 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010293 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010294 }
10295 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinner17efeed2011-10-04 20:05:46 +020010296#ifndef DONT_MAKE_RESULT_READY
10297 if (_PyUnicode_READY_REPLACE(&u)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010298 Py_DECREF(u);
10299 return NULL;
10300 }
Victor Stinner17efeed2011-10-04 20:05:46 +020010301#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +000010302 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010303
Antoine Pitroue71d5742011-10-04 15:55:09 +020010304 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010305 PyErr_SetString(PyExc_OverflowError, "new string is too long");
10306 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010307}
10308
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010309PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010310 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010311\n\
10312Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010313such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010314arguments start and end are interpreted as in slice notation.\n\
10315\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010316Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010317
10318static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010319unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010320{
Jesus Ceaac451502011-04-20 17:09:23 +020010321 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010322 Py_ssize_t start;
10323 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010324 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010325
Jesus Ceaac451502011-04-20 17:09:23 +020010326 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
10327 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010328 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010329
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010330 if (PyUnicode_READY(self) == -1)
10331 return NULL;
10332 if (PyUnicode_READY(substring) == -1)
10333 return NULL;
10334
10335 result = any_find_slice(
10336 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
10337 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010338 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010339
10340 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010341
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010342 if (result == -2)
10343 return NULL;
10344
Christian Heimes217cfd12007-12-02 14:31:20 +000010345 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010346}
10347
10348static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010349unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010350{
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010351 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
10352 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010353 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010354 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010355}
10356
Guido van Rossumc2504932007-09-18 19:42:40 +000010357/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010010358 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000010359static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +000010360unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010361{
Guido van Rossumc2504932007-09-18 19:42:40 +000010362 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010010363 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010364
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010365 if (_PyUnicode_HASH(self) != -1)
10366 return _PyUnicode_HASH(self);
10367 if (PyUnicode_READY(self) == -1)
10368 return -1;
10369 len = PyUnicode_GET_LENGTH(self);
10370
10371 /* The hash function as a macro, gets expanded three times below. */
10372#define HASH(P) \
10373 x = (Py_uhash_t)*P << 7; \
10374 while (--len >= 0) \
10375 x = (1000003*x) ^ (Py_uhash_t)*P++;
10376
10377 switch (PyUnicode_KIND(self)) {
10378 case PyUnicode_1BYTE_KIND: {
10379 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
10380 HASH(c);
10381 break;
10382 }
10383 case PyUnicode_2BYTE_KIND: {
10384 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
10385 HASH(s);
10386 break;
10387 }
10388 default: {
10389 Py_UCS4 *l;
10390 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
10391 "Impossible switch case in unicode_hash");
10392 l = PyUnicode_4BYTE_DATA(self);
10393 HASH(l);
10394 break;
10395 }
10396 }
10397 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
10398
Guido van Rossumc2504932007-09-18 19:42:40 +000010399 if (x == -1)
10400 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010401 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010402 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010403}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010404#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000010405
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010406PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010407 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010408\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010409Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010410
10411static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010412unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010413{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010414 Py_ssize_t result;
Jesus Ceaac451502011-04-20 17:09:23 +020010415 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010416 Py_ssize_t start;
10417 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010418
Jesus Ceaac451502011-04-20 17:09:23 +020010419 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
10420 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010421 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010422
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010423 if (PyUnicode_READY(self) == -1)
10424 return NULL;
10425 if (PyUnicode_READY(substring) == -1)
10426 return NULL;
10427
10428 result = any_find_slice(
10429 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
10430 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010431 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010432
10433 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010434
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010435 if (result == -2)
10436 return NULL;
10437
Guido van Rossumd57fd912000-03-10 22:53:23 +000010438 if (result < 0) {
10439 PyErr_SetString(PyExc_ValueError, "substring not found");
10440 return NULL;
10441 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010442
Christian Heimes217cfd12007-12-02 14:31:20 +000010443 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010444}
10445
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010446PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010447 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010448\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010449Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010450at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010451
10452static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010453unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010454{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010455 Py_ssize_t i, length;
10456 int kind;
10457 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010458 int cased;
10459
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010460 if (PyUnicode_READY(self) == -1)
10461 return NULL;
10462 length = PyUnicode_GET_LENGTH(self);
10463 kind = PyUnicode_KIND(self);
10464 data = PyUnicode_DATA(self);
10465
Guido van Rossumd57fd912000-03-10 22:53:23 +000010466 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010467 if (length == 1)
10468 return PyBool_FromLong(
10469 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010470
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010471 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010472 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010473 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010474
Guido van Rossumd57fd912000-03-10 22:53:23 +000010475 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010476 for (i = 0; i < length; i++) {
10477 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010478
Benjamin Peterson29060642009-01-31 22:14:21 +000010479 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
10480 return PyBool_FromLong(0);
10481 else if (!cased && Py_UNICODE_ISLOWER(ch))
10482 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010483 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010484 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010485}
10486
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010487PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010488 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010489\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010490Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010491at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010492
10493static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010494unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010495{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010496 Py_ssize_t i, length;
10497 int kind;
10498 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010499 int cased;
10500
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010501 if (PyUnicode_READY(self) == -1)
10502 return NULL;
10503 length = PyUnicode_GET_LENGTH(self);
10504 kind = PyUnicode_KIND(self);
10505 data = PyUnicode_DATA(self);
10506
Guido van Rossumd57fd912000-03-10 22:53:23 +000010507 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010508 if (length == 1)
10509 return PyBool_FromLong(
10510 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010511
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010512 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010513 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010514 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010515
Guido van Rossumd57fd912000-03-10 22:53:23 +000010516 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010517 for (i = 0; i < length; i++) {
10518 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010519
Benjamin Peterson29060642009-01-31 22:14:21 +000010520 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
10521 return PyBool_FromLong(0);
10522 else if (!cased && Py_UNICODE_ISUPPER(ch))
10523 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010524 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010525 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010526}
10527
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010528PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010529 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010530\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010531Return True if S is a titlecased string and there is at least one\n\
10532character in S, i.e. upper- and titlecase characters may only\n\
10533follow uncased characters and lowercase characters only cased ones.\n\
10534Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010535
10536static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010537unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010538{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010539 Py_ssize_t i, length;
10540 int kind;
10541 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010542 int cased, previous_is_cased;
10543
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010544 if (PyUnicode_READY(self) == -1)
10545 return NULL;
10546 length = PyUnicode_GET_LENGTH(self);
10547 kind = PyUnicode_KIND(self);
10548 data = PyUnicode_DATA(self);
10549
Guido van Rossumd57fd912000-03-10 22:53:23 +000010550 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010551 if (length == 1) {
10552 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10553 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
10554 (Py_UNICODE_ISUPPER(ch) != 0));
10555 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010556
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010557 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010558 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010559 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010560
Guido van Rossumd57fd912000-03-10 22:53:23 +000010561 cased = 0;
10562 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010563 for (i = 0; i < length; i++) {
10564 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010565
Benjamin Peterson29060642009-01-31 22:14:21 +000010566 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
10567 if (previous_is_cased)
10568 return PyBool_FromLong(0);
10569 previous_is_cased = 1;
10570 cased = 1;
10571 }
10572 else if (Py_UNICODE_ISLOWER(ch)) {
10573 if (!previous_is_cased)
10574 return PyBool_FromLong(0);
10575 previous_is_cased = 1;
10576 cased = 1;
10577 }
10578 else
10579 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010580 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010581 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010582}
10583
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010584PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010585 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010586\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010587Return True if all characters in S are whitespace\n\
10588and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010589
10590static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010591unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010592{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010593 Py_ssize_t i, length;
10594 int kind;
10595 void *data;
10596
10597 if (PyUnicode_READY(self) == -1)
10598 return NULL;
10599 length = PyUnicode_GET_LENGTH(self);
10600 kind = PyUnicode_KIND(self);
10601 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010602
Guido van Rossumd57fd912000-03-10 22:53:23 +000010603 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010604 if (length == 1)
10605 return PyBool_FromLong(
10606 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010607
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010608 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010609 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010610 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010611
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010612 for (i = 0; i < length; i++) {
10613 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010614 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010615 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010616 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010617 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010618}
10619
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010620PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010621 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010622\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010623Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010624and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010625
10626static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010627unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010628{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010629 Py_ssize_t i, length;
10630 int kind;
10631 void *data;
10632
10633 if (PyUnicode_READY(self) == -1)
10634 return NULL;
10635 length = PyUnicode_GET_LENGTH(self);
10636 kind = PyUnicode_KIND(self);
10637 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010638
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010639 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010640 if (length == 1)
10641 return PyBool_FromLong(
10642 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010643
10644 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010645 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010646 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010647
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010648 for (i = 0; i < length; i++) {
10649 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010650 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010651 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010652 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010653}
10654
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010655PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010656 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010657\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010658Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010659and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010660
10661static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010662unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010663{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010664 int kind;
10665 void *data;
10666 Py_ssize_t len, i;
10667
10668 if (PyUnicode_READY(self) == -1)
10669 return NULL;
10670
10671 kind = PyUnicode_KIND(self);
10672 data = PyUnicode_DATA(self);
10673 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010674
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010675 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010676 if (len == 1) {
10677 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10678 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
10679 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010680
10681 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010682 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010683 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010684
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010685 for (i = 0; i < len; i++) {
10686 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010687 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010688 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010689 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010690 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010691}
10692
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010693PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010694 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010695\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010696Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010697False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010698
10699static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010700unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010701{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010702 Py_ssize_t i, length;
10703 int kind;
10704 void *data;
10705
10706 if (PyUnicode_READY(self) == -1)
10707 return NULL;
10708 length = PyUnicode_GET_LENGTH(self);
10709 kind = PyUnicode_KIND(self);
10710 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010711
Guido van Rossumd57fd912000-03-10 22:53:23 +000010712 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010713 if (length == 1)
10714 return PyBool_FromLong(
10715 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010716
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010717 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010718 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010719 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010720
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010721 for (i = 0; i < length; i++) {
10722 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010723 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010724 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010725 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010726}
10727
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010728PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010729 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010730\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010731Return True if all characters in S are digits\n\
10732and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010733
10734static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010735unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010736{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010737 Py_ssize_t i, length;
10738 int kind;
10739 void *data;
10740
10741 if (PyUnicode_READY(self) == -1)
10742 return NULL;
10743 length = PyUnicode_GET_LENGTH(self);
10744 kind = PyUnicode_KIND(self);
10745 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010746
Guido van Rossumd57fd912000-03-10 22:53:23 +000010747 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010748 if (length == 1) {
10749 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10750 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
10751 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010752
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010753 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010754 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010755 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010756
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010757 for (i = 0; i < length; i++) {
10758 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010759 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010760 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010761 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010762}
10763
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010764PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010765 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010766\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010767Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010768False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010769
10770static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010771unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010772{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010773 Py_ssize_t i, length;
10774 int kind;
10775 void *data;
10776
10777 if (PyUnicode_READY(self) == -1)
10778 return NULL;
10779 length = PyUnicode_GET_LENGTH(self);
10780 kind = PyUnicode_KIND(self);
10781 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010782
Guido van Rossumd57fd912000-03-10 22:53:23 +000010783 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010784 if (length == 1)
10785 return PyBool_FromLong(
10786 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010787
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010788 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010789 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010790 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010791
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010792 for (i = 0; i < length; i++) {
10793 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010794 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010795 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010796 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010797}
10798
Martin v. Löwis47383402007-08-15 07:32:56 +000010799int
10800PyUnicode_IsIdentifier(PyObject *self)
10801{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010802 int kind;
10803 void *data;
10804 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010805 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000010806
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010807 if (PyUnicode_READY(self) == -1) {
10808 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000010809 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010810 }
10811
10812 /* Special case for empty strings */
10813 if (PyUnicode_GET_LENGTH(self) == 0)
10814 return 0;
10815 kind = PyUnicode_KIND(self);
10816 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000010817
10818 /* PEP 3131 says that the first character must be in
10819 XID_Start and subsequent characters in XID_Continue,
10820 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000010821 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000010822 letters, digits, underscore). However, given the current
10823 definition of XID_Start and XID_Continue, it is sufficient
10824 to check just for these, except that _ must be allowed
10825 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010826 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050010827 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000010828 return 0;
10829
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040010830 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010831 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010832 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000010833 return 1;
10834}
10835
10836PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010837 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000010838\n\
10839Return True if S is a valid identifier according\n\
10840to the language definition.");
10841
10842static PyObject*
10843unicode_isidentifier(PyObject *self)
10844{
10845 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
10846}
10847
Georg Brandl559e5d72008-06-11 18:37:52 +000010848PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010849 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000010850\n\
10851Return True if all characters in S are considered\n\
10852printable in repr() or S is empty, False otherwise.");
10853
10854static PyObject*
10855unicode_isprintable(PyObject *self)
10856{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010857 Py_ssize_t i, length;
10858 int kind;
10859 void *data;
10860
10861 if (PyUnicode_READY(self) == -1)
10862 return NULL;
10863 length = PyUnicode_GET_LENGTH(self);
10864 kind = PyUnicode_KIND(self);
10865 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000010866
10867 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010868 if (length == 1)
10869 return PyBool_FromLong(
10870 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000010871
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010872 for (i = 0; i < length; i++) {
10873 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000010874 Py_RETURN_FALSE;
10875 }
10876 }
10877 Py_RETURN_TRUE;
10878}
10879
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010880PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000010881 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010882\n\
10883Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000010884iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010885
10886static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010887unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010888{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010889 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010890}
10891
Martin v. Löwis18e16552006-02-15 17:27:45 +000010892static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +000010893unicode_length(PyUnicodeObject *self)
10894{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010895 if (PyUnicode_READY(self) == -1)
10896 return -1;
10897 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010898}
10899
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010900PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010901 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010902\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000010903Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010904done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010905
10906static PyObject *
10907unicode_ljust(PyUnicodeObject *self, PyObject *args)
10908{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010909 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010910 Py_UCS4 fillchar = ' ';
10911
10912 if (PyUnicode_READY(self) == -1)
10913 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010914
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010915 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010916 return NULL;
10917
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010918 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010919 Py_INCREF(self);
10920 return (PyObject*) self;
10921 }
10922
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010923 return (PyObject*) pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010924}
10925
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010926PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010927 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010928\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010929Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010930
10931static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010932unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010933{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010934 return fixup(self, fixlower);
10935}
10936
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010937#define LEFTSTRIP 0
10938#define RIGHTSTRIP 1
10939#define BOTHSTRIP 2
10940
10941/* Arrays indexed by above */
10942static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};
10943
10944#define STRIPNAME(i) (stripformat[i]+3)
10945
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010946/* externally visible for str.strip(unicode) */
10947PyObject *
10948_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
10949{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010950 void *data;
10951 int kind;
10952 Py_ssize_t i, j, len;
10953 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010954
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010955 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
10956 return NULL;
10957
10958 kind = PyUnicode_KIND(self);
10959 data = PyUnicode_DATA(self);
10960 len = PyUnicode_GET_LENGTH(self);
10961 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
10962 PyUnicode_DATA(sepobj),
10963 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010964
Benjamin Peterson14339b62009-01-31 16:36:08 +000010965 i = 0;
10966 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010967 while (i < len &&
10968 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010969 i++;
10970 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010971 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010972
Benjamin Peterson14339b62009-01-31 16:36:08 +000010973 j = len;
10974 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010975 do {
10976 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010977 } while (j >= i &&
10978 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000010979 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010980 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010981
Victor Stinner12bab6d2011-10-01 01:53:49 +020010982 return PyUnicode_Substring((PyObject*)self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010983}
10984
10985PyObject*
10986PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
10987{
10988 unsigned char *data;
10989 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020010990 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010991
Victor Stinnerde636f32011-10-01 03:55:54 +020010992 if (PyUnicode_READY(self) == -1)
10993 return NULL;
10994
10995 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
10996
Victor Stinner12bab6d2011-10-01 01:53:49 +020010997 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010998 {
Victor Stinner12bab6d2011-10-01 01:53:49 +020010999 if (PyUnicode_CheckExact(self)) {
11000 Py_INCREF(self);
11001 return self;
11002 }
11003 else
11004 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011005 }
11006
Victor Stinner12bab6d2011-10-01 01:53:49 +020011007 length = end - start;
11008 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011009 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011010
Victor Stinnerde636f32011-10-01 03:55:54 +020011011 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011012 PyErr_SetString(PyExc_IndexError, "string index out of range");
11013 return NULL;
11014 }
11015
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011016 kind = PyUnicode_KIND(self);
11017 data = PyUnicode_1BYTE_DATA(self);
Victor Stinner034f6cf2011-09-30 02:26:44 +020011018 return PyUnicode_FromKindAndData(kind,
11019 data + PyUnicode_KIND_SIZE(kind, start),
Victor Stinner12bab6d2011-10-01 01:53:49 +020011020 length);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011021}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011022
11023static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011024do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011025{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011026 int kind;
11027 void *data;
11028 Py_ssize_t len, i, j;
11029
11030 if (PyUnicode_READY(self) == -1)
11031 return NULL;
11032
11033 kind = PyUnicode_KIND(self);
11034 data = PyUnicode_DATA(self);
11035 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011036
Benjamin Peterson14339b62009-01-31 16:36:08 +000011037 i = 0;
11038 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011039 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011040 i++;
11041 }
11042 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011043
Benjamin Peterson14339b62009-01-31 16:36:08 +000011044 j = len;
11045 if (striptype != LEFTSTRIP) {
11046 do {
11047 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011048 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011049 j++;
11050 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011051
Victor Stinner12bab6d2011-10-01 01:53:49 +020011052 return PyUnicode_Substring((PyObject*)self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011053}
11054
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011055
11056static PyObject *
11057do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
11058{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011059 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011060
Benjamin Peterson14339b62009-01-31 16:36:08 +000011061 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11062 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011063
Benjamin Peterson14339b62009-01-31 16:36:08 +000011064 if (sep != NULL && sep != Py_None) {
11065 if (PyUnicode_Check(sep))
11066 return _PyUnicode_XStrip(self, striptype, sep);
11067 else {
11068 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011069 "%s arg must be None or str",
11070 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011071 return NULL;
11072 }
11073 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011074
Benjamin Peterson14339b62009-01-31 16:36:08 +000011075 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011076}
11077
11078
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011079PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011080 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011081\n\
11082Return a copy of the string S with leading and trailing\n\
11083whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011084If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011085
11086static PyObject *
11087unicode_strip(PyUnicodeObject *self, PyObject *args)
11088{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011089 if (PyTuple_GET_SIZE(args) == 0)
11090 return do_strip(self, BOTHSTRIP); /* Common case */
11091 else
11092 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011093}
11094
11095
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011096PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011097 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011098\n\
11099Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011100If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011101
11102static PyObject *
11103unicode_lstrip(PyUnicodeObject *self, PyObject *args)
11104{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011105 if (PyTuple_GET_SIZE(args) == 0)
11106 return do_strip(self, LEFTSTRIP); /* Common case */
11107 else
11108 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011109}
11110
11111
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011112PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011113 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011114\n\
11115Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011116If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011117
11118static PyObject *
11119unicode_rstrip(PyUnicodeObject *self, PyObject *args)
11120{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011121 if (PyTuple_GET_SIZE(args) == 0)
11122 return do_strip(self, RIGHTSTRIP); /* Common case */
11123 else
11124 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011125}
11126
11127
Guido van Rossumd57fd912000-03-10 22:53:23 +000011128static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +000011129unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011130{
11131 PyUnicodeObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011132 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011133
Georg Brandl222de0f2009-04-12 12:01:50 +000011134 if (len < 1) {
11135 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020011136 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000011137 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011138
Tim Peters7a29bd52001-09-12 03:03:31 +000011139 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011140 /* no repeat, return original string */
11141 Py_INCREF(str);
11142 return (PyObject*) str;
11143 }
Tim Peters8f422462000-09-09 06:13:41 +000011144
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011145 if (PyUnicode_READY(str) == -1)
11146 return NULL;
11147
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011148 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011149 PyErr_SetString(PyExc_OverflowError,
11150 "repeated string is too long");
11151 return NULL;
11152 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011153 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011154
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011155 u = (PyUnicodeObject *)PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011156 if (!u)
11157 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011158 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011159
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011160 if (PyUnicode_GET_LENGTH(str) == 1) {
11161 const int kind = PyUnicode_KIND(str);
11162 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
11163 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011164 if (kind == PyUnicode_1BYTE_KIND)
11165 memset(to, (unsigned char)fill_char, len);
11166 else {
11167 for (n = 0; n < len; ++n)
11168 PyUnicode_WRITE(kind, to, n, fill_char);
11169 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011170 }
11171 else {
11172 /* number of characters copied this far */
11173 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
11174 const Py_ssize_t char_size = PyUnicode_CHARACTER_SIZE(str);
11175 char *to = (char *) PyUnicode_DATA(u);
11176 Py_MEMCPY(to, PyUnicode_DATA(str),
11177 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011178 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011179 n = (done <= nchars-done) ? done : nchars-done;
11180 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011181 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011182 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011183 }
11184
11185 return (PyObject*) u;
11186}
11187
Alexander Belopolsky40018472011-02-26 01:02:56 +000011188PyObject *
11189PyUnicode_Replace(PyObject *obj,
11190 PyObject *subobj,
11191 PyObject *replobj,
11192 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011193{
11194 PyObject *self;
11195 PyObject *str1;
11196 PyObject *str2;
11197 PyObject *result;
11198
11199 self = PyUnicode_FromObject(obj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011200 if (self == NULL || PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011201 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011202 str1 = PyUnicode_FromObject(subobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011203 if (str1 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011204 Py_DECREF(self);
11205 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011206 }
11207 str2 = PyUnicode_FromObject(replobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011208 if (str2 == NULL || PyUnicode_READY(str2)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011209 Py_DECREF(self);
11210 Py_DECREF(str1);
11211 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011212 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011213 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011214 Py_DECREF(self);
11215 Py_DECREF(str1);
11216 Py_DECREF(str2);
11217 return result;
11218}
11219
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011220PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011221 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011222\n\
11223Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011224old replaced by new. If the optional argument count is\n\
11225given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011226
11227static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011228unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011229{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011230 PyObject *str1;
11231 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011232 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011233 PyObject *result;
11234
Martin v. Löwis18e16552006-02-15 17:27:45 +000011235 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011236 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011237 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011238 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011239 str1 = PyUnicode_FromObject(str1);
11240 if (str1 == NULL || PyUnicode_READY(str1) == -1)
11241 return NULL;
11242 str2 = PyUnicode_FromObject(str2);
Victor Stinnere9a29352011-10-01 02:14:59 +020011243 if (str2 == NULL || PyUnicode_READY(str2) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011244 Py_DECREF(str1);
11245 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011246 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011247
11248 result = replace(self, str1, str2, maxcount);
11249
11250 Py_DECREF(str1);
11251 Py_DECREF(str2);
11252 return result;
11253}
11254
Alexander Belopolsky40018472011-02-26 01:02:56 +000011255static PyObject *
11256unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011257{
Walter Dörwald79e913e2007-05-12 11:08:06 +000011258 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011259 Py_ssize_t isize;
11260 Py_ssize_t osize, squote, dquote, i, o;
11261 Py_UCS4 max, quote;
11262 int ikind, okind;
11263 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000011264
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011265 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000011266 return NULL;
11267
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011268 isize = PyUnicode_GET_LENGTH(unicode);
11269 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011270
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011271 /* Compute length of output, quote characters, and
11272 maximum character */
11273 osize = 2; /* quotes */
11274 max = 127;
11275 squote = dquote = 0;
11276 ikind = PyUnicode_KIND(unicode);
11277 for (i = 0; i < isize; i++) {
11278 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11279 switch (ch) {
11280 case '\'': squote++; osize++; break;
11281 case '"': dquote++; osize++; break;
11282 case '\\': case '\t': case '\r': case '\n':
11283 osize += 2; break;
11284 default:
11285 /* Fast-path ASCII */
11286 if (ch < ' ' || ch == 0x7f)
11287 osize += 4; /* \xHH */
11288 else if (ch < 0x7f)
11289 osize++;
11290 else if (Py_UNICODE_ISPRINTABLE(ch)) {
11291 osize++;
11292 max = ch > max ? ch : max;
11293 }
11294 else if (ch < 0x100)
11295 osize += 4; /* \xHH */
11296 else if (ch < 0x10000)
11297 osize += 6; /* \uHHHH */
11298 else
11299 osize += 10; /* \uHHHHHHHH */
11300 }
11301 }
11302
11303 quote = '\'';
11304 if (squote) {
11305 if (dquote)
11306 /* Both squote and dquote present. Use squote,
11307 and escape them */
11308 osize += squote;
11309 else
11310 quote = '"';
11311 }
11312
11313 repr = PyUnicode_New(osize, max);
11314 if (repr == NULL)
11315 return NULL;
11316 okind = PyUnicode_KIND(repr);
11317 odata = PyUnicode_DATA(repr);
11318
11319 PyUnicode_WRITE(okind, odata, 0, quote);
11320 PyUnicode_WRITE(okind, odata, osize-1, quote);
11321
11322 for (i = 0, o = 1; i < isize; i++) {
11323 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011324
11325 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011326 if ((ch == quote) || (ch == '\\')) {
11327 PyUnicode_WRITE(okind, odata, o++, '\\');
11328 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011329 continue;
11330 }
11331
Benjamin Peterson29060642009-01-31 22:14:21 +000011332 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011333 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011334 PyUnicode_WRITE(okind, odata, o++, '\\');
11335 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011336 }
11337 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011338 PyUnicode_WRITE(okind, odata, o++, '\\');
11339 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011340 }
11341 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011342 PyUnicode_WRITE(okind, odata, o++, '\\');
11343 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011344 }
11345
11346 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011347 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011348 PyUnicode_WRITE(okind, odata, o++, '\\');
11349 PyUnicode_WRITE(okind, odata, o++, 'x');
11350 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
11351 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011352 }
11353
Georg Brandl559e5d72008-06-11 18:37:52 +000011354 /* Copy ASCII characters as-is */
11355 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011356 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011357 }
11358
Benjamin Peterson29060642009-01-31 22:14:21 +000011359 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000011360 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011361 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000011362 (categories Z* and C* except ASCII space)
11363 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011364 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011365 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011366 if (ch <= 0xff) {
11367 PyUnicode_WRITE(okind, odata, o++, '\\');
11368 PyUnicode_WRITE(okind, odata, o++, 'x');
11369 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
11370 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011371 }
11372 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011373 else if (ch >= 0x10000) {
11374 PyUnicode_WRITE(okind, odata, o++, '\\');
11375 PyUnicode_WRITE(okind, odata, o++, 'U');
11376 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 28) & 0xF]);
11377 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 24) & 0xF]);
11378 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 20) & 0xF]);
11379 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 16) & 0xF]);
11380 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
11381 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
11382 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
11383 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011384 }
11385 /* Map 16-bit characters to '\uxxxx' */
11386 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011387 PyUnicode_WRITE(okind, odata, o++, '\\');
11388 PyUnicode_WRITE(okind, odata, o++, 'u');
11389 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
11390 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
11391 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
11392 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011393 }
11394 }
11395 /* Copy characters as-is */
11396 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011397 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011398 }
11399 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000011400 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011401 /* Closing quote already added at the beginning */
Walter Dörwald79e913e2007-05-12 11:08:06 +000011402 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011403}
11404
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011405PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011406 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011407\n\
11408Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011409such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011410arguments start and end are interpreted as in slice notation.\n\
11411\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011412Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011413
11414static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011415unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011416{
Jesus Ceaac451502011-04-20 17:09:23 +020011417 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011418 Py_ssize_t start;
11419 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011420 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011421
Jesus Ceaac451502011-04-20 17:09:23 +020011422 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
11423 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011424 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011425
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011426 if (PyUnicode_READY(self) == -1)
11427 return NULL;
11428 if (PyUnicode_READY(substring) == -1)
11429 return NULL;
11430
11431 result = any_find_slice(
11432 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
11433 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011434 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011435
11436 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011437
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011438 if (result == -2)
11439 return NULL;
11440
Christian Heimes217cfd12007-12-02 14:31:20 +000011441 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011442}
11443
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011444PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011445 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011446\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011447Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011448
11449static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011450unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011451{
Jesus Ceaac451502011-04-20 17:09:23 +020011452 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011453 Py_ssize_t start;
11454 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011455 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011456
Jesus Ceaac451502011-04-20 17:09:23 +020011457 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
11458 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011459 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011460
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011461 if (PyUnicode_READY(self) == -1)
11462 return NULL;
11463 if (PyUnicode_READY(substring) == -1)
11464 return NULL;
11465
11466 result = any_find_slice(
11467 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
11468 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011469 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011470
11471 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011472
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011473 if (result == -2)
11474 return NULL;
11475
Guido van Rossumd57fd912000-03-10 22:53:23 +000011476 if (result < 0) {
11477 PyErr_SetString(PyExc_ValueError, "substring not found");
11478 return NULL;
11479 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011480
Christian Heimes217cfd12007-12-02 14:31:20 +000011481 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011482}
11483
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011484PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011485 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011486\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011487Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011488done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011489
11490static PyObject *
11491unicode_rjust(PyUnicodeObject *self, PyObject *args)
11492{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011493 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011494 Py_UCS4 fillchar = ' ';
11495
Victor Stinnere9a29352011-10-01 02:14:59 +020011496 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011497 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011498
Victor Stinnere9a29352011-10-01 02:14:59 +020011499 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011500 return NULL;
11501
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011502 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011503 Py_INCREF(self);
11504 return (PyObject*) self;
11505 }
11506
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011507 return (PyObject*) pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011508}
11509
Alexander Belopolsky40018472011-02-26 01:02:56 +000011510PyObject *
11511PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011512{
11513 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000011514
Guido van Rossumd57fd912000-03-10 22:53:23 +000011515 s = PyUnicode_FromObject(s);
11516 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011517 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011518 if (sep != NULL) {
11519 sep = PyUnicode_FromObject(sep);
11520 if (sep == NULL) {
11521 Py_DECREF(s);
11522 return NULL;
11523 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011524 }
11525
11526 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
11527
11528 Py_DECREF(s);
11529 Py_XDECREF(sep);
11530 return result;
11531}
11532
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011533PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011534 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011535\n\
11536Return a list of the words in S, using sep as the\n\
11537delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000011538splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000011539whitespace string is a separator and empty strings are\n\
11540removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011541
11542static PyObject*
11543unicode_split(PyUnicodeObject *self, PyObject *args)
11544{
11545 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011546 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011547
Martin v. Löwis18e16552006-02-15 17:27:45 +000011548 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011549 return NULL;
11550
11551 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011552 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011553 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000011554 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011555 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011556 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011557}
11558
Thomas Wouters477c8d52006-05-27 19:21:47 +000011559PyObject *
11560PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
11561{
11562 PyObject* str_obj;
11563 PyObject* sep_obj;
11564 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011565 int kind1, kind2, kind;
11566 void *buf1 = NULL, *buf2 = NULL;
11567 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011568
11569 str_obj = PyUnicode_FromObject(str_in);
Victor Stinnere9a29352011-10-01 02:14:59 +020011570 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011571 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011572 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011573 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000011574 Py_DECREF(str_obj);
11575 return NULL;
11576 }
11577
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011578 kind1 = PyUnicode_KIND(str_in);
11579 kind2 = PyUnicode_KIND(sep_obj);
11580 kind = kind1 > kind2 ? kind1 : kind2;
11581 buf1 = PyUnicode_DATA(str_in);
11582 if (kind1 != kind)
11583 buf1 = _PyUnicode_AsKind(str_in, kind);
11584 if (!buf1)
11585 goto onError;
11586 buf2 = PyUnicode_DATA(sep_obj);
11587 if (kind2 != kind)
11588 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11589 if (!buf2)
11590 goto onError;
11591 len1 = PyUnicode_GET_LENGTH(str_obj);
11592 len2 = PyUnicode_GET_LENGTH(sep_obj);
11593
11594 switch(PyUnicode_KIND(str_in)) {
11595 case PyUnicode_1BYTE_KIND:
11596 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11597 break;
11598 case PyUnicode_2BYTE_KIND:
11599 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11600 break;
11601 case PyUnicode_4BYTE_KIND:
11602 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11603 break;
11604 default:
11605 assert(0);
11606 out = 0;
11607 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011608
11609 Py_DECREF(sep_obj);
11610 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011611 if (kind1 != kind)
11612 PyMem_Free(buf1);
11613 if (kind2 != kind)
11614 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011615
11616 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011617 onError:
11618 Py_DECREF(sep_obj);
11619 Py_DECREF(str_obj);
11620 if (kind1 != kind && buf1)
11621 PyMem_Free(buf1);
11622 if (kind2 != kind && buf2)
11623 PyMem_Free(buf2);
11624 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011625}
11626
11627
11628PyObject *
11629PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
11630{
11631 PyObject* str_obj;
11632 PyObject* sep_obj;
11633 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011634 int kind1, kind2, kind;
11635 void *buf1 = NULL, *buf2 = NULL;
11636 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011637
11638 str_obj = PyUnicode_FromObject(str_in);
11639 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000011640 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011641 sep_obj = PyUnicode_FromObject(sep_in);
11642 if (!sep_obj) {
11643 Py_DECREF(str_obj);
11644 return NULL;
11645 }
11646
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011647 kind1 = PyUnicode_KIND(str_in);
11648 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020011649 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011650 buf1 = PyUnicode_DATA(str_in);
11651 if (kind1 != kind)
11652 buf1 = _PyUnicode_AsKind(str_in, kind);
11653 if (!buf1)
11654 goto onError;
11655 buf2 = PyUnicode_DATA(sep_obj);
11656 if (kind2 != kind)
11657 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11658 if (!buf2)
11659 goto onError;
11660 len1 = PyUnicode_GET_LENGTH(str_obj);
11661 len2 = PyUnicode_GET_LENGTH(sep_obj);
11662
11663 switch(PyUnicode_KIND(str_in)) {
11664 case PyUnicode_1BYTE_KIND:
11665 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11666 break;
11667 case PyUnicode_2BYTE_KIND:
11668 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11669 break;
11670 case PyUnicode_4BYTE_KIND:
11671 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11672 break;
11673 default:
11674 assert(0);
11675 out = 0;
11676 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011677
11678 Py_DECREF(sep_obj);
11679 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011680 if (kind1 != kind)
11681 PyMem_Free(buf1);
11682 if (kind2 != kind)
11683 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011684
11685 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011686 onError:
11687 Py_DECREF(sep_obj);
11688 Py_DECREF(str_obj);
11689 if (kind1 != kind && buf1)
11690 PyMem_Free(buf1);
11691 if (kind2 != kind && buf2)
11692 PyMem_Free(buf2);
11693 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011694}
11695
11696PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011697 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011698\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011699Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011700the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011701found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011702
11703static PyObject*
11704unicode_partition(PyUnicodeObject *self, PyObject *separator)
11705{
11706 return PyUnicode_Partition((PyObject *)self, separator);
11707}
11708
11709PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000011710 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011711\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011712Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011713the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011714separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011715
11716static PyObject*
11717unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
11718{
11719 return PyUnicode_RPartition((PyObject *)self, separator);
11720}
11721
Alexander Belopolsky40018472011-02-26 01:02:56 +000011722PyObject *
11723PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011724{
11725 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011726
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011727 s = PyUnicode_FromObject(s);
11728 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011729 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011730 if (sep != NULL) {
11731 sep = PyUnicode_FromObject(sep);
11732 if (sep == NULL) {
11733 Py_DECREF(s);
11734 return NULL;
11735 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011736 }
11737
11738 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
11739
11740 Py_DECREF(s);
11741 Py_XDECREF(sep);
11742 return result;
11743}
11744
11745PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011746 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011747\n\
11748Return a list of the words in S, using sep as the\n\
11749delimiter string, starting at the end of the string and\n\
11750working to the front. If maxsplit is given, at most maxsplit\n\
11751splits are done. If sep is not specified, any whitespace string\n\
11752is a separator.");
11753
11754static PyObject*
11755unicode_rsplit(PyUnicodeObject *self, PyObject *args)
11756{
11757 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011758 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011759
Martin v. Löwis18e16552006-02-15 17:27:45 +000011760 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011761 return NULL;
11762
11763 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011764 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011765 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000011766 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011767 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011768 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011769}
11770
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011771PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011772 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011773\n\
11774Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000011775Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011776is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011777
11778static PyObject*
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011779unicode_splitlines(PyUnicodeObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011780{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011781 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000011782 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011783
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011784 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
11785 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011786 return NULL;
11787
Guido van Rossum86662912000-04-11 15:38:46 +000011788 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011789}
11790
11791static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000011792PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011793{
Walter Dörwald346737f2007-05-31 10:44:43 +000011794 if (PyUnicode_CheckExact(self)) {
11795 Py_INCREF(self);
11796 return self;
11797 } else
11798 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinner034f6cf2011-09-30 02:26:44 +020011799 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011800}
11801
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011802PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011803 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011804\n\
11805Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011806and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011807
11808static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011809unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011810{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011811 return fixup(self, fixswapcase);
11812}
11813
Georg Brandlceee0772007-11-27 23:48:05 +000011814PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011815 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011816\n\
11817Return a translation table usable for str.translate().\n\
11818If there is only one argument, it must be a dictionary mapping Unicode\n\
11819ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011820Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011821If there are two arguments, they must be strings of equal length, and\n\
11822in the resulting dictionary, each character in x will be mapped to the\n\
11823character at the same position in y. If there is a third argument, it\n\
11824must be a string, whose characters will be mapped to None in the result.");
11825
11826static PyObject*
11827unicode_maketrans(PyUnicodeObject *null, PyObject *args)
11828{
11829 PyObject *x, *y = NULL, *z = NULL;
11830 PyObject *new = NULL, *key, *value;
11831 Py_ssize_t i = 0;
11832 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011833
Georg Brandlceee0772007-11-27 23:48:05 +000011834 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
11835 return NULL;
11836 new = PyDict_New();
11837 if (!new)
11838 return NULL;
11839 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011840 int x_kind, y_kind, z_kind;
11841 void *x_data, *y_data, *z_data;
11842
Georg Brandlceee0772007-11-27 23:48:05 +000011843 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000011844 if (!PyUnicode_Check(x)) {
11845 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
11846 "be a string if there is a second argument");
11847 goto err;
11848 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011849 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011850 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
11851 "arguments must have equal length");
11852 goto err;
11853 }
11854 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011855 x_kind = PyUnicode_KIND(x);
11856 y_kind = PyUnicode_KIND(y);
11857 x_data = PyUnicode_DATA(x);
11858 y_data = PyUnicode_DATA(y);
11859 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
11860 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
11861 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011862 if (!key || !value)
11863 goto err;
11864 res = PyDict_SetItem(new, key, value);
11865 Py_DECREF(key);
11866 Py_DECREF(value);
11867 if (res < 0)
11868 goto err;
11869 }
11870 /* create entries for deleting chars in z */
11871 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011872 z_kind = PyUnicode_KIND(z);
11873 z_data = PyUnicode_DATA(z);
Georg Brandlceee0772007-11-27 23:48:05 +000011874 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011875 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011876 if (!key)
11877 goto err;
11878 res = PyDict_SetItem(new, key, Py_None);
11879 Py_DECREF(key);
11880 if (res < 0)
11881 goto err;
11882 }
11883 }
11884 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011885 int kind;
11886 void *data;
11887
Georg Brandlceee0772007-11-27 23:48:05 +000011888 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000011889 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011890 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
11891 "to maketrans it must be a dict");
11892 goto err;
11893 }
11894 /* copy entries into the new dict, converting string keys to int keys */
11895 while (PyDict_Next(x, &i, &key, &value)) {
11896 if (PyUnicode_Check(key)) {
11897 /* convert string keys to integer keys */
11898 PyObject *newkey;
11899 if (PyUnicode_GET_SIZE(key) != 1) {
11900 PyErr_SetString(PyExc_ValueError, "string keys in translate "
11901 "table must be of length 1");
11902 goto err;
11903 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011904 kind = PyUnicode_KIND(key);
11905 data = PyUnicode_DATA(key);
11906 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000011907 if (!newkey)
11908 goto err;
11909 res = PyDict_SetItem(new, newkey, value);
11910 Py_DECREF(newkey);
11911 if (res < 0)
11912 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000011913 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011914 /* just keep integer keys */
11915 if (PyDict_SetItem(new, key, value) < 0)
11916 goto err;
11917 } else {
11918 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
11919 "be strings or integers");
11920 goto err;
11921 }
11922 }
11923 }
11924 return new;
11925 err:
11926 Py_DECREF(new);
11927 return NULL;
11928}
11929
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011930PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011931 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011932\n\
11933Return a copy of the string S, where all characters have been mapped\n\
11934through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011935Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000011936Unmapped characters are left untouched. Characters mapped to None\n\
11937are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011938
11939static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011940unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011941{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011942 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011943}
11944
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011945PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011946 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011947\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011948Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011949
11950static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011951unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011952{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011953 return fixup(self, fixupper);
11954}
11955
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011956PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011957 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011958\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000011959Pad a numeric string S with zeros on the left, to fill a field\n\
11960of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011961
11962static PyObject *
11963unicode_zfill(PyUnicodeObject *self, PyObject *args)
11964{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011965 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011966 PyUnicodeObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011967 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011968 int kind;
11969 void *data;
11970 Py_UCS4 chr;
11971
11972 if (PyUnicode_READY(self) == -1)
11973 return NULL;
11974
Martin v. Löwis18e16552006-02-15 17:27:45 +000011975 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011976 return NULL;
11977
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011978 if (PyUnicode_GET_LENGTH(self) >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +000011979 if (PyUnicode_CheckExact(self)) {
11980 Py_INCREF(self);
11981 return (PyObject*) self;
11982 }
11983 else
Victor Stinner2219e0a2011-10-01 01:16:59 +020011984 return PyUnicode_Copy((PyObject*)self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011985 }
11986
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011987 fill = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011988
11989 u = pad(self, fill, 0, '0');
11990
Walter Dörwald068325e2002-04-15 13:36:47 +000011991 if (u == NULL)
11992 return NULL;
11993
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011994 kind = PyUnicode_KIND(u);
11995 data = PyUnicode_DATA(u);
11996 chr = PyUnicode_READ(kind, data, fill);
11997
11998 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011999 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012000 PyUnicode_WRITE(kind, data, 0, chr);
12001 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012002 }
12003
12004 return (PyObject*) u;
12005}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012006
#if 0
/* Compiled out by the enclosing #if 0.  Wrapper that forwards self to
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    return PyUnicode_TransformDecimalAndSpaceToASCII(self);
}
#endif
12014
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012015PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012016 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012017\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012018Return True if S starts with the specified prefix, False otherwise.\n\
12019With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012020With optional end, stop comparing S at that position.\n\
12021prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012022
12023static PyObject *
12024unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012025 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012026{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012027 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012028 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012029 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012030 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012031 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012032
Jesus Ceaac451502011-04-20 17:09:23 +020012033 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012034 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012035 if (PyTuple_Check(subobj)) {
12036 Py_ssize_t i;
12037 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12038 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012039 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012040 if (substring == NULL)
12041 return NULL;
12042 result = tailmatch(self, substring, start, end, -1);
12043 Py_DECREF(substring);
12044 if (result) {
12045 Py_RETURN_TRUE;
12046 }
12047 }
12048 /* nothing matched */
12049 Py_RETURN_FALSE;
12050 }
12051 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012052 if (substring == NULL) {
12053 if (PyErr_ExceptionMatches(PyExc_TypeError))
12054 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12055 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012056 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012057 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012058 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012059 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012060 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012061}
12062
12063
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012064PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012065 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012066\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012067Return True if S ends with the specified suffix, False otherwise.\n\
12068With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012069With optional end, stop comparing S at that position.\n\
12070suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012071
12072static PyObject *
12073unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012074 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012075{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012076 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012077 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012078 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012079 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012080 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012081
Jesus Ceaac451502011-04-20 17:09:23 +020012082 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012083 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012084 if (PyTuple_Check(subobj)) {
12085 Py_ssize_t i;
12086 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12087 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012088 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012089 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012090 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012091 result = tailmatch(self, substring, start, end, +1);
12092 Py_DECREF(substring);
12093 if (result) {
12094 Py_RETURN_TRUE;
12095 }
12096 }
12097 Py_RETURN_FALSE;
12098 }
12099 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012100 if (substring == NULL) {
12101 if (PyErr_ExceptionMatches(PyExc_TypeError))
12102 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12103 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012104 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012105 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012106 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012107 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012108 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012109}
12110
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012111#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000012112
12113PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012114 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012115\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012116Return a formatted version of S, using substitutions from args and kwargs.\n\
12117The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000012118
Eric Smith27bbca62010-11-04 17:06:58 +000012119PyDoc_STRVAR(format_map__doc__,
12120 "S.format_map(mapping) -> str\n\
12121\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012122Return a formatted version of S, using substitutions from mapping.\n\
12123The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000012124
Eric Smith4a7d76d2008-05-30 18:10:19 +000012125static PyObject *
12126unicode__format__(PyObject* self, PyObject* args)
12127{
12128 PyObject *format_spec;
12129
12130 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12131 return NULL;
12132
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012133 return _PyUnicode_FormatAdvanced(self, format_spec, 0,
12134 PyUnicode_GET_LENGTH(format_spec));
Eric Smith4a7d76d2008-05-30 18:10:19 +000012135}
12136
Eric Smith8c663262007-08-25 02:26:07 +000012137PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012138 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012139\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012140Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000012141
12142static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012143unicode__sizeof__(PyUnicodeObject *v)
12144{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012145 Py_ssize_t size;
12146
12147 /* If it's a compact object, account for base structure +
12148 character data. */
12149 if (PyUnicode_IS_COMPACT_ASCII(v))
12150 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12151 else if (PyUnicode_IS_COMPACT(v))
12152 size = sizeof(PyCompactUnicodeObject) +
12153 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_CHARACTER_SIZE(v);
12154 else {
12155 /* If it is a two-block object, account for base object, and
12156 for character block if present. */
12157 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020012158 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012159 size += (PyUnicode_GET_LENGTH(v) + 1) *
12160 PyUnicode_CHARACTER_SIZE(v);
12161 }
12162 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020012163 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020012164 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012165 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020012166 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020012167 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012168
12169 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012170}
12171
12172PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012173 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012174
12175static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020012176unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012177{
Victor Stinner034f6cf2011-09-30 02:26:44 +020012178 PyObject *copy = PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012179 if (!copy)
12180 return NULL;
12181 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012182}
12183
Guido van Rossumd57fd912000-03-10 22:53:23 +000012184static PyMethodDef unicode_methods[] = {
12185
12186 /* Order is according to common usage: often used methods should
12187 appear first, since lookup is done sequentially. */
12188
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000012189 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012190 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
12191 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012192 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012193 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
12194 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
12195 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
12196 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
12197 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
12198 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
12199 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012200 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012201 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
12202 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
12203 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012204 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012205 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
12206 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
12207 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012208 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012209 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012210 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012211 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012212 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
12213 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
12214 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
12215 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
12216 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
12217 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
12218 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
12219 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
12220 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
12221 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
12222 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
12223 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
12224 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
12225 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000012226 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000012227 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012228 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000012229 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000012230 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000012231 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000012232 {"maketrans", (PyCFunction) unicode_maketrans,
12233 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012234 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000012235#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012236 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012237#endif
12238
12239#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012240 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012241 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012242#endif
12243
Benjamin Peterson14339b62009-01-31 16:36:08 +000012244 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012245 {NULL, NULL}
12246};
12247
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012248static PyObject *
12249unicode_mod(PyObject *v, PyObject *w)
12250{
Brian Curtindfc80e32011-08-10 20:28:54 -050012251 if (!PyUnicode_Check(v))
12252 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000012253 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012254}
12255
12256static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012257 0, /*nb_add*/
12258 0, /*nb_subtract*/
12259 0, /*nb_multiply*/
12260 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012261};
12262
Guido van Rossumd57fd912000-03-10 22:53:23 +000012263static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012264 (lenfunc) unicode_length, /* sq_length */
12265 PyUnicode_Concat, /* sq_concat */
12266 (ssizeargfunc) unicode_repeat, /* sq_repeat */
12267 (ssizeargfunc) unicode_getitem, /* sq_item */
12268 0, /* sq_slice */
12269 0, /* sq_ass_item */
12270 0, /* sq_ass_slice */
12271 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012272};
12273
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012274static PyObject*
12275unicode_subscript(PyUnicodeObject* self, PyObject* item)
12276{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012277 if (PyUnicode_READY(self) == -1)
12278 return NULL;
12279
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000012280 if (PyIndex_Check(item)) {
12281 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012282 if (i == -1 && PyErr_Occurred())
12283 return NULL;
12284 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012285 i += PyUnicode_GET_LENGTH(self);
Victor Stinner2fe5ced2011-10-02 00:25:40 +020012286 return unicode_getitem((PyObject*)self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012287 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000012288 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012289 PyObject *result;
12290 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012291 int src_kind, dest_kind;
12292 Py_UCS4 ch, max_char;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012293
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012294 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000012295 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012296 return NULL;
12297 }
12298
12299 if (slicelength <= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012300 return PyUnicode_New(0, 0);
12301 } else if (start == 0 && step == 1 &&
12302 slicelength == PyUnicode_GET_LENGTH(self) &&
Thomas Woutersed03b412007-08-28 21:37:11 +000012303 PyUnicode_CheckExact(self)) {
12304 Py_INCREF(self);
12305 return (PyObject *)self;
12306 } else if (step == 1) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012307 return PyUnicode_Substring((PyObject*)self,
12308 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012309 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012310 /* General case */
12311 max_char = 127;
12312 src_kind = PyUnicode_KIND(self);
12313 src_data = PyUnicode_DATA(self);
12314 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
12315 ch = PyUnicode_READ(src_kind, src_data, cur);
12316 if (ch > max_char)
12317 max_char = ch;
12318 }
12319 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012320 if (result == NULL)
12321 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012322 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012323 dest_data = PyUnicode_DATA(result);
12324
12325 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012326 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
12327 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012328 }
12329 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012330 } else {
12331 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
12332 return NULL;
12333 }
12334}
12335
12336static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012337 (lenfunc)unicode_length, /* mp_length */
12338 (binaryfunc)unicode_subscript, /* mp_subscript */
12339 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012340};
12341
Guido van Rossumd57fd912000-03-10 22:53:23 +000012342
Guido van Rossumd57fd912000-03-10 22:53:23 +000012343/* Helpers for PyUnicode_Format() */
12344
12345static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000012346getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012347{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012348 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012349 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012350 (*p_argidx)++;
12351 if (arglen < 0)
12352 return args;
12353 else
12354 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012355 }
12356 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012357 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012358 return NULL;
12359}
12360
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012361/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012362
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012363static PyObject *
12364formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012365{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012366 char *p;
12367 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012368 double x;
Tim Petersced69f82003-09-16 20:30:58 +000012369
Guido van Rossumd57fd912000-03-10 22:53:23 +000012370 x = PyFloat_AsDouble(v);
12371 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012372 return NULL;
12373
Guido van Rossumd57fd912000-03-10 22:53:23 +000012374 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012375 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000012376
Eric Smith0923d1d2009-04-16 20:16:10 +000012377 p = PyOS_double_to_string(x, type, prec,
12378 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012379 if (p == NULL)
12380 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012381 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000012382 PyMem_Free(p);
12383 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012384}
12385
Tim Peters38fd5b62000-09-21 05:43:11 +000012386static PyObject*
12387formatlong(PyObject *val, int flags, int prec, int type)
12388{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012389 char *buf;
12390 int len;
12391 PyObject *str; /* temporary string object. */
12392 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000012393
Benjamin Peterson14339b62009-01-31 16:36:08 +000012394 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
12395 if (!str)
12396 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012397 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012398 Py_DECREF(str);
12399 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000012400}
12401
Guido van Rossumd57fd912000-03-10 22:53:23 +000012402static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012403formatchar(Py_UCS4 *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012404 size_t buflen,
12405 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012406{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000012407 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012408 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012409 if (PyUnicode_GET_LENGTH(v) == 1) {
12410 buf[0] = PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000012411 buf[1] = '\0';
12412 return 1;
12413 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012414 goto onError;
12415 }
12416 else {
12417 /* Integer input truncated to a character */
12418 long x;
12419 x = PyLong_AsLong(v);
12420 if (x == -1 && PyErr_Occurred())
12421 goto onError;
12422
12423 if (x < 0 || x > 0x10ffff) {
12424 PyErr_SetString(PyExc_OverflowError,
12425 "%c arg not in range(0x110000)");
12426 return -1;
12427 }
12428
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012429 buf[0] = (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012430 buf[1] = '\0';
12431 return 1;
12432 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000012433
Benjamin Peterson29060642009-01-31 22:14:21 +000012434 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012435 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012436 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012437 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012438}
12439
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length, in characters, of the small buffer in
   which single characters are formatted.  The expansion is fully
   parenthesized so that it behaves as one operand wherever it is used
   (for example `sizeof FORMATBUFLEN`).
*/
#define FORMATBUFLEN ((size_t)10)
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012444
Alexander Belopolsky40018472011-02-26 01:02:56 +000012445PyObject *
12446PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012447{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012448 void *fmt;
12449 int fmtkind;
12450 PyObject *result;
12451 Py_UCS4 *res, *res0;
12452 Py_UCS4 max;
12453 int kind;
12454 Py_ssize_t fmtcnt, fmtpos, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012455 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012456 PyObject *dict = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012457 PyUnicodeObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +000012458
Guido van Rossumd57fd912000-03-10 22:53:23 +000012459 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012460 PyErr_BadInternalCall();
12461 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012462 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012463 uformat = (PyUnicodeObject*)PyUnicode_FromObject(format);
12464 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012465 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012466 fmt = PyUnicode_DATA(uformat);
12467 fmtkind = PyUnicode_KIND(uformat);
12468 fmtcnt = PyUnicode_GET_LENGTH(uformat);
12469 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012470
12471 reslen = rescnt = fmtcnt + 100;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012472 res = res0 = PyMem_Malloc(reslen * sizeof(Py_UCS4));
12473 if (res0 == NULL) {
12474 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012475 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012476 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012477
12478 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012479 arglen = PyTuple_Size(args);
12480 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012481 }
12482 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012483 arglen = -1;
12484 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012485 }
Christian Heimes90aa7642007-12-19 02:45:37 +000012486 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000012487 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000012488 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012489
12490 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012491 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012492 if (--rescnt < 0) {
12493 rescnt = fmtcnt + 100;
12494 reslen += rescnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012495 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12496 if (res0 == NULL){
12497 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012498 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012499 }
12500 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000012501 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012502 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012503 *res++ = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012504 }
12505 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012506 /* Got a format specifier */
12507 int flags = 0;
12508 Py_ssize_t width = -1;
12509 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012510 Py_UCS4 c = '\0';
12511 Py_UCS4 fill;
Benjamin Peterson29060642009-01-31 22:14:21 +000012512 int isnumok;
12513 PyObject *v = NULL;
12514 PyObject *temp = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012515 void *pbuf;
12516 Py_ssize_t pindex;
Benjamin Peterson29060642009-01-31 22:14:21 +000012517 Py_UNICODE sign;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012518 Py_ssize_t len, len1;
12519 Py_UCS4 formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012520
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012521 fmtpos++;
12522 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
12523 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000012524 Py_ssize_t keylen;
12525 PyObject *key;
12526 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000012527
Benjamin Peterson29060642009-01-31 22:14:21 +000012528 if (dict == NULL) {
12529 PyErr_SetString(PyExc_TypeError,
12530 "format requires a mapping");
12531 goto onError;
12532 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012533 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012534 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012535 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012536 /* Skip over balanced parentheses */
12537 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012538 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000012539 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012540 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000012541 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012542 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000012543 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012544 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000012545 if (fmtcnt < 0 || pcount > 0) {
12546 PyErr_SetString(PyExc_ValueError,
12547 "incomplete format key");
12548 goto onError;
12549 }
Victor Stinner12bab6d2011-10-01 01:53:49 +020012550 key = PyUnicode_Substring((PyObject*)uformat,
12551 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000012552 if (key == NULL)
12553 goto onError;
12554 if (args_owned) {
12555 Py_DECREF(args);
12556 args_owned = 0;
12557 }
12558 args = PyObject_GetItem(dict, key);
12559 Py_DECREF(key);
12560 if (args == NULL) {
12561 goto onError;
12562 }
12563 args_owned = 1;
12564 arglen = -1;
12565 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012566 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012567 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012568 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012569 case '-': flags |= F_LJUST; continue;
12570 case '+': flags |= F_SIGN; continue;
12571 case ' ': flags |= F_BLANK; continue;
12572 case '#': flags |= F_ALT; continue;
12573 case '0': flags |= F_ZERO; continue;
12574 }
12575 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012576 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012577 if (c == '*') {
12578 v = getnextarg(args, arglen, &argidx);
12579 if (v == NULL)
12580 goto onError;
12581 if (!PyLong_Check(v)) {
12582 PyErr_SetString(PyExc_TypeError,
12583 "* wants int");
12584 goto onError;
12585 }
12586 width = PyLong_AsLong(v);
12587 if (width == -1 && PyErr_Occurred())
12588 goto onError;
12589 if (width < 0) {
12590 flags |= F_LJUST;
12591 width = -width;
12592 }
12593 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012594 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012595 }
12596 else if (c >= '0' && c <= '9') {
12597 width = c - '0';
12598 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012599 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012600 if (c < '0' || c > '9')
12601 break;
12602 if ((width*10) / 10 != width) {
12603 PyErr_SetString(PyExc_ValueError,
12604 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000012605 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000012606 }
12607 width = width*10 + (c - '0');
12608 }
12609 }
12610 if (c == '.') {
12611 prec = 0;
12612 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012613 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012614 if (c == '*') {
12615 v = getnextarg(args, arglen, &argidx);
12616 if (v == NULL)
12617 goto onError;
12618 if (!PyLong_Check(v)) {
12619 PyErr_SetString(PyExc_TypeError,
12620 "* wants int");
12621 goto onError;
12622 }
12623 prec = PyLong_AsLong(v);
12624 if (prec == -1 && PyErr_Occurred())
12625 goto onError;
12626 if (prec < 0)
12627 prec = 0;
12628 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012629 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012630 }
12631 else if (c >= '0' && c <= '9') {
12632 prec = c - '0';
12633 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012634 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012635 if (c < '0' || c > '9')
12636 break;
12637 if ((prec*10) / 10 != prec) {
12638 PyErr_SetString(PyExc_ValueError,
12639 "prec too big");
12640 goto onError;
12641 }
12642 prec = prec*10 + (c - '0');
12643 }
12644 }
12645 } /* prec */
12646 if (fmtcnt >= 0) {
12647 if (c == 'h' || c == 'l' || c == 'L') {
12648 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012649 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012650 }
12651 }
12652 if (fmtcnt < 0) {
12653 PyErr_SetString(PyExc_ValueError,
12654 "incomplete format");
12655 goto onError;
12656 }
12657 if (c != '%') {
12658 v = getnextarg(args, arglen, &argidx);
12659 if (v == NULL)
12660 goto onError;
12661 }
12662 sign = 0;
12663 fill = ' ';
12664 switch (c) {
12665
12666 case '%':
12667 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012668 kind = PyUnicode_4BYTE_KIND;
Benjamin Peterson29060642009-01-31 22:14:21 +000012669 /* presume that buffer length is at least 1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012670 PyUnicode_WRITE(kind, pbuf, 0, '%');
Benjamin Peterson29060642009-01-31 22:14:21 +000012671 len = 1;
12672 break;
12673
12674 case 's':
12675 case 'r':
12676 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000012677 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012678 temp = v;
12679 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012680 }
12681 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012682 if (c == 's')
12683 temp = PyObject_Str(v);
12684 else if (c == 'r')
12685 temp = PyObject_Repr(v);
12686 else
12687 temp = PyObject_ASCII(v);
12688 if (temp == NULL)
12689 goto onError;
12690 if (PyUnicode_Check(temp))
12691 /* nothing to do */;
12692 else {
12693 Py_DECREF(temp);
12694 PyErr_SetString(PyExc_TypeError,
12695 "%s argument has non-string str()");
12696 goto onError;
12697 }
12698 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012699 if (PyUnicode_READY(temp) == -1) {
12700 Py_CLEAR(temp);
12701 goto onError;
12702 }
12703 pbuf = PyUnicode_DATA(temp);
12704 kind = PyUnicode_KIND(temp);
12705 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012706 if (prec >= 0 && len > prec)
12707 len = prec;
12708 break;
12709
12710 case 'i':
12711 case 'd':
12712 case 'u':
12713 case 'o':
12714 case 'x':
12715 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000012716 isnumok = 0;
12717 if (PyNumber_Check(v)) {
12718 PyObject *iobj=NULL;
12719
12720 if (PyLong_Check(v)) {
12721 iobj = v;
12722 Py_INCREF(iobj);
12723 }
12724 else {
12725 iobj = PyNumber_Long(v);
12726 }
12727 if (iobj!=NULL) {
12728 if (PyLong_Check(iobj)) {
12729 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070012730 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000012731 Py_DECREF(iobj);
12732 if (!temp)
12733 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012734 if (PyUnicode_READY(temp) == -1) {
12735 Py_CLEAR(temp);
12736 goto onError;
12737 }
12738 pbuf = PyUnicode_DATA(temp);
12739 kind = PyUnicode_KIND(temp);
12740 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012741 sign = 1;
12742 }
12743 else {
12744 Py_DECREF(iobj);
12745 }
12746 }
12747 }
12748 if (!isnumok) {
12749 PyErr_Format(PyExc_TypeError,
12750 "%%%c format: a number is required, "
12751 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
12752 goto onError;
12753 }
12754 if (flags & F_ZERO)
12755 fill = '0';
12756 break;
12757
12758 case 'e':
12759 case 'E':
12760 case 'f':
12761 case 'F':
12762 case 'g':
12763 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012764 temp = formatfloat(v, flags, prec, c);
12765 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000012766 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012767 if (PyUnicode_READY(temp) == -1) {
12768 Py_CLEAR(temp);
12769 goto onError;
12770 }
12771 pbuf = PyUnicode_DATA(temp);
12772 kind = PyUnicode_KIND(temp);
12773 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012774 sign = 1;
12775 if (flags & F_ZERO)
12776 fill = '0';
12777 break;
12778
12779 case 'c':
12780 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012781 kind = PyUnicode_4BYTE_KIND;
Victor Stinnerb9dcffb2011-09-29 00:39:24 +020012782 len = formatchar(pbuf, Py_ARRAY_LENGTH(formatbuf), v);
Benjamin Peterson29060642009-01-31 22:14:21 +000012783 if (len < 0)
12784 goto onError;
12785 break;
12786
12787 default:
12788 PyErr_Format(PyExc_ValueError,
12789 "unsupported format character '%c' (0x%x) "
12790 "at index %zd",
12791 (31<=c && c<=126) ? (char)c : '?',
12792 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012793 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000012794 goto onError;
12795 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012796 /* pbuf is initialized here. */
12797 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000012798 if (sign) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012799 if (PyUnicode_READ(kind, pbuf, pindex) == '-' ||
12800 PyUnicode_READ(kind, pbuf, pindex) == '+') {
12801 sign = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012802 len--;
12803 }
12804 else if (flags & F_SIGN)
12805 sign = '+';
12806 else if (flags & F_BLANK)
12807 sign = ' ';
12808 else
12809 sign = 0;
12810 }
12811 if (width < len)
12812 width = len;
12813 if (rescnt - (sign != 0) < width) {
12814 reslen -= rescnt;
12815 rescnt = width + fmtcnt + 100;
12816 reslen += rescnt;
12817 if (reslen < 0) {
12818 Py_XDECREF(temp);
12819 PyErr_NoMemory();
12820 goto onError;
12821 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012822 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12823 if (res0 == 0) {
12824 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012825 Py_XDECREF(temp);
12826 goto onError;
12827 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012828 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000012829 }
12830 if (sign) {
12831 if (fill != ' ')
12832 *res++ = sign;
12833 rescnt--;
12834 if (width > len)
12835 width--;
12836 }
12837 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012838 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12839 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000012840 if (fill != ' ') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012841 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12842 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012843 }
12844 rescnt -= 2;
12845 width -= 2;
12846 if (width < 0)
12847 width = 0;
12848 len -= 2;
12849 }
12850 if (width > len && !(flags & F_LJUST)) {
12851 do {
12852 --rescnt;
12853 *res++ = fill;
12854 } while (--width > len);
12855 }
12856 if (fill == ' ') {
12857 if (sign)
12858 *res++ = sign;
12859 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012860 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12861 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
12862 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12863 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012864 }
12865 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012866 /* Copy all characters, preserving len */
12867 len1 = len;
12868 while (len1--) {
12869 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12870 rescnt--;
12871 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012872 while (--width >= len) {
12873 --rescnt;
12874 *res++ = ' ';
12875 }
12876 if (dict && (argidx < arglen) && c != '%') {
12877 PyErr_SetString(PyExc_TypeError,
12878 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +000012879 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012880 goto onError;
12881 }
12882 Py_XDECREF(temp);
12883 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012884 } /* until end */
12885 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012886 PyErr_SetString(PyExc_TypeError,
12887 "not all arguments converted during string formatting");
12888 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012889 }
12890
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012891
12892 for (max=0, res = res0; res < res0+reslen-rescnt; res++)
12893 if (*res > max)
12894 max = *res;
12895 result = PyUnicode_New(reslen - rescnt, max);
12896 if (!result)
Benjamin Peterson29060642009-01-31 22:14:21 +000012897 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012898 kind = PyUnicode_KIND(result);
12899 for (res = res0; res < res0+reslen-rescnt; res++)
12900 PyUnicode_WRITE(kind, PyUnicode_DATA(result), res-res0, *res);
12901 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012902 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012903 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012904 }
12905 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012906 return (PyObject *)result;
12907
Benjamin Peterson29060642009-01-31 22:14:21 +000012908 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012909 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012910 Py_DECREF(uformat);
12911 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012912 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012913 }
12914 return NULL;
12915}
12916
Jeremy Hylton938ace62002-07-17 16:30:39 +000012917static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000012918unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
12919
Tim Peters6d6c1a32001-08-02 04:15:00 +000012920static PyObject *
12921unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12922{
Benjamin Peterson29060642009-01-31 22:14:21 +000012923 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012924 static char *kwlist[] = {"object", "encoding", "errors", 0};
12925 char *encoding = NULL;
12926 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000012927
Benjamin Peterson14339b62009-01-31 16:36:08 +000012928 if (type != &PyUnicode_Type)
12929 return unicode_subtype_new(type, args, kwds);
12930 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000012931 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012932 return NULL;
12933 if (x == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012934 return (PyObject *)PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012935 if (encoding == NULL && errors == NULL)
12936 return PyObject_Str(x);
12937 else
Benjamin Peterson29060642009-01-31 22:14:21 +000012938 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000012939}
12940
Guido van Rossume023fe02001-08-30 03:12:59 +000012941static PyObject *
12942unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12943{
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012944 PyUnicodeObject *unicode, *self;
12945 Py_ssize_t length, char_size;
12946 int share_wstr, share_utf8;
12947 unsigned int kind;
12948 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000012949
Benjamin Peterson14339b62009-01-31 16:36:08 +000012950 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012951
12952 unicode = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
12953 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012954 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020012955 assert(_PyUnicode_CHECK(unicode));
Victor Stinnere06e1452011-10-04 20:52:31 +020012956 if (PyUnicode_READY(unicode))
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012957 return NULL;
12958
12959 self = (PyUnicodeObject *) type->tp_alloc(type, 0);
12960 if (self == NULL) {
12961 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012962 return NULL;
12963 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012964 kind = PyUnicode_KIND(unicode);
12965 length = PyUnicode_GET_LENGTH(unicode);
12966
12967 _PyUnicode_LENGTH(self) = length;
12968 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
12969 _PyUnicode_STATE(self).interned = 0;
12970 _PyUnicode_STATE(self).kind = kind;
12971 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020012972 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012973 _PyUnicode_STATE(self).ready = 1;
12974 _PyUnicode_WSTR(self) = NULL;
12975 _PyUnicode_UTF8_LENGTH(self) = 0;
12976 _PyUnicode_UTF8(self) = NULL;
12977 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020012978 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012979
12980 share_utf8 = 0;
12981 share_wstr = 0;
12982 if (kind == PyUnicode_1BYTE_KIND) {
12983 char_size = 1;
12984 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
12985 share_utf8 = 1;
12986 }
12987 else if (kind == PyUnicode_2BYTE_KIND) {
12988 char_size = 2;
12989 if (sizeof(wchar_t) == 2)
12990 share_wstr = 1;
12991 }
12992 else {
12993 assert(kind == PyUnicode_4BYTE_KIND);
12994 char_size = 4;
12995 if (sizeof(wchar_t) == 4)
12996 share_wstr = 1;
12997 }
12998
12999 /* Ensure we won't overflow the length. */
13000 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
13001 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013002 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013003 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013004 data = PyObject_MALLOC((length + 1) * char_size);
13005 if (data == NULL) {
13006 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013007 goto onError;
13008 }
13009
Victor Stinnerc3c74152011-10-02 20:39:55 +020013010 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013011 if (share_utf8) {
13012 _PyUnicode_UTF8_LENGTH(self) = length;
13013 _PyUnicode_UTF8(self) = data;
13014 }
13015 if (share_wstr) {
13016 _PyUnicode_WSTR_LENGTH(self) = length;
13017 _PyUnicode_WSTR(self) = (wchar_t *)data;
13018 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013019
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013020 Py_MEMCPY(data, PyUnicode_DATA(unicode),
13021 PyUnicode_KIND_SIZE(kind, length + 1));
13022 Py_DECREF(unicode);
13023 return (PyObject *)self;
13024
13025onError:
13026 Py_DECREF(unicode);
13027 Py_DECREF(self);
13028 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000013029}
13030
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013031PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000013032 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000013033\n\
Collin Winterd474ce82007-08-07 19:42:11 +000013034Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000013035encoding defaults to the current default string encoding.\n\
13036errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000013037
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013038static PyObject *unicode_iter(PyObject *seq);
13039
/* Type object for str.  The initializer is positional; the comment after
   each entry names the slot it fills. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013040PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000013041 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013042 "str", /* tp_name */
13043 sizeof(PyUnicodeObject), /* tp_basicsize */
13044 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013045 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013046 (destructor)unicode_dealloc, /* tp_dealloc */
13047 0, /* tp_print */
13048 0, /* tp_getattr */
13049 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000013050 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013051 unicode_repr, /* tp_repr */
13052 &unicode_as_number, /* tp_as_number */
13053 &unicode_as_sequence, /* tp_as_sequence */
13054 &unicode_as_mapping, /* tp_as_mapping */
13055 (hashfunc) unicode_hash, /* tp_hash */
13056 0, /* tp_call */
13057 (reprfunc) unicode_str, /* tp_str */
13058 PyObject_GenericGetAttr, /* tp_getattro */
13059 0, /* tp_setattro */
13060 0, /* tp_as_buffer */
13061 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000013062 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013063 unicode_doc, /* tp_doc */
13064 0, /* tp_traverse */
13065 0, /* tp_clear */
13066 PyUnicode_RichCompare, /* tp_richcompare */
13067 0, /* tp_weaklistoffset */
13068 unicode_iter, /* tp_iter */
13069 0, /* tp_iternext */
13070 unicode_methods, /* tp_methods */
13071 0, /* tp_members */
13072 0, /* tp_getset */
13073 &PyBaseObject_Type, /* tp_base */
13074 0, /* tp_dict */
13075 0, /* tp_descr_get */
13076 0, /* tp_descr_set */
13077 0, /* tp_dictoffset */
13078 0, /* tp_init */
13079 0, /* tp_alloc */
13080 unicode_new, /* tp_new */
13081 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013082};
13083
13084/* Initialize the Unicode implementation */
13085
Thomas Wouters78890102000-07-22 19:25:51 +000013086void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013087{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013088 int i;
13089
Thomas Wouters477c8d52006-05-27 19:21:47 +000013090 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013091 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000013092 0x000A, /* LINE FEED */
13093 0x000D, /* CARRIAGE RETURN */
13094 0x001C, /* FILE SEPARATOR */
13095 0x001D, /* GROUP SEPARATOR */
13096 0x001E, /* RECORD SEPARATOR */
13097 0x0085, /* NEXT LINE */
13098 0x2028, /* LINE SEPARATOR */
13099 0x2029, /* PARAGRAPH SEPARATOR */
13100 };
13101
Fred Drakee4315f52000-05-09 19:53:39 +000013102 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020013103 unicode_empty = PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013104 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013105 Py_FatalError("Can't create empty string");
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013106
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013107 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000013108 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000013109 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013110 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000013111
13112 /* initialize the linebreak bloom filter */
13113 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013114 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020013115 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013116
13117 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013118}
13119
13120/* Finalize the Unicode implementation */
13121
/* Performs no work and unconditionally reports 0 to the caller. */
int
PyUnicode_ClearFreeList(void)
{
    const int result = 0;

    return result;
}
13127
Guido van Rossumd57fd912000-03-10 22:53:23 +000013128void
Thomas Wouters78890102000-07-22 19:25:51 +000013129_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013130{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013131 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013132
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000013133 Py_XDECREF(unicode_empty);
13134 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000013135
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013136 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013137 if (unicode_latin1[i]) {
13138 Py_DECREF(unicode_latin1[i]);
13139 unicode_latin1[i] = NULL;
13140 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013141 }
Christian Heimesa156e092008-02-16 07:38:31 +000013142 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000013143}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000013144
Walter Dörwald16807132007-05-25 13:52:07 +000013145void
13146PyUnicode_InternInPlace(PyObject **p)
13147{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013148 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
13149 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020013150#ifdef Py_DEBUG
13151 assert(s != NULL);
13152 assert(_PyUnicode_CHECK(s));
13153#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000013154 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020013155 return;
13156#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000013157 /* If it's a subclass, we don't really know what putting
13158 it in the interned dict might do. */
13159 if (!PyUnicode_CheckExact(s))
13160 return;
13161 if (PyUnicode_CHECK_INTERNED(s))
13162 return;
Victor Stinner1b4f9ce2011-10-03 13:28:14 +020013163 if (_PyUnicode_READY_REPLACE(p)) {
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013164 assert(0 && "_PyUnicode_READY_REPLACE fail in PyUnicode_InternInPlace");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013165 return;
13166 }
Victor Stinner1b4f9ce2011-10-03 13:28:14 +020013167 s = (PyUnicodeObject *)(*p);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013168 if (interned == NULL) {
13169 interned = PyDict_New();
13170 if (interned == NULL) {
13171 PyErr_Clear(); /* Don't leave an exception */
13172 return;
13173 }
13174 }
13175 /* It might be that the GetItem call fails even
13176 though the key is present in the dictionary,
13177 namely when this happens during a stack overflow. */
13178 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +000013179 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013180 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000013181
Benjamin Peterson29060642009-01-31 22:14:21 +000013182 if (t) {
13183 Py_INCREF(t);
13184 Py_DECREF(*p);
13185 *p = t;
13186 return;
13187 }
Walter Dörwald16807132007-05-25 13:52:07 +000013188
Benjamin Peterson14339b62009-01-31 16:36:08 +000013189 PyThreadState_GET()->recursion_critical = 1;
13190 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
13191 PyErr_Clear();
13192 PyThreadState_GET()->recursion_critical = 0;
13193 return;
13194 }
13195 PyThreadState_GET()->recursion_critical = 0;
13196 /* The two references in interned are not counted by refcnt.
13197 The deallocator will take care of this */
13198 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013199 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000013200}
13201
13202void
13203PyUnicode_InternImmortal(PyObject **p)
13204{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013205 PyUnicodeObject *u = (PyUnicodeObject *)*p;
13206
Benjamin Peterson14339b62009-01-31 16:36:08 +000013207 PyUnicode_InternInPlace(p);
13208 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013209 _PyUnicode_STATE(u).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013210 Py_INCREF(*p);
13211 }
Walter Dörwald16807132007-05-25 13:52:07 +000013212}
13213
13214PyObject *
13215PyUnicode_InternFromString(const char *cp)
13216{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013217 PyObject *s = PyUnicode_FromString(cp);
13218 if (s == NULL)
13219 return NULL;
13220 PyUnicode_InternInPlace(&s);
13221 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000013222}
13223
Alexander Belopolsky40018472011-02-26 01:02:56 +000013224void
13225_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000013226{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013227 PyObject *keys;
13228 PyUnicodeObject *s;
13229 Py_ssize_t i, n;
13230 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000013231
Benjamin Peterson14339b62009-01-31 16:36:08 +000013232 if (interned == NULL || !PyDict_Check(interned))
13233 return;
13234 keys = PyDict_Keys(interned);
13235 if (keys == NULL || !PyList_Check(keys)) {
13236 PyErr_Clear();
13237 return;
13238 }
Walter Dörwald16807132007-05-25 13:52:07 +000013239
Benjamin Peterson14339b62009-01-31 16:36:08 +000013240 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
13241 detector, interned unicode strings are not forcibly deallocated;
13242 rather, we give them their stolen references back, and then clear
13243 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000013244
Benjamin Peterson14339b62009-01-31 16:36:08 +000013245 n = PyList_GET_SIZE(keys);
13246 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000013247 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013248 for (i = 0; i < n; i++) {
13249 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013250 if (PyUnicode_READY(s) == -1) {
13251 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013252 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013253 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013254 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013255 case SSTATE_NOT_INTERNED:
13256 /* XXX Shouldn't happen */
13257 break;
13258 case SSTATE_INTERNED_IMMORTAL:
13259 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013260 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013261 break;
13262 case SSTATE_INTERNED_MORTAL:
13263 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013264 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013265 break;
13266 default:
13267 Py_FatalError("Inconsistent interned string state.");
13268 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013269 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013270 }
13271 fprintf(stderr, "total size of all interned strings: "
13272 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
13273 "mortal/immortal\n", mortal_size, immortal_size);
13274 Py_DECREF(keys);
13275 PyDict_Clear(interned);
13276 Py_DECREF(interned);
13277 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000013278}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013279
13280
13281/********************* Unicode Iterator **************************/
13282
/* State of an iterator over a str object. */
13283typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013284 PyObject_HEAD
13285 Py_ssize_t it_index; /* Index of the next character to hand out */
13286 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013287} unicodeiterobject;
13288
13289static void
13290unicodeiter_dealloc(unicodeiterobject *it)
13291{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013292 _PyObject_GC_UNTRACK(it);
13293 Py_XDECREF(it->it_seq);
13294 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013295}
13296
13297static int
13298unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
13299{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013300 Py_VISIT(it->it_seq);
13301 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013302}
13303
13304static PyObject *
13305unicodeiter_next(unicodeiterobject *it)
13306{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013307 PyUnicodeObject *seq;
13308 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013309
Benjamin Peterson14339b62009-01-31 16:36:08 +000013310 assert(it != NULL);
13311 seq = it->it_seq;
13312 if (seq == NULL)
13313 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013314 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013315
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013316 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
13317 int kind = PyUnicode_KIND(seq);
13318 void *data = PyUnicode_DATA(seq);
13319 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
13320 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013321 if (item != NULL)
13322 ++it->it_index;
13323 return item;
13324 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013325
Benjamin Peterson14339b62009-01-31 16:36:08 +000013326 Py_DECREF(seq);
13327 it->it_seq = NULL;
13328 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013329}
13330
13331static PyObject *
13332unicodeiter_len(unicodeiterobject *it)
13333{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013334 Py_ssize_t len = 0;
13335 if (it->it_seq)
13336 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
13337 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013338}
13339
13340PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
13341
/* Method table of str iterators; __length_hint__ is the only entry. */
13342static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013343 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000013344 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000013345 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013346};
13347
/* Type object of str iterators.  The initializer is positional; the comment
   after each entry names the slot it fills. */
13348PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013349 PyVarObject_HEAD_INIT(&PyType_Type, 0)
13350 "str_iterator", /* tp_name */
13351 sizeof(unicodeiterobject), /* tp_basicsize */
13352 0, /* tp_itemsize */
13353 /* methods */
13354 (destructor)unicodeiter_dealloc, /* tp_dealloc */
13355 0, /* tp_print */
13356 0, /* tp_getattr */
13357 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000013358 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013359 0, /* tp_repr */
13360 0, /* tp_as_number */
13361 0, /* tp_as_sequence */
13362 0, /* tp_as_mapping */
13363 0, /* tp_hash */
13364 0, /* tp_call */
13365 0, /* tp_str */
13366 PyObject_GenericGetAttr, /* tp_getattro */
13367 0, /* tp_setattro */
13368 0, /* tp_as_buffer */
13369 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
13370 0, /* tp_doc */
13371 (traverseproc)unicodeiter_traverse, /* tp_traverse */
13372 0, /* tp_clear */
13373 0, /* tp_richcompare */
13374 0, /* tp_weaklistoffset */
13375 PyObject_SelfIter, /* tp_iter */
13376 (iternextfunc)unicodeiter_next, /* tp_iternext */
13377 unicodeiter_methods, /* tp_methods */
13378 0, /* tp_members */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013379};
13380
13381static PyObject *
13382unicode_iter(PyObject *seq)
13383{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013384 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013385
Benjamin Peterson14339b62009-01-31 16:36:08 +000013386 if (!PyUnicode_Check(seq)) {
13387 PyErr_BadInternalCall();
13388 return NULL;
13389 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013390 if (PyUnicode_READY(seq) == -1)
13391 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013392 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
13393 if (it == NULL)
13394 return NULL;
13395 it->it_index = 0;
13396 Py_INCREF(seq);
13397 it->it_seq = (PyUnicodeObject *)seq;
13398 _PyObject_GC_TRACK(it);
13399 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013400}
13401
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013402#define UNIOP(x) Py_UNICODE_##x
13403#define UNIOP_t Py_UNICODE
13404#include "uniops.h"
13405#undef UNIOP
13406#undef UNIOP_t
13407#define UNIOP(x) Py_UCS4_##x
13408#define UNIOP_t Py_UCS4
13409#include "uniops.h"
13410#undef UNIOP
13411#undef UNIOP_t
Victor Stinner331ea922010-08-10 16:37:20 +000013412
Victor Stinner71133ff2010-09-01 23:43:53 +000013413Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000013414PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000013415{
13416 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
13417 Py_UNICODE *copy;
13418 Py_ssize_t size;
13419
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013420 if (!PyUnicode_Check(unicode)) {
13421 PyErr_BadArgument();
13422 return NULL;
13423 }
Victor Stinner71133ff2010-09-01 23:43:53 +000013424 /* Ensure we won't overflow the size. */
13425 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
13426 PyErr_NoMemory();
13427 return NULL;
13428 }
13429 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
13430 size *= sizeof(Py_UNICODE);
13431 copy = PyMem_Malloc(size);
13432 if (copy == NULL) {
13433 PyErr_NoMemory();
13434 return NULL;
13435 }
13436 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
13437 return copy;
13438}
Martin v. Löwis5b222132007-06-10 09:51:05 +000013439
Georg Brandl66c221e2010-10-14 07:04:07 +000013440/* A _string module, to export formatter_parser and formatter_field_name_split
13441 to the string.Formatter class implemented in Python. */
13442
/* Functions exported by the _string module; both take a single argument
   (METH_O). */
13443static PyMethodDef _string_methods[] = {
13444 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
13445 METH_O, PyDoc_STR("split the argument as a field name")},
13446 {"formatter_parser", (PyCFunction) formatter_parser,
13447 METH_O, PyDoc_STR("parse the argument as a format string")},
13448 {NULL, NULL} /* sentinel */
13449};
13450
/* Module definition for _string: name, docstring and method table are set;
   the remaining entries are 0/NULL. */
13451static struct PyModuleDef _string_module = {
13452 PyModuleDef_HEAD_INIT,
13453 "_string", /* module name */
13454 PyDoc_STR("string helper module"), /* module docstring */
13455 0,
13456 _string_methods, /* method table */
13457 NULL,
13458 NULL,
13459 NULL,
13460 NULL
13461};
13462
13463PyMODINIT_FUNC
13464PyInit__string(void)
13465{
13466 return PyModule_Create(&_string_module);
13467}
13468
13469
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013470#ifdef __cplusplus
13471}
13472#endif