blob: df2c471c96ac9e918c49e26ebb61395fbec42041 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000044
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000045#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000046#include <windows.h>
47#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000048
Guido van Rossumd57fd912000-03-10 22:53:23 +000049/* Limit for the Unicode object free list */
50
Christian Heimes2202f872008-02-06 14:31:34 +000051#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000052
53/* Limit for the Unicode object free list stay alive optimization.
54
55 The implementation will keep allocated Unicode memory intact for
56 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000057 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000058
Christian Heimes2202f872008-02-06 14:31:34 +000059 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000060 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000061 malloc()-overhead) bytes of unused garbage.
62
63 Setting the limit to 0 effectively turns the feature off.
64
Guido van Rossumfd4b9572000-04-10 13:51:10 +000065 Note: This is an experimental feature ! If you get core dumps when
66 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000067
68*/
69
Guido van Rossumfd4b9572000-04-10 13:51:10 +000070#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000071
72/* Endianness switches; defaults to little endian */
73
74#ifdef WORDS_BIGENDIAN
75# define BYTEORDER_IS_BIG_ENDIAN
76#else
77# define BYTEORDER_IS_LITTLE_ENDIAN
78#endif
79
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000080/* --- Globals ------------------------------------------------------------
81
82 The globals are initialized by the _PyUnicode_Init() API and should
83 not be used before calling that API.
84
85*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000086
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000087
88#ifdef __cplusplus
89extern "C" {
90#endif
91
/* Validation hook used by the assertions in this file: a plain type check
   in regular builds, the full consistency checker when Py_DEBUG is
   defined. */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020097
/* Field accessors.  The forms that consist of a bare member access perform
   no checking and can be used as assignment targets; the other forms
   assert that the object passes _PyUnicode_CHECK() first. */

/* UTF-8 buffer pointer stored in the PyCompactUnicodeObject header. */
#define _PyUnicode_UTF8(op) \
    (((PyCompactUnicodeObject*)(op))->utf8)

/* UTF-8 buffer of a ready string: for a compact ASCII string this is the
   memory right behind the PyASCIIObject header, otherwise the stored utf8
   pointer. */
#define PyUnicode_UTF8(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     PyUnicode_IS_COMPACT_ASCII(op) ? \
         ((char*)((PyASCIIObject*)(op) + 1)) : \
         _PyUnicode_UTF8(op))

/* Stored length of the UTF-8 buffer. */
#define _PyUnicode_UTF8_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->utf8_length)

/* UTF-8 length of a ready string: the character count for a compact ASCII
   string, otherwise the stored utf8_length. */
#define PyUnicode_UTF8_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     PyUnicode_IS_COMPACT_ASCII(op) ? \
         ((PyASCIIObject*)(op))->length : \
         _PyUnicode_UTF8_LENGTH(op))

/* wchar_t buffer pointer and its stored length. */
#define _PyUnicode_WSTR(op) \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->wstr_length)

/* Members of the PyASCIIObject header: length, state bit-field, hash. */
#define _PyUnicode_LENGTH(op) \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) \
    (((PyASCIIObject *)(op))->hash)

/* Checked read access to the kind and the length. */
#define _PyUnicode_KIND(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->length)

/* Character buffer pointer stored in a PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op) \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200132
/* File-local redefinition of PyUnicode_READY(): evaluates to 0 when op is
   already in the ready state, otherwise to the result of
   _PyUnicode_Ready(). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready((PyObject *)(op))))

/* Variant taking a pointer to an object pointer: evaluates to 0 when
   *p_obj is already ready, otherwise to the result of
   _PyUnicode_ReadyReplace().  The argument is parenthesized before being
   dereferenced so that compound expressions (e.g. "items + 1") expand
   correctly. */
#define _PyUnicode_READY_REPLACE(p_obj)                 \
    (assert(_PyUnicode_CHECK(*(p_obj))),                \
     (PyUnicode_IS_READY(*(p_obj)) ?                    \
      0 : _PyUnicode_ReadyReplace((PyObject **)(p_obj))))
144
/* True if the UTF-8 pointer of op refers to the character data itself.
   Must not be used on compact ASCII strings (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer of op refers to the character data itself.
   Every reference goes through the macro parameter, so the macro works
   whatever the caller's variable is called. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
152
/* True if the Unicode object owns a UTF-8 memory block of its own, i.e.
   it is not a compact ASCII string, its utf8 pointer is set and that
   pointer is not the character data. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* True if the Unicode object owns a wstr memory block of its own, i.e.
   its wstr pointer is set and the string is either not ready yet or the
   pointer is not the character data. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op)                               \
      && (!PyUnicode_IS_READY(op)                       \
          || _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
168
/* Generic helper macro to convert characters between element types.
   from_type and to_type must be valid type names.  begin and end delimit
   the source characters and should be of type "from_type *"; to points to
   the destination buffer, which is accessed as "to_type *".  Each source
   element is cast to to_type and stored in order.  The end expression is
   re-evaluated on every iteration. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        const from_type *iter_;                         \
        to_type *to_;                                   \
        iter_ = (begin);                                \
        to_ = (to_type *)(to);                          \
        while (iter_ < (end)) {                         \
            *to_++ = (to_type)*iter_++;                 \
        }                                               \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200183
Victor Stinnerb15d4d82011-09-28 23:59:20 +0200184/* The Unicode string has been modified: reset the hash */
185#define _PyUnicode_DIRTY(op) do { _PyUnicode_HASH(op) = -1; } while (0)
186
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
195static PyObject *interned;
196
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000197/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200198static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000199
200/* Single character Unicode strings in the Latin-1 range are being
201 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200202static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000203
/* Fast detection of the most frequent whitespace characters: a table with
   one entry per code point in 0x00-0x7F, non-zero for whitespace. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
       0x0B LINE TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x7F: no whitespace characters */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
234
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200235/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200236static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200237static PyObject* get_latin1_char(unsigned char ch);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200238
Alexander Belopolsky40018472011-02-26 01:02:56 +0000239static PyObject *
240unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000241 PyObject **errorHandler,const char *encoding, const char *reason,
242 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
243 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
244
Alexander Belopolsky40018472011-02-26 01:02:56 +0000245static void
246raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300247 const char *encoding,
248 const Py_UNICODE *unicode, Py_ssize_t size,
249 Py_ssize_t startpos, Py_ssize_t endpos,
250 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000251
/* Fast detection of line break characters: a table with one entry per
   code point in 0x00-0x7F, non-zero for line breaks.  The table is only
   ever read, so it is declared const. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00-0x0F: 0x0A LINE FEED, 0x0B LINE TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x7F: no line break characters */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
279
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300280/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
281 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000282Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000283PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000284{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000285#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000286 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000287#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000288 /* This is actually an illegal character, so it should
289 not be passed to unichr. */
290 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000291#endif
292}
293
#ifdef Py_DEBUG
/* Debug-only validation of the internal state of a Unicode object.

   op must be a Unicode object.  Every invariant is checked with assert(),
   so a violation aborts; the function returns 1 whenever it returns, which
   lets it be used inside assert() itself. */
static int
_PyUnicode_CheckConsistency(void *op)
{
    PyASCIIObject *ascii;
    PyCompactUnicodeObject *compact;
    unsigned int kind;
    void *data;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    /* Compact ASCII: only the PyASCIIObject header exists, so there is
       nothing to check beyond kind and readiness. */
    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
        return 1;
    }

    compact = (PyCompactUnicodeObject *)op;

    if (ascii->state.compact == 1) {
        /* compact, non-ASCII: characters follow the compact header */
        data = compact + 1;
        assert(kind == PyUnicode_1BYTE_KIND
               || kind == PyUnicode_2BYTE_KIND
               || kind == PyUnicode_4BYTE_KIND);
        assert(ascii->state.ascii == 0);
        assert(ascii->state.ready == 1);
        assert (compact->utf8 != data);
    }
    else {
        PyUnicodeObject *unicode = (PyUnicodeObject *)op;

        data = unicode->data.any;
        if (kind == PyUnicode_WCHAR_KIND) {
            /* not ready yet: only the wstr representation exists */
            assert(ascii->state.compact == 0);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 0);
            assert(ascii->wstr != NULL);
            assert(data == NULL);
            assert(compact->utf8 == NULL);
            assert(ascii->state.interned == SSTATE_NOT_INTERNED);
        }
        else {
            /* ready, with a separately stored character buffer */
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.compact == 0);
            assert(ascii->state.ready == 1);
            assert(data != NULL);
            if (ascii->state.ascii) {
                assert (compact->utf8 == data);
                assert (compact->utf8_length == ascii->length);
            }
            else {
                assert (compact->utf8 != data);
            }
        }
    }

    /* wstr must alias the character data exactly when the kind has the
       same width as wchar_t */
    if (kind != PyUnicode_WCHAR_KIND) {
#if SIZEOF_WCHAR_T == 2
        if (kind == PyUnicode_2BYTE_KIND)
#else
        if (kind == PyUnicode_4BYTE_KIND)
#endif
        {
            assert(ascii->wstr == data);
            assert(compact->wstr_length == ascii->length);
        }
        else {
            assert(ascii->wstr != data);
        }
    }

    /* a missing buffer must have a zero length recorded */
    if (compact->utf8 == NULL) {
        assert(compact->utf8_length == 0);
    }
    if (ascii->wstr == NULL) {
        assert(compact->wstr_length == 0);
    }
    return 1;
}
#endif
373
Thomas Wouters477c8d52006-05-27 19:21:47 +0000374/* --- Bloom Filters ----------------------------------------------------- */
375
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the low
   log2(BLOOM_WIDTH) bits of each Unicode character as the bit index. */
379
380/* the linebreak mask is set up by Unicode_Init below */
381
Antoine Pitrouf068f942010-01-13 14:19:12 +0000382#if LONG_BIT >= 128
383#define BLOOM_WIDTH 128
384#elif LONG_BIT >= 64
385#define BLOOM_WIDTH 64
386#elif LONG_BIT >= 32
387#define BLOOM_WIDTH 32
388#else
389#error "LONG_BIT is smaller than 32"
390#endif
391
Thomas Wouters477c8d52006-05-27 19:21:47 +0000392#define BLOOM_MASK unsigned long
393
394static BLOOM_MASK bloom_linebreak;
395
/* BLOOM_ADD sets, and BLOOM tests, the bit of mask selected by the low
   bits of ch (ch modulo BLOOM_WIDTH).  mask is parenthesized so that a
   compound expression such as "a | b" is combined as a whole instead of
   being split by operator precedence. */
#define BLOOM_ADD(mask, ch) ((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))
#define BLOOM(mask, ch)     ((mask) &  (1UL << ((ch) & (BLOOM_WIDTH - 1))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000398
Benjamin Peterson29060642009-01-31 22:14:21 +0000399#define BLOOM_LINEBREAK(ch) \
400 ((ch) < 128U ? ascii_linebreak[(ch)] : \
401 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000402
Alexander Belopolsky40018472011-02-26 01:02:56 +0000403Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200404make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000405{
406 /* calculate simple bloom-style bitmask for a given unicode string */
407
Antoine Pitrouf068f942010-01-13 14:19:12 +0000408 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000409 Py_ssize_t i;
410
411 mask = 0;
412 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200413 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000414
415 return mask;
416}
417
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200418#define BLOOM_MEMBER(mask, chr, str) \
419 (BLOOM(mask, chr) \
420 && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000421
Guido van Rossumd57fd912000-03-10 22:53:23 +0000422/* --- Unicode Object ----------------------------------------------------- */
423
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200424static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200425fixup(PyUnicodeObject *self, Py_UCS4 (*fixfct)(PyUnicodeObject *s));
426
427Py_LOCAL_INLINE(char *) findchar(void *s, int kind,
428 Py_ssize_t size, Py_UCS4 ch,
429 int direction)
430{
431 /* like wcschr, but doesn't stop at NULL characters */
432 Py_ssize_t i;
433 if (direction == 1) {
434 for(i = 0; i < size; i++)
435 if (PyUnicode_READ(kind, s, i) == ch)
436 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
437 }
438 else {
439 for(i = size-1; i >= 0; i--)
440 if (PyUnicode_READ(kind, s, i) == ch)
441 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
442 }
443 return NULL;
444}
445
Victor Stinnerfe226c02011-10-03 03:52:20 +0200446static PyObject*
447resize_compact(PyObject *unicode, Py_ssize_t length)
448{
449 Py_ssize_t char_size;
450 Py_ssize_t struct_size;
451 Py_ssize_t new_size;
452 int share_wstr;
453
454 assert(PyUnicode_IS_READY(unicode));
455 char_size = PyUnicode_CHARACTER_SIZE(unicode);
456 if (PyUnicode_IS_COMPACT_ASCII(unicode))
457 struct_size = sizeof(PyASCIIObject);
458 else
459 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200460 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200461
462 _Py_DEC_REFTOTAL;
463 _Py_ForgetReference(unicode);
464
465 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
466 PyErr_NoMemory();
467 return NULL;
468 }
469 new_size = (struct_size + (length + 1) * char_size);
470
471 unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
472 if (unicode == NULL) {
473 PyObject_Del(unicode);
474 PyErr_NoMemory();
475 return NULL;
476 }
477 _Py_NewReference(unicode);
478 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200479 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200480 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200481 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
482 _PyUnicode_WSTR_LENGTH(unicode) = length;
483 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200484 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
485 length, 0);
486 return unicode;
487}
488
Alexander Belopolsky40018472011-02-26 01:02:56 +0000489static int
Victor Stinner95663112011-10-04 01:03:50 +0200490resize_inplace(PyUnicodeObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000491{
Victor Stinner95663112011-10-04 01:03:50 +0200492 wchar_t *wstr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200493 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200494 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000495
Victor Stinner95663112011-10-04 01:03:50 +0200496 _PyUnicode_DIRTY(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200497
498 if (PyUnicode_IS_READY(unicode)) {
499 Py_ssize_t char_size;
500 Py_ssize_t new_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200501 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200502 void *data;
503
504 data = _PyUnicode_DATA_ANY(unicode);
505 assert(data != NULL);
506 char_size = PyUnicode_CHARACTER_SIZE(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200507 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
508 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinner95663112011-10-04 01:03:50 +0200509 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
510 {
511 PyObject_DEL(_PyUnicode_UTF8(unicode));
512 _PyUnicode_UTF8(unicode) = NULL;
513 _PyUnicode_UTF8_LENGTH(unicode) = 0;
514 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200515
516 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
517 PyErr_NoMemory();
518 return -1;
519 }
520 new_size = (length + 1) * char_size;
521
522 data = (PyObject *)PyObject_REALLOC(data, new_size);
523 if (data == NULL) {
524 PyErr_NoMemory();
525 return -1;
526 }
527 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200528 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200529 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200530 _PyUnicode_WSTR_LENGTH(unicode) = length;
531 }
532 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200533 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200534 _PyUnicode_UTF8_LENGTH(unicode) = length;
535 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200536 _PyUnicode_LENGTH(unicode) = length;
537 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinner95663112011-10-04 01:03:50 +0200538 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
539 _PyUnicode_CHECK(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200540 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200541 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200542 }
Victor Stinner95663112011-10-04 01:03:50 +0200543 assert(_PyUnicode_WSTR(unicode) != NULL);
544
545 /* check for integer overflow */
546 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
547 PyErr_NoMemory();
548 return -1;
549 }
550 wstr = _PyUnicode_WSTR(unicode);
551 wstr = PyObject_REALLOC(wstr, sizeof(wchar_t) * (length + 1));
552 if (!wstr) {
553 PyErr_NoMemory();
554 return -1;
555 }
556 _PyUnicode_WSTR(unicode) = wstr;
557 _PyUnicode_WSTR(unicode)[length] = 0;
558 _PyUnicode_WSTR_LENGTH(unicode) = length;
559 _PyUnicode_CHECK(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000560 return 0;
561}
562
Victor Stinnerfe226c02011-10-03 03:52:20 +0200563static PyObject*
564resize_copy(PyObject *unicode, Py_ssize_t length)
565{
566 Py_ssize_t copy_length;
567 if (PyUnicode_IS_COMPACT(unicode)) {
568 PyObject *copy;
569 assert(PyUnicode_IS_READY(unicode));
570
571 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
572 if (copy == NULL)
573 return NULL;
574
575 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
576 if (PyUnicode_CopyCharacters(copy, 0,
577 unicode, 0,
578 copy_length) < 0)
579 {
580 Py_DECREF(copy);
581 return NULL;
582 }
583 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200584 }
585 else {
Victor Stinner2fd82272011-10-03 04:06:05 +0200586 PyUnicodeObject *w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200587 assert(_PyUnicode_WSTR(unicode) != NULL);
588 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinner2fd82272011-10-03 04:06:05 +0200589 w = _PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200590 if (w == NULL)
591 return NULL;
592 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
593 copy_length = Py_MIN(copy_length, length);
594 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
595 copy_length);
596 return (PyObject*)w;
597 }
598}
599
/* We allocate one more character to make sure the string is
   Ux0000 terminated; some code (e.g. new_identifier)
   relies on that.

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
608
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200609#ifdef Py_DEBUG
610int unicode_old_new_calls = 0;
611#endif
612
Alexander Belopolsky40018472011-02-26 01:02:56 +0000613static PyUnicodeObject *
614_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000615{
616 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200617 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000618
Thomas Wouters477c8d52006-05-27 19:21:47 +0000619 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000620 if (length == 0 && unicode_empty != NULL) {
621 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200622 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000623 }
624
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000625 /* Ensure we won't overflow the size. */
626 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
627 return (PyUnicodeObject *)PyErr_NoMemory();
628 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200629 if (length < 0) {
630 PyErr_SetString(PyExc_SystemError,
631 "Negative size passed to _PyUnicode_New");
632 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000633 }
634
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200635#ifdef Py_DEBUG
636 ++unicode_old_new_calls;
637#endif
638
639 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
640 if (unicode == NULL)
641 return NULL;
642 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
643 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
644 if (!_PyUnicode_WSTR(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000645 PyErr_NoMemory();
646 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000647 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200648
Jeremy Hyltond8082792003-09-16 19:41:39 +0000649 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000650 * the caller fails before initializing str -- unicode_resize()
651 * reads str[0], and the Keep-Alive optimization can keep memory
652 * allocated for str alive across a call to unicode_dealloc(unicode).
653 * We don't want unicode_resize to read uninitialized memory in
654 * that case.
655 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200656 _PyUnicode_WSTR(unicode)[0] = 0;
657 _PyUnicode_WSTR(unicode)[length] = 0;
658 _PyUnicode_WSTR_LENGTH(unicode) = length;
659 _PyUnicode_HASH(unicode) = -1;
660 _PyUnicode_STATE(unicode).interned = 0;
661 _PyUnicode_STATE(unicode).kind = 0;
662 _PyUnicode_STATE(unicode).compact = 0;
663 _PyUnicode_STATE(unicode).ready = 0;
664 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200665 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200666 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200667 _PyUnicode_UTF8(unicode) = NULL;
668 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000669 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000670
Benjamin Peterson29060642009-01-31 22:14:21 +0000671 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000672 /* XXX UNREF/NEWREF interface should be more symmetrical */
673 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000674 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000675 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000676 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000677}
678
Victor Stinnerf42dc442011-10-02 23:33:16 +0200679static const char*
680unicode_kind_name(PyObject *unicode)
681{
Victor Stinner42dfd712011-10-03 14:41:45 +0200682 /* don't check consistency: unicode_kind_name() is called from
683 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200684 if (!PyUnicode_IS_COMPACT(unicode))
685 {
686 if (!PyUnicode_IS_READY(unicode))
687 return "wstr";
688 switch(PyUnicode_KIND(unicode))
689 {
690 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200691 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200692 return "legacy ascii";
693 else
694 return "legacy latin1";
695 case PyUnicode_2BYTE_KIND:
696 return "legacy UCS2";
697 case PyUnicode_4BYTE_KIND:
698 return "legacy UCS4";
699 default:
700 return "<legacy invalid kind>";
701 }
702 }
703 assert(PyUnicode_IS_READY(unicode));
704 switch(PyUnicode_KIND(unicode))
705 {
706 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200707 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200708 return "ascii";
709 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200710 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200711 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200712 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200713 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200714 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200715 default:
716 return "<invalid compact kind>";
717 }
718}
719
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200720#ifdef Py_DEBUG
/* Counter incremented by PyUnicode_New() each time it goes past the
   empty-string shortcut (debug builds only). */
int unicode_new_new_calls = 0;

/* Functions wrapping macros for use in debugger */

/* Function form of the PyUnicode_UTF8() macro. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
727
/* Function form of the _PyUnicode_COMPACT_DATA() macro, callable from a
   debugger. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
731void *_PyUnicode_data(void *unicode){
732 printf("obj %p\n", unicode);
733 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
734 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
735 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
736 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
737 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
738 return PyUnicode_DATA(unicode);
739}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200740
741void
742_PyUnicode_Dump(PyObject *op)
743{
744 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200745 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
746 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
747 void *data;
748 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
749 if (ascii->state.compact)
750 data = (compact + 1);
751 else
752 data = unicode->data.any;
753 if (ascii->wstr == data)
754 printf("shared ");
755 printf("wstr=%p", ascii->wstr);
Victor Stinnera3b334d2011-10-03 13:53:37 +0200756 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +0200757 printf(" (%zu), ", compact->wstr_length);
758 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
759 printf("shared ");
760 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200761 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200762 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200763}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200764#endif
765
766PyObject *
767PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
768{
769 PyObject *obj;
770 PyCompactUnicodeObject *unicode;
771 void *data;
772 int kind_state;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200773 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200774 Py_ssize_t char_size;
775 Py_ssize_t struct_size;
776
777 /* Optimization for empty strings */
778 if (size == 0 && unicode_empty != NULL) {
779 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200780 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200781 }
782
783#ifdef Py_DEBUG
784 ++unicode_new_new_calls;
785#endif
786
Victor Stinner9e9d6892011-10-04 01:02:02 +0200787 is_ascii = 0;
788 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200789 struct_size = sizeof(PyCompactUnicodeObject);
790 if (maxchar < 128) {
791 kind_state = PyUnicode_1BYTE_KIND;
792 char_size = 1;
793 is_ascii = 1;
794 struct_size = sizeof(PyASCIIObject);
795 }
796 else if (maxchar < 256) {
797 kind_state = PyUnicode_1BYTE_KIND;
798 char_size = 1;
799 }
800 else if (maxchar < 65536) {
801 kind_state = PyUnicode_2BYTE_KIND;
802 char_size = 2;
803 if (sizeof(wchar_t) == 2)
804 is_sharing = 1;
805 }
806 else {
807 kind_state = PyUnicode_4BYTE_KIND;
808 char_size = 4;
809 if (sizeof(wchar_t) == 4)
810 is_sharing = 1;
811 }
812
813 /* Ensure we won't overflow the size. */
814 if (size < 0) {
815 PyErr_SetString(PyExc_SystemError,
816 "Negative size passed to PyUnicode_New");
817 return NULL;
818 }
819 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
820 return PyErr_NoMemory();
821
822 /* Duplicated allocation code from _PyObject_New() instead of a call to
823 * PyObject_New() so we are able to allocate space for the object and
824 * it's data buffer.
825 */
826 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
827 if (obj == NULL)
828 return PyErr_NoMemory();
829 obj = PyObject_INIT(obj, &PyUnicode_Type);
830 if (obj == NULL)
831 return NULL;
832
833 unicode = (PyCompactUnicodeObject *)obj;
834 if (is_ascii)
835 data = ((PyASCIIObject*)obj) + 1;
836 else
837 data = unicode + 1;
838 _PyUnicode_LENGTH(unicode) = size;
839 _PyUnicode_HASH(unicode) = -1;
840 _PyUnicode_STATE(unicode).interned = 0;
841 _PyUnicode_STATE(unicode).kind = kind_state;
842 _PyUnicode_STATE(unicode).compact = 1;
843 _PyUnicode_STATE(unicode).ready = 1;
844 _PyUnicode_STATE(unicode).ascii = is_ascii;
845 if (is_ascii) {
846 ((char*)data)[size] = 0;
847 _PyUnicode_WSTR(unicode) = NULL;
848 }
849 else if (kind_state == PyUnicode_1BYTE_KIND) {
850 ((char*)data)[size] = 0;
851 _PyUnicode_WSTR(unicode) = NULL;
852 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200853 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200854 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200855 }
856 else {
857 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200858 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200859 if (kind_state == PyUnicode_2BYTE_KIND)
860 ((Py_UCS2*)data)[size] = 0;
861 else /* kind_state == PyUnicode_4BYTE_KIND */
862 ((Py_UCS4*)data)[size] = 0;
863 if (is_sharing) {
864 _PyUnicode_WSTR_LENGTH(unicode) = size;
865 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
866 }
867 else {
868 _PyUnicode_WSTR_LENGTH(unicode) = 0;
869 _PyUnicode_WSTR(unicode) = NULL;
870 }
871 }
872 return obj;
873}
874
#if SIZEOF_WCHAR_T == 2
/* Helper function to convert a 16-bit wchar_t representation to UCS4.  It
   decodes surrogate pairs; the other conversions are implemented as macros
   for efficiency.

   A high surrogate (0xD800-0xDBFF) immediately followed by a low surrogate
   (0xDC00-0xDFFF) is combined into one code point; any other code unit,
   including a lone surrogate, is copied unchanged.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyUnicodeObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        int is_pair;

        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        is_pair = (src[0] >= 0xD800 && src[0] <= 0xDBFF
                   && (src + 1) < end
                   && src[1] >= 0xDC00 && src[1] <= 0xDFFF);
        if (!is_pair) {
            *dst++ = *src++;
            continue;
        }
        *dst++ = (((src[0] & 0x3FF) << 10) | (src[1] & 0x3FF)) + 0x10000;
        src += 2;
    }
    /* the destination must have been filled exactly */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
913
Victor Stinnercd9950f2011-10-02 00:34:53 +0200914static int
915_PyUnicode_Dirty(PyObject *unicode)
916{
Victor Stinner910337b2011-10-03 03:20:16 +0200917 assert(_PyUnicode_CHECK(unicode));
Victor Stinnercd9950f2011-10-02 00:34:53 +0200918 if (Py_REFCNT(unicode) != 1) {
Victor Stinner01698042011-10-04 00:04:26 +0200919 PyErr_SetString(PyExc_SystemError,
Victor Stinnercd9950f2011-10-02 00:34:53 +0200920 "Cannot modify a string having more than 1 reference");
921 return -1;
922 }
923 _PyUnicode_DIRTY(unicode);
924 return 0;
925}
926
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200927Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200928PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
929 PyObject *from, Py_ssize_t from_start,
930 Py_ssize_t how_many)
931{
Victor Stinnera0702ab2011-09-29 14:14:38 +0200932 unsigned int from_kind, to_kind;
933 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200934
Victor Stinnerb1536152011-09-30 02:26:10 +0200935 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
936 PyErr_BadInternalCall();
937 return -1;
938 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200939
940 if (PyUnicode_READY(from))
941 return -1;
942 if (PyUnicode_READY(to))
943 return -1;
944
Victor Stinnerff9e50f2011-09-28 22:17:19 +0200945 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200946 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
Victor Stinner01698042011-10-04 00:04:26 +0200947 PyErr_Format(PyExc_SystemError,
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200948 "Cannot write %zi characters at %zi "
949 "in a string of %zi characters",
950 how_many, to_start, PyUnicode_GET_LENGTH(to));
951 return -1;
952 }
Victor Stinnerf5ca1a22011-09-28 23:54:59 +0200953 if (how_many == 0)
954 return 0;
955
Victor Stinnercd9950f2011-10-02 00:34:53 +0200956 if (_PyUnicode_Dirty(to))
Victor Stinnerf5ca1a22011-09-28 23:54:59 +0200957 return -1;
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200958
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200959 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200960 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200961 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200962 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200963
Victor Stinnerf42dc442011-10-02 23:33:16 +0200964 if (from_kind == to_kind
965 /* deny latin1 => ascii */
966 && PyUnicode_MAX_CHAR_VALUE(to) >= PyUnicode_MAX_CHAR_VALUE(from))
967 {
Victor Stinnera0702ab2011-09-29 14:14:38 +0200968 Py_MEMCPY((char*)to_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200969 + PyUnicode_KIND_SIZE(to_kind, to_start),
Victor Stinnera0702ab2011-09-29 14:14:38 +0200970 (char*)from_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200971 + PyUnicode_KIND_SIZE(from_kind, from_start),
972 PyUnicode_KIND_SIZE(to_kind, how_many));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200973 }
Victor Stinnera0702ab2011-09-29 14:14:38 +0200974 else if (from_kind == PyUnicode_1BYTE_KIND
975 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200976 {
977 _PyUnicode_CONVERT_BYTES(
978 Py_UCS1, Py_UCS2,
979 PyUnicode_1BYTE_DATA(from) + from_start,
980 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
981 PyUnicode_2BYTE_DATA(to) + to_start
982 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200983 }
Victor Stinner157f83f2011-09-28 21:41:31 +0200984 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200985 && to_kind == PyUnicode_4BYTE_KIND)
986 {
987 _PyUnicode_CONVERT_BYTES(
988 Py_UCS1, Py_UCS4,
989 PyUnicode_1BYTE_DATA(from) + from_start,
990 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
991 PyUnicode_4BYTE_DATA(to) + to_start
992 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200993 }
994 else if (from_kind == PyUnicode_2BYTE_KIND
995 && to_kind == PyUnicode_4BYTE_KIND)
996 {
997 _PyUnicode_CONVERT_BYTES(
998 Py_UCS2, Py_UCS4,
999 PyUnicode_2BYTE_DATA(from) + from_start,
1000 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1001 PyUnicode_4BYTE_DATA(to) + to_start
1002 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001003 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001004 else {
1005 int invalid_kinds;
Victor Stinnerf42dc442011-10-02 23:33:16 +02001006
1007 /* check if max_char(from substring) <= max_char(to) */
1008 if (from_kind > to_kind
1009 /* latin1 => ascii */
Victor Stinnera3b334d2011-10-03 13:53:37 +02001010 || (PyUnicode_IS_ASCII(to)
Victor Stinnerf42dc442011-10-02 23:33:16 +02001011 && to_kind == PyUnicode_1BYTE_KIND
Victor Stinnera3b334d2011-10-03 13:53:37 +02001012 && !PyUnicode_IS_ASCII(from)))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001013 {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001014 /* slow path to check for character overflow */
1015 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1016 Py_UCS4 ch, maxchar;
1017 Py_ssize_t i;
1018
1019 maxchar = 0;
1020 invalid_kinds = 0;
1021 for (i=0; i < how_many; i++) {
1022 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1023 if (ch > maxchar) {
1024 maxchar = ch;
1025 if (maxchar > to_maxchar) {
1026 invalid_kinds = 1;
1027 break;
1028 }
1029 }
1030 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1031 }
1032 }
1033 else
1034 invalid_kinds = 1;
1035 if (invalid_kinds) {
Victor Stinner01698042011-10-04 00:04:26 +02001036 PyErr_Format(PyExc_SystemError,
Victor Stinnerf42dc442011-10-02 23:33:16 +02001037 "Cannot copy %s characters "
1038 "into a string of %s characters",
1039 unicode_kind_name(from),
1040 unicode_kind_name(to));
Victor Stinnera0702ab2011-09-29 14:14:38 +02001041 return -1;
1042 }
1043 }
1044 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001045}
1046
Victor Stinner17222162011-09-28 22:15:37 +02001047/* Find the maximum code point and count the number of surrogate pairs so a
1048 correct string length can be computed before converting a string to UCS4.
1049 This function counts single surrogates as a character and not as a pair.
1050
1051 Return 0 on success, or -1 on error. */
1052static int
1053find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1054 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001055{
1056 const wchar_t *iter;
1057
Victor Stinnerc53be962011-10-02 21:33:54 +02001058 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001059 if (num_surrogates == NULL || maxchar == NULL) {
1060 PyErr_SetString(PyExc_SystemError,
1061 "unexpected NULL arguments to "
1062 "PyUnicode_FindMaxCharAndNumSurrogatePairs");
1063 return -1;
1064 }
1065
1066 *num_surrogates = 0;
1067 *maxchar = 0;
1068
1069 for (iter = begin; iter < end; ) {
1070 if (*iter > *maxchar)
1071 *maxchar = *iter;
1072#if SIZEOF_WCHAR_T == 2
1073 if (*iter >= 0xD800 && *iter <= 0xDBFF
1074 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
1075 {
1076 Py_UCS4 surrogate_val;
1077 surrogate_val = (((iter[0] & 0x3FF)<<10)
1078 | (iter[1] & 0x3FF)) + 0x10000;
1079 ++(*num_surrogates);
1080 if (surrogate_val > *maxchar)
1081 *maxchar = surrogate_val;
1082 iter += 2;
1083 }
1084 else
1085 iter++;
1086#else
1087 iter++;
1088#endif
1089 }
1090 return 0;
1091}
1092
1093#ifdef Py_DEBUG
1094int unicode_ready_calls = 0;
1095#endif
1096
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001097static int
1098unicode_ready(PyObject **p_obj, int replace)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001099{
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001100 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001101 wchar_t *end;
1102 Py_UCS4 maxchar = 0;
1103 Py_ssize_t num_surrogates;
1104#if SIZEOF_WCHAR_T == 2
1105 Py_ssize_t length_wo_surrogates;
1106#endif
1107
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001108 assert(p_obj != NULL);
1109 unicode = (PyUnicodeObject *)*p_obj;
1110
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001111 /* _PyUnicode_Ready() is only intented for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001112 strings were created using _PyObject_New() and where no canonical
1113 representation (the str field) has been set yet aka strings
1114 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001115 assert(_PyUnicode_CHECK(unicode));
1116 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001117 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001118 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001119 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001120 /* Actually, it should neither be interned nor be anything else: */
1121 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001122
1123#ifdef Py_DEBUG
1124 ++unicode_ready_calls;
1125#endif
1126
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001127#ifdef Py_DEBUG
1128 assert(!replace || Py_REFCNT(unicode) == 1);
1129#else
1130 if (replace && Py_REFCNT(unicode) != 1)
1131 replace = 0;
1132#endif
1133 if (replace) {
1134 Py_ssize_t len = _PyUnicode_WSTR_LENGTH(unicode);
1135 wchar_t *wstr = _PyUnicode_WSTR(unicode);
1136 /* Optimization for empty strings */
1137 if (len == 0) {
1138 Py_INCREF(unicode_empty);
1139 Py_DECREF(*p_obj);
1140 *p_obj = unicode_empty;
1141 return 0;
1142 }
1143 if (len == 1 && wstr[0] < 256) {
1144 PyObject *latin1_char = get_latin1_char((unsigned char)wstr[0]);
1145 if (latin1_char == NULL)
1146 return -1;
1147 Py_DECREF(*p_obj);
1148 *p_obj = latin1_char;
1149 return 0;
1150 }
1151 }
1152
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001153 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001154 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001155 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001156 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001157
1158 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001159 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1160 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001161 PyErr_NoMemory();
1162 return -1;
1163 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001164 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001165 _PyUnicode_WSTR(unicode), end,
1166 PyUnicode_1BYTE_DATA(unicode));
1167 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1168 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1169 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1170 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001171 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001172 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001173 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001174 }
1175 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001176 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001177 _PyUnicode_UTF8(unicode) = NULL;
1178 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001179 }
1180 PyObject_FREE(_PyUnicode_WSTR(unicode));
1181 _PyUnicode_WSTR(unicode) = NULL;
1182 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1183 }
1184 /* In this case we might have to convert down from 4-byte native
1185 wchar_t to 2-byte unicode. */
1186 else if (maxchar < 65536) {
1187 assert(num_surrogates == 0 &&
1188 "FindMaxCharAndNumSurrogatePairs() messed up");
1189
Victor Stinner506f5922011-09-28 22:34:18 +02001190#if SIZEOF_WCHAR_T == 2
1191 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001192 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001193 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1194 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1195 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001196 _PyUnicode_UTF8(unicode) = NULL;
1197 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001198#else
1199 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001200 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001201 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001202 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001203 PyErr_NoMemory();
1204 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001205 }
Victor Stinner506f5922011-09-28 22:34:18 +02001206 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1207 _PyUnicode_WSTR(unicode), end,
1208 PyUnicode_2BYTE_DATA(unicode));
1209 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1210 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1211 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001212 _PyUnicode_UTF8(unicode) = NULL;
1213 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001214 PyObject_FREE(_PyUnicode_WSTR(unicode));
1215 _PyUnicode_WSTR(unicode) = NULL;
1216 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1217#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001218 }
1219 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1220 else {
1221#if SIZEOF_WCHAR_T == 2
1222 /* in case the native representation is 2-bytes, we need to allocate a
1223 new normalized 4-byte version. */
1224 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001225 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1226 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001227 PyErr_NoMemory();
1228 return -1;
1229 }
1230 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1231 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001232 _PyUnicode_UTF8(unicode) = NULL;
1233 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001234 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1235 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001236 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001237 PyObject_FREE(_PyUnicode_WSTR(unicode));
1238 _PyUnicode_WSTR(unicode) = NULL;
1239 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1240#else
1241 assert(num_surrogates == 0);
1242
Victor Stinnerc3c74152011-10-02 20:39:55 +02001243 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001244 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001245 _PyUnicode_UTF8(unicode) = NULL;
1246 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001247 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1248#endif
1249 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1250 }
1251 _PyUnicode_STATE(unicode).ready = 1;
1252 return 0;
1253}
1254
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001255int
1256_PyUnicode_ReadyReplace(PyObject **op)
1257{
1258 return unicode_ready(op, 1);
1259}
1260
1261int
1262_PyUnicode_Ready(PyObject *op)
1263{
1264 return unicode_ready(&op, 0);
1265}
1266
Alexander Belopolsky40018472011-02-26 01:02:56 +00001267static void
1268unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001269{
Walter Dörwald16807132007-05-25 13:52:07 +00001270 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001271 case SSTATE_NOT_INTERNED:
1272 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001273
Benjamin Peterson29060642009-01-31 22:14:21 +00001274 case SSTATE_INTERNED_MORTAL:
1275 /* revive dead object temporarily for DelItem */
1276 Py_REFCNT(unicode) = 3;
1277 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
1278 Py_FatalError(
1279 "deletion of interned string failed");
1280 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001281
Benjamin Peterson29060642009-01-31 22:14:21 +00001282 case SSTATE_INTERNED_IMMORTAL:
1283 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001284
Benjamin Peterson29060642009-01-31 22:14:21 +00001285 default:
1286 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001287 }
1288
Victor Stinner03490912011-10-03 23:45:12 +02001289 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001290 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001291 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001292 PyObject_DEL(_PyUnicode_UTF8(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001293
1294 if (PyUnicode_IS_COMPACT(unicode)) {
1295 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001296 }
1297 else {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001298 if (_PyUnicode_DATA_ANY(unicode))
1299 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Benjamin Peterson29060642009-01-31 22:14:21 +00001300 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001301 }
1302}
1303
Alexander Belopolsky40018472011-02-26 01:02:56 +00001304static int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001305unicode_resizable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001306{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001307 if (Py_REFCNT(unicode) != 1)
1308 return 0;
1309 if (PyUnicode_CHECK_INTERNED(unicode))
1310 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001311 assert (unicode != unicode_empty);
1312#ifdef Py_DEBUG
1313 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND
1314 && PyUnicode_GET_LENGTH(unicode) == 1)
1315 {
1316 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001317 if (ch < 256 && unicode_latin1[ch] == unicode)
1318 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001319 }
Victor Stinner77bb47b2011-10-03 20:06:05 +02001320#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001321 return 1;
1322}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001323
Victor Stinnerfe226c02011-10-03 03:52:20 +02001324static int
1325unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1326{
1327 PyObject *unicode;
1328 Py_ssize_t old_length;
1329
1330 assert(p_unicode != NULL);
1331 unicode = *p_unicode;
1332
1333 assert(unicode != NULL);
1334 assert(PyUnicode_Check(unicode));
1335 assert(0 <= length);
1336
Victor Stinner910337b2011-10-03 03:20:16 +02001337 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001338 old_length = PyUnicode_WSTR_LENGTH(unicode);
1339 else
1340 old_length = PyUnicode_GET_LENGTH(unicode);
1341 if (old_length == length)
1342 return 0;
1343
Victor Stinnerfe226c02011-10-03 03:52:20 +02001344 if (!unicode_resizable(unicode)) {
1345 PyObject *copy = resize_copy(unicode, length);
1346 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001347 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001348 Py_DECREF(*p_unicode);
1349 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001350 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001351 }
1352
Victor Stinnerfe226c02011-10-03 03:52:20 +02001353 if (PyUnicode_IS_COMPACT(unicode)) {
1354 *p_unicode = resize_compact(unicode, length);
1355 if (*p_unicode == NULL)
1356 return -1;
Victor Stinner95663112011-10-04 01:03:50 +02001357 _PyUnicode_CHECK(*p_unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001358 return 0;
1359 } else
1360 return resize_inplace((PyUnicodeObject*)unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001361}
1362
Alexander Belopolsky40018472011-02-26 01:02:56 +00001363int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001364PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001365{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001366 PyObject *unicode;
1367 if (p_unicode == NULL) {
1368 PyErr_BadInternalCall();
1369 return -1;
1370 }
1371 unicode = *p_unicode;
1372 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0
1373 || _PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND)
1374 {
1375 PyErr_BadInternalCall();
1376 return -1;
1377 }
1378 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001379}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001380
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001381static PyObject*
1382get_latin1_char(unsigned char ch)
1383{
Victor Stinnera464fc12011-10-02 20:39:30 +02001384 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001385 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001386 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001387 if (!unicode)
1388 return NULL;
1389 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
1390 unicode_latin1[ch] = unicode;
1391 }
1392 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001393 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001394}
1395
Alexander Belopolsky40018472011-02-26 01:02:56 +00001396PyObject *
1397PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001398{
1399 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001400 Py_UCS4 maxchar = 0;
1401 Py_ssize_t num_surrogates;
1402
1403 if (u == NULL)
1404 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001405
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001406 /* If the Unicode data is known at construction time, we can apply
1407 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001408
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001409 /* Optimization for empty strings */
1410 if (size == 0 && unicode_empty != NULL) {
1411 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001412 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001413 }
Tim Petersced69f82003-09-16 20:30:58 +00001414
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001415 /* Single character Unicode objects in the Latin-1 range are
1416 shared when using this constructor */
1417 if (size == 1 && *u < 256)
1418 return get_latin1_char((unsigned char)*u);
1419
1420 /* If not empty and not single character, copy the Unicode data
1421 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001422 if (find_maxchar_surrogates(u, u + size,
1423 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001424 return NULL;
1425
1426 unicode = (PyUnicodeObject *) PyUnicode_New(size - num_surrogates,
1427 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001428 if (!unicode)
1429 return NULL;
1430
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001431 switch (PyUnicode_KIND(unicode)) {
1432 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001433 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001434 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1435 break;
1436 case PyUnicode_2BYTE_KIND:
1437#if Py_UNICODE_SIZE == 2
1438 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1439#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001440 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001441 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1442#endif
1443 break;
1444 case PyUnicode_4BYTE_KIND:
1445#if SIZEOF_WCHAR_T == 2
1446 /* This is the only case which has to process surrogates, thus
1447 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001448 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001449#else
1450 assert(num_surrogates == 0);
1451 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1452#endif
1453 break;
1454 default:
1455 assert(0 && "Impossible state");
1456 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001457
1458 return (PyObject *)unicode;
1459}
1460
Alexander Belopolsky40018472011-02-26 01:02:56 +00001461PyObject *
1462PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001463{
1464 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +00001465
Benjamin Peterson14339b62009-01-31 16:36:08 +00001466 if (size < 0) {
1467 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001468 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001469 return NULL;
1470 }
Christian Heimes33fe8092008-04-13 13:53:33 +00001471
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001472 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +00001473 some optimizations which share commonly used objects.
1474 Also, this means the input must be UTF-8, so fall back to the
1475 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001476 if (u != NULL) {
1477
Benjamin Peterson29060642009-01-31 22:14:21 +00001478 /* Optimization for empty strings */
1479 if (size == 0 && unicode_empty != NULL) {
1480 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001481 return unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001482 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001483
1484 /* Single characters are shared when using this constructor.
1485 Restrict to ASCII, since the input must be UTF-8. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001486 if (size == 1 && Py_CHARMASK(*u) < 128)
1487 return get_latin1_char(Py_CHARMASK(*u));
Martin v. Löwis9c121062007-08-05 20:26:11 +00001488
1489 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001490 }
1491
Walter Dörwald55507312007-05-18 13:12:10 +00001492 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001493 if (!unicode)
1494 return NULL;
1495
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001496 return (PyObject *)unicode;
1497}
1498
Alexander Belopolsky40018472011-02-26 01:02:56 +00001499PyObject *
1500PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001501{
1502 size_t size = strlen(u);
1503 if (size > PY_SSIZE_T_MAX) {
1504 PyErr_SetString(PyExc_OverflowError, "input too long");
1505 return NULL;
1506 }
1507
1508 return PyUnicode_FromStringAndSize(u, size);
1509}
1510
Victor Stinnere57b1c02011-09-28 22:20:48 +02001511static PyObject*
1512_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001513{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001514 PyObject *res;
1515 unsigned char max = 127;
1516 Py_ssize_t i;
1517 for (i = 0; i < size; i++) {
1518 if (u[i] & 0x80) {
1519 max = 255;
1520 break;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001521 }
1522 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001523 res = PyUnicode_New(size, max);
1524 if (!res)
1525 return NULL;
1526 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
1527 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001528}
1529
Victor Stinnere57b1c02011-09-28 22:20:48 +02001530static PyObject*
1531_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001532{
1533 PyObject *res;
1534 Py_UCS2 max = 0;
1535 Py_ssize_t i;
1536 for (i = 0; i < size; i++)
1537 if (u[i] > max)
1538 max = u[i];
1539 res = PyUnicode_New(size, max);
1540 if (!res)
1541 return NULL;
1542 if (max >= 256)
1543 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
1544 else
1545 for (i = 0; i < size; i++)
1546 PyUnicode_1BYTE_DATA(res)[i] = (Py_UCS1)u[i];
1547 return res;
1548}
1549
Victor Stinnere57b1c02011-09-28 22:20:48 +02001550static PyObject*
1551_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001552{
1553 PyObject *res;
1554 Py_UCS4 max = 0;
1555 Py_ssize_t i;
1556 for (i = 0; i < size; i++)
1557 if (u[i] > max)
1558 max = u[i];
1559 res = PyUnicode_New(size, max);
1560 if (!res)
1561 return NULL;
1562 if (max >= 0x10000)
1563 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
1564 else {
1565 int kind = PyUnicode_KIND(res);
1566 void *data = PyUnicode_DATA(res);
1567 for (i = 0; i < size; i++)
1568 PyUnicode_WRITE(kind, data, i, u[i]);
1569 }
1570 return res;
1571}
1572
1573PyObject*
1574PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1575{
1576 switch(kind) {
1577 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001578 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001579 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001580 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001581 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001582 return _PyUnicode_FromUCS4(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001583 }
Victor Stinner01698042011-10-04 00:04:26 +02001584 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001585 return NULL;
1586}
1587
Victor Stinner034f6cf2011-09-30 02:26:44 +02001588PyObject*
1589PyUnicode_Copy(PyObject *unicode)
1590{
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001591 Py_ssize_t size;
1592 PyObject *copy;
1593 void *data;
1594
Victor Stinner034f6cf2011-09-30 02:26:44 +02001595 if (!PyUnicode_Check(unicode)) {
1596 PyErr_BadInternalCall();
1597 return NULL;
1598 }
1599 if (PyUnicode_READY(unicode))
1600 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001601
1602 size = PyUnicode_GET_LENGTH(unicode);
1603 copy = PyUnicode_New(size, PyUnicode_MAX_CHAR_VALUE(unicode));
1604 if (!copy)
1605 return NULL;
1606 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1607
1608 data = PyUnicode_DATA(unicode);
1609 switch (PyUnicode_KIND(unicode))
1610 {
1611 case PyUnicode_1BYTE_KIND:
1612 memcpy(PyUnicode_1BYTE_DATA(copy), data, size);
1613 break;
1614 case PyUnicode_2BYTE_KIND:
1615 memcpy(PyUnicode_2BYTE_DATA(copy), data, sizeof(Py_UCS2) * size);
1616 break;
1617 case PyUnicode_4BYTE_KIND:
1618 memcpy(PyUnicode_4BYTE_DATA(copy), data, sizeof(Py_UCS4) * size);
1619 break;
1620 default:
1621 assert(0);
1622 break;
1623 }
1624 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02001625}
1626
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001627
Victor Stinnerbc603d12011-10-02 01:00:40 +02001628/* Widen Unicode objects to larger buffers. Don't write terminating null
1629 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001630
1631void*
1632_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1633{
Victor Stinnerbc603d12011-10-02 01:00:40 +02001634 Py_ssize_t len;
1635 void *result;
1636 unsigned int skind;
1637
1638 if (PyUnicode_READY(s))
1639 return NULL;
1640
1641 len = PyUnicode_GET_LENGTH(s);
1642 skind = PyUnicode_KIND(s);
1643 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02001644 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001645 return NULL;
1646 }
1647 switch(kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02001648 case PyUnicode_2BYTE_KIND:
1649 result = PyMem_Malloc(len * sizeof(Py_UCS2));
1650 if (!result)
1651 return PyErr_NoMemory();
1652 assert(skind == PyUnicode_1BYTE_KIND);
1653 _PyUnicode_CONVERT_BYTES(
1654 Py_UCS1, Py_UCS2,
1655 PyUnicode_1BYTE_DATA(s),
1656 PyUnicode_1BYTE_DATA(s) + len,
1657 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001658 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001659 case PyUnicode_4BYTE_KIND:
1660 result = PyMem_Malloc(len * sizeof(Py_UCS4));
1661 if (!result)
1662 return PyErr_NoMemory();
1663 if (skind == PyUnicode_2BYTE_KIND) {
1664 _PyUnicode_CONVERT_BYTES(
1665 Py_UCS2, Py_UCS4,
1666 PyUnicode_2BYTE_DATA(s),
1667 PyUnicode_2BYTE_DATA(s) + len,
1668 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001669 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02001670 else {
1671 assert(skind == PyUnicode_1BYTE_KIND);
1672 _PyUnicode_CONVERT_BYTES(
1673 Py_UCS1, Py_UCS4,
1674 PyUnicode_1BYTE_DATA(s),
1675 PyUnicode_1BYTE_DATA(s) + len,
1676 result);
1677 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001678 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001679 default:
1680 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001681 }
Victor Stinner01698042011-10-04 00:04:26 +02001682 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001683 return NULL;
1684}
1685
1686static Py_UCS4*
1687as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1688 int copy_null)
1689{
1690 int kind;
1691 void *data;
1692 Py_ssize_t len, targetlen;
1693 if (PyUnicode_READY(string) == -1)
1694 return NULL;
1695 kind = PyUnicode_KIND(string);
1696 data = PyUnicode_DATA(string);
1697 len = PyUnicode_GET_LENGTH(string);
1698 targetlen = len;
1699 if (copy_null)
1700 targetlen++;
1701 if (!target) {
1702 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
1703 PyErr_NoMemory();
1704 return NULL;
1705 }
1706 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
1707 if (!target) {
1708 PyErr_NoMemory();
1709 return NULL;
1710 }
1711 }
1712 else {
1713 if (targetsize < targetlen) {
1714 PyErr_Format(PyExc_SystemError,
1715 "string is longer than the buffer");
1716 if (copy_null && 0 < targetsize)
1717 target[0] = 0;
1718 return NULL;
1719 }
1720 }
1721 if (kind != PyUnicode_4BYTE_KIND) {
1722 Py_ssize_t i;
1723 for (i = 0; i < len; i++)
1724 target[i] = PyUnicode_READ(kind, data, i);
1725 }
1726 else
1727 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
1728 if (copy_null)
1729 target[len] = 0;
1730 return target;
1731}
1732
1733Py_UCS4*
1734PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1735 int copy_null)
1736{
1737 if (target == NULL || targetsize < 1) {
1738 PyErr_BadInternalCall();
1739 return NULL;
1740 }
1741 return as_ucs4(string, target, targetsize, copy_null);
1742}
1743
1744Py_UCS4*
1745PyUnicode_AsUCS4Copy(PyObject *string)
1746{
1747 return as_ucs4(string, NULL, 0, 1);
1748}
1749
1750#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00001751
Alexander Belopolsky40018472011-02-26 01:02:56 +00001752PyObject *
1753PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001754{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001755 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +00001756 if (size == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001757 return PyUnicode_New(0, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00001758 PyErr_BadInternalCall();
1759 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001760 }
1761
Martin v. Löwis790465f2008-04-05 20:41:37 +00001762 if (size == -1) {
1763 size = wcslen(w);
1764 }
1765
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001766 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001767}
1768
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001769#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00001770
Walter Dörwald346737f2007-05-31 10:44:43 +00001771static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001772makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
1773 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00001774{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001775 *fmt++ = '%';
1776 if (width) {
1777 if (zeropad)
1778 *fmt++ = '0';
1779 fmt += sprintf(fmt, "%d", width);
1780 }
1781 if (precision)
1782 fmt += sprintf(fmt, ".%d", precision);
1783 if (longflag)
1784 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001785 else if (longlongflag) {
1786 /* longlongflag should only ever be nonzero on machines with
1787 HAVE_LONG_LONG defined */
1788#ifdef HAVE_LONG_LONG
1789 char *f = PY_FORMAT_LONG_LONG;
1790 while (*f)
1791 *fmt++ = *f++;
1792#else
1793 /* we shouldn't ever get here */
1794 assert(0);
1795 *fmt++ = 'l';
1796#endif
1797 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001798 else if (size_tflag) {
1799 char *f = PY_FORMAT_SIZE_T;
1800 while (*f)
1801 *fmt++ = *f++;
1802 }
1803 *fmt++ = c;
1804 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00001805}
1806
/* Helper for PyUnicode_FromFormatV().

   `f` points at the '%' that starts a conversion.  Parses the optional
   width, ".precision" and length modifier ("l", "ll" when HAVE_LONG_LONG
   is defined, or "z"; each only when followed by d, u or i) and stores
   the results through the non-NULL output pointers.  Returns a pointer
   to the character the caller should treat as the conversion type. */

static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    const char *cursor = f + 1;     /* step over the '%' */
    int width = 0;
    int precision = 0;
    int longflag = 0;
    int longlongflag = 0;
    int size_tflag = 0;

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    for (; Py_ISDIGIT((unsigned)*cursor); cursor++)
        width = (width*10) + *cursor - '0';

    if (*cursor == '.') {
        cursor++;
        for (; Py_ISDIGIT((unsigned)*cursor); cursor++)
            precision = (precision*10) + *cursor - '0';
        if (*cursor == '%') {
            /* "%.3%s" => cursor points to "3" */
            cursor--;
        }
    }
    if (*cursor == '\0') {
        /* bogus format "%.1" => go backward, cursor points to "1" */
        cursor--;
    }

    if (p_width != NULL)
        *p_width = width;
    if (p_precision != NULL)
        *p_precision = precision;

    /* Handle %ld, %lu, %li and (with HAVE_LONG_LONG) %lld, %llu, %lli. */
    if (*cursor == 'l') {
        if (cursor[1] == 'd' || cursor[1] == 'u' || cursor[1] == 'i') {
            longflag = 1;
            cursor += 1;
        }
#ifdef HAVE_LONG_LONG
        else if (cursor[1] == 'l' &&
                 (cursor[2] == 'd' || cursor[2] == 'u' || cursor[2] == 'i')) {
            longlongflag = 1;
            cursor += 2;
        }
#endif
    }
    /* handle the size_t flag. */
    else if (*cursor == 'z' &&
             (cursor[1] == 'd' || cursor[1] == 'u' || cursor[1] == 'i')) {
        size_tflag = 1;
        cursor += 1;
    }

    if (p_longflag != NULL)
        *p_longflag = longflag;
    if (p_longlongflag != NULL)
        *p_longlongflag = longlongflag;
    if (p_size_tflag != NULL)
        *p_size_tflag = size_tflag;
    return cursor;
}
1871
/* maximum number of characters required for output of %ld.  21 characters
   allows for 64-bit integers (in decimal) and an optional sign. */
#define MAX_LONG_CHARS 21
/* maximum number of characters required for output of %lld.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is an upper bound for log10(256).
   In integer arithmetic ceil(n*53/22) is 1 + (n*53-1)/22; adding the
   sign character gives the leading 2 below. */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
1879
Walter Dörwaldd2034312007-05-18 16:29:38 +00001880PyObject *
1881PyUnicode_FromFormatV(const char *format, va_list vargs)
1882{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001883 va_list count;
1884 Py_ssize_t callcount = 0;
1885 PyObject **callresults = NULL;
1886 PyObject **callresult = NULL;
1887 Py_ssize_t n = 0;
1888 int width = 0;
1889 int precision = 0;
1890 int zeropad;
1891 const char* f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001892 PyUnicodeObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001893 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001894 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001895 Py_UCS4 maxchar = 127; /* result is ASCII by default */
1896 Py_UCS4 argmaxchar;
1897 Py_ssize_t numbersize = 0;
1898 char *numberresults = NULL;
1899 char *numberresult = NULL;
1900 Py_ssize_t i;
1901 int kind;
1902 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001903
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001904 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001905 /* step 1: count the number of %S/%R/%A/%s format specifications
1906 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
1907 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001908 * result in an array)
1909 * also esimate a upper bound for all the number formats in the string,
1910 * numbers will be formated in step 3 and be keept in a '\0'-separated
1911 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00001912 for (f = format; *f; f++) {
1913 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00001914 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001915 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
1916 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
1917 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
1918 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001919
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001920 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001921#ifdef HAVE_LONG_LONG
1922 if (longlongflag) {
1923 if (width < MAX_LONG_LONG_CHARS)
1924 width = MAX_LONG_LONG_CHARS;
1925 }
1926 else
1927#endif
1928 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
1929 including sign. Decimal takes the most space. This
1930 isn't enough for octal. If a width is specified we
1931 need more (which we allocate later). */
1932 if (width < MAX_LONG_CHARS)
1933 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001934
1935 /* account for the size + '\0' to separate numbers
1936 inside of the numberresults buffer */
1937 numbersize += (width + 1);
1938 }
1939 }
1940 else if ((unsigned char)*f > 127) {
1941 PyErr_Format(PyExc_ValueError,
1942 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
1943 "string, got a non-ASCII byte: 0x%02x",
1944 (unsigned char)*f);
1945 return NULL;
1946 }
1947 }
1948 /* step 2: allocate memory for the results of
1949 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
1950 if (callcount) {
1951 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
1952 if (!callresults) {
1953 PyErr_NoMemory();
1954 return NULL;
1955 }
1956 callresult = callresults;
1957 }
1958 /* step 2.5: allocate memory for the results of formating numbers */
1959 if (numbersize) {
1960 numberresults = PyObject_Malloc(numbersize);
1961 if (!numberresults) {
1962 PyErr_NoMemory();
1963 goto fail;
1964 }
1965 numberresult = numberresults;
1966 }
1967
1968 /* step 3: format numbers and figure out how large a buffer we need */
1969 for (f = format; *f; f++) {
1970 if (*f == '%') {
1971 const char* p;
1972 int longflag;
1973 int longlongflag;
1974 int size_tflag;
1975 int numprinted;
1976
1977 p = f;
1978 zeropad = (f[1] == '0');
1979 f = parse_format_flags(f, &width, &precision,
1980 &longflag, &longlongflag, &size_tflag);
1981 switch (*f) {
1982 case 'c':
1983 {
1984 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001985 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001986 n++;
1987 break;
1988 }
1989 case '%':
1990 n++;
1991 break;
1992 case 'i':
1993 case 'd':
1994 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1995 width, precision, *f);
1996 if (longflag)
1997 numprinted = sprintf(numberresult, fmt,
1998 va_arg(count, long));
1999#ifdef HAVE_LONG_LONG
2000 else if (longlongflag)
2001 numprinted = sprintf(numberresult, fmt,
2002 va_arg(count, PY_LONG_LONG));
2003#endif
2004 else if (size_tflag)
2005 numprinted = sprintf(numberresult, fmt,
2006 va_arg(count, Py_ssize_t));
2007 else
2008 numprinted = sprintf(numberresult, fmt,
2009 va_arg(count, int));
2010 n += numprinted;
2011 /* advance by +1 to skip over the '\0' */
2012 numberresult += (numprinted + 1);
2013 assert(*(numberresult - 1) == '\0');
2014 assert(*(numberresult - 2) != '\0');
2015 assert(numprinted >= 0);
2016 assert(numberresult <= numberresults + numbersize);
2017 break;
2018 case 'u':
2019 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2020 width, precision, 'u');
2021 if (longflag)
2022 numprinted = sprintf(numberresult, fmt,
2023 va_arg(count, unsigned long));
2024#ifdef HAVE_LONG_LONG
2025 else if (longlongflag)
2026 numprinted = sprintf(numberresult, fmt,
2027 va_arg(count, unsigned PY_LONG_LONG));
2028#endif
2029 else if (size_tflag)
2030 numprinted = sprintf(numberresult, fmt,
2031 va_arg(count, size_t));
2032 else
2033 numprinted = sprintf(numberresult, fmt,
2034 va_arg(count, unsigned int));
2035 n += numprinted;
2036 numberresult += (numprinted + 1);
2037 assert(*(numberresult - 1) == '\0');
2038 assert(*(numberresult - 2) != '\0');
2039 assert(numprinted >= 0);
2040 assert(numberresult <= numberresults + numbersize);
2041 break;
2042 case 'x':
2043 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2044 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2045 n += numprinted;
2046 numberresult += (numprinted + 1);
2047 assert(*(numberresult - 1) == '\0');
2048 assert(*(numberresult - 2) != '\0');
2049 assert(numprinted >= 0);
2050 assert(numberresult <= numberresults + numbersize);
2051 break;
2052 case 'p':
2053 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2054 /* %p is ill-defined: ensure leading 0x. */
2055 if (numberresult[1] == 'X')
2056 numberresult[1] = 'x';
2057 else if (numberresult[1] != 'x') {
2058 memmove(numberresult + 2, numberresult,
2059 strlen(numberresult) + 1);
2060 numberresult[0] = '0';
2061 numberresult[1] = 'x';
2062 numprinted += 2;
2063 }
2064 n += numprinted;
2065 numberresult += (numprinted + 1);
2066 assert(*(numberresult - 1) == '\0');
2067 assert(*(numberresult - 2) != '\0');
2068 assert(numprinted >= 0);
2069 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002070 break;
2071 case 's':
2072 {
2073 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002074 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002075 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
2076 if (!str)
2077 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002078 /* since PyUnicode_DecodeUTF8 returns already flexible
2079 unicode objects, there is no need to call ready on them */
2080 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002081 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002082 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002083 /* Remember the str and switch to the next slot */
2084 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002085 break;
2086 }
2087 case 'U':
2088 {
2089 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002090 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002091 if (PyUnicode_READY(obj) == -1)
2092 goto fail;
2093 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002094 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002095 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002096 break;
2097 }
2098 case 'V':
2099 {
2100 PyObject *obj = va_arg(count, PyObject *);
2101 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002102 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002103 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002104 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002105 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002106 if (PyUnicode_READY(obj) == -1)
2107 goto fail;
2108 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002109 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002110 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002111 *callresult++ = NULL;
2112 }
2113 else {
2114 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
2115 if (!str_obj)
2116 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002117 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002118 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002119 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002120 *callresult++ = str_obj;
2121 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002122 break;
2123 }
2124 case 'S':
2125 {
2126 PyObject *obj = va_arg(count, PyObject *);
2127 PyObject *str;
2128 assert(obj);
2129 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002130 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002131 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002132 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002133 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002134 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002135 /* Remember the str and switch to the next slot */
2136 *callresult++ = str;
2137 break;
2138 }
2139 case 'R':
2140 {
2141 PyObject *obj = va_arg(count, PyObject *);
2142 PyObject *repr;
2143 assert(obj);
2144 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002145 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002146 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002147 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002148 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002149 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002150 /* Remember the repr and switch to the next slot */
2151 *callresult++ = repr;
2152 break;
2153 }
2154 case 'A':
2155 {
2156 PyObject *obj = va_arg(count, PyObject *);
2157 PyObject *ascii;
2158 assert(obj);
2159 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002160 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002161 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002162 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002163 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002164 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002165 /* Remember the repr and switch to the next slot */
2166 *callresult++ = ascii;
2167 break;
2168 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002169 default:
2170 /* if we stumble upon an unknown
2171 formatting code, copy the rest of
2172 the format string to the output
2173 string. (we cannot just skip the
2174 code, since there's no way to know
2175 what's in the argument list) */
2176 n += strlen(p);
2177 goto expand;
2178 }
2179 } else
2180 n++;
2181 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002182 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002183 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002184 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002185 we don't have to resize the string.
2186 There can be no errors beyond this point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002187 string = (PyUnicodeObject *)PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002188 if (!string)
2189 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002190 kind = PyUnicode_KIND(string);
2191 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002192 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002193 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002194
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002195 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002196 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002197 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002198
2199 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002200 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2201 /* checking for == because the last argument could be a empty
2202 string, which causes i to point to end, the assert at the end of
2203 the loop */
2204 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002205
Benjamin Peterson14339b62009-01-31 16:36:08 +00002206 switch (*f) {
2207 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002208 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002209 const int ordinal = va_arg(vargs, int);
2210 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002211 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002212 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002213 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002214 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002215 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002216 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002217 case 'p':
2218 /* unused, since we already have the result */
2219 if (*f == 'p')
2220 (void) va_arg(vargs, void *);
2221 else
2222 (void) va_arg(vargs, int);
2223 /* extract the result from numberresults and append. */
2224 for (; *numberresult; ++i, ++numberresult)
2225 PyUnicode_WRITE(kind, data, i, *numberresult);
2226 /* skip over the separating '\0' */
2227 assert(*numberresult == '\0');
2228 numberresult++;
2229 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002230 break;
2231 case 's':
2232 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002233 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002234 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002235 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002236 size = PyUnicode_GET_LENGTH(*callresult);
2237 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002238 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2239 *callresult, 0,
2240 size) < 0)
2241 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002242 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002243 /* We're done with the unicode()/repr() => forget it */
2244 Py_DECREF(*callresult);
2245 /* switch to next unicode()/repr() result */
2246 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002247 break;
2248 }
2249 case 'U':
2250 {
2251 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002252 Py_ssize_t size;
2253 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2254 size = PyUnicode_GET_LENGTH(obj);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002255 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2256 obj, 0,
2257 size) < 0)
2258 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002259 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002260 break;
2261 }
2262 case 'V':
2263 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002264 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002265 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002266 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002267 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002268 size = PyUnicode_GET_LENGTH(obj);
2269 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002270 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2271 obj, 0,
2272 size) < 0)
2273 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002274 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002275 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002276 size = PyUnicode_GET_LENGTH(*callresult);
2277 assert(PyUnicode_KIND(*callresult) <=
2278 PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002279 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2280 *callresult,
2281 0, size) < 0)
2282 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002283 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002284 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002285 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002286 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002287 break;
2288 }
2289 case 'S':
2290 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002291 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002292 {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002293 /* unused, since we already have the result */
2294 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002295 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002296 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2297 *callresult, 0,
2298 PyUnicode_GET_LENGTH(*callresult)) < 0)
2299 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002300 i += PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002301 /* We're done with the unicode()/repr() => forget it */
2302 Py_DECREF(*callresult);
2303 /* switch to next unicode()/repr() result */
2304 ++callresult;
2305 break;
2306 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002307 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002308 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002309 break;
2310 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002311 for (; *p; ++p, ++i)
2312 PyUnicode_WRITE(kind, data, i, *p);
2313 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002314 goto end;
2315 }
Victor Stinner1205f272010-09-11 00:54:47 +00002316 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002317 else {
2318 assert(i < PyUnicode_GET_LENGTH(string));
2319 PyUnicode_WRITE(kind, data, i++, *f);
2320 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002321 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002322 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002323
Benjamin Peterson29060642009-01-31 22:14:21 +00002324 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002325 if (callresults)
2326 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002327 if (numberresults)
2328 PyObject_Free(numberresults);
2329 return (PyObject *)string;
Benjamin Peterson29060642009-01-31 22:14:21 +00002330 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002331 if (callresults) {
2332 PyObject **callresult2 = callresults;
2333 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002334 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002335 ++callresult2;
2336 }
2337 PyObject_Free(callresults);
2338 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002339 if (numberresults)
2340 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002341 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002342}
2343
Walter Dörwaldd2034312007-05-18 16:29:38 +00002344PyObject *
2345PyUnicode_FromFormat(const char *format, ...)
2346{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002347 PyObject* ret;
2348 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002349
2350#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002351 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002352#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002353 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002354#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002355 ret = PyUnicode_FromFormatV(format, vargs);
2356 va_end(vargs);
2357 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002358}
2359
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002360#ifdef HAVE_WCHAR_H
2361
Victor Stinner5593d8a2010-10-02 11:11:27 +00002362/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2363 convert a Unicode object to a wide character string.
2364
Victor Stinnerd88d9832011-09-06 02:00:05 +02002365 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002366 character) required to convert the unicode object. Ignore size argument.
2367
Victor Stinnerd88d9832011-09-06 02:00:05 +02002368 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002369 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002370 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002371static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00002372unicode_aswidechar(PyUnicodeObject *unicode,
2373 wchar_t *w,
2374 Py_ssize_t size)
2375{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002376 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002377 const wchar_t *wstr;
2378
2379 wstr = PyUnicode_AsUnicodeAndSize((PyObject *)unicode, &res);
2380 if (wstr == NULL)
2381 return -1;
2382
Victor Stinner5593d8a2010-10-02 11:11:27 +00002383 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002384 if (size > res)
2385 size = res + 1;
2386 else
2387 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002388 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002389 return res;
2390 }
2391 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002392 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002393}
2394
2395Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002396PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002397 wchar_t *w,
2398 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002399{
2400 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002401 PyErr_BadInternalCall();
2402 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002403 }
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002404 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002405}
2406
Victor Stinner137c34c2010-09-29 10:25:54 +00002407wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002408PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002409 Py_ssize_t *size)
2410{
2411 wchar_t* buffer;
2412 Py_ssize_t buflen;
2413
2414 if (unicode == NULL) {
2415 PyErr_BadInternalCall();
2416 return NULL;
2417 }
2418
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002419 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002420 if (buflen == -1)
2421 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002422 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002423 PyErr_NoMemory();
2424 return NULL;
2425 }
2426
Victor Stinner137c34c2010-09-29 10:25:54 +00002427 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2428 if (buffer == NULL) {
2429 PyErr_NoMemory();
2430 return NULL;
2431 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002432 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002433 if (buflen == -1)
2434 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002435 if (size != NULL)
2436 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002437 return buffer;
2438}
2439
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002440#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002441
Alexander Belopolsky40018472011-02-26 01:02:56 +00002442PyObject *
2443PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002444{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002445 PyObject *v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002446 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002447 PyErr_SetString(PyExc_ValueError,
2448 "chr() arg not in range(0x110000)");
2449 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002450 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002451
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002452 if (ordinal < 256)
2453 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002454
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002455 v = PyUnicode_New(1, ordinal);
2456 if (v == NULL)
2457 return NULL;
2458 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
2459 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002460}
2461
Alexander Belopolsky40018472011-02-26 01:02:56 +00002462PyObject *
2463PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002464{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002465 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002466 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002467 if (PyUnicode_CheckExact(obj)) {
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002468 if (PyUnicode_READY(obj))
2469 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002470 Py_INCREF(obj);
2471 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002472 }
2473 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002474 /* For a Unicode subtype that's not a Unicode object,
2475 return a true Unicode object with the same data. */
Victor Stinner2219e0a2011-10-01 01:16:59 +02002476 return PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002477 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002478 PyErr_Format(PyExc_TypeError,
2479 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002480 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002481 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002482}
2483
Alexander Belopolsky40018472011-02-26 01:02:56 +00002484PyObject *
2485PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002486 const char *encoding,
2487 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002488{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002489 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002490 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002491
Guido van Rossumd57fd912000-03-10 22:53:23 +00002492 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002493 PyErr_BadInternalCall();
2494 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002495 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002496
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002497 /* Decoding bytes objects is the most common case and should be fast */
2498 if (PyBytes_Check(obj)) {
2499 if (PyBytes_GET_SIZE(obj) == 0) {
2500 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002501 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002502 }
2503 else {
2504 v = PyUnicode_Decode(
2505 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2506 encoding, errors);
2507 }
2508 return v;
2509 }
2510
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002511 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002512 PyErr_SetString(PyExc_TypeError,
2513 "decoding str is not supported");
2514 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002515 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002516
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002517 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2518 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2519 PyErr_Format(PyExc_TypeError,
2520 "coercing to str: need bytes, bytearray "
2521 "or buffer-like object, %.80s found",
2522 Py_TYPE(obj)->tp_name);
2523 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002524 }
Tim Petersced69f82003-09-16 20:30:58 +00002525
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002526 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002527 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002528 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002529 }
Tim Petersced69f82003-09-16 20:30:58 +00002530 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002531 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002532
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002533 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002534 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002535}
2536
/* Write a normalized copy of `encoding` into `lower`: upper-case letters
   become lower case and '_' becomes '-', so that e.g. UTF_8 is recognized.

   `lower` has room for `lower_len` chars including the terminating NUL.
   Return 1 on success, 0 if the encoding name is longer than
   lower_len - 1 characters (in which case `lower` is left unterminated). */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    /* Index of the last slot in `lower`, reserved for the terminator. */
    const size_t last = lower_len - 1;
    size_t pos;

    for (pos = 0; encoding[pos] != '\0'; pos++) {
        const char ch = encoding[pos];

        if (pos == last)
            return 0;

        if (Py_ISUPPER(ch))
            lower[pos] = Py_TOLOWER(ch);
        else if (ch == '_')
            lower[pos] = '-';
        else
            lower[pos] = ch;
    }
    lower[pos] = '\0';
    return 1;
}
2569
Alexander Belopolsky40018472011-02-26 01:02:56 +00002570PyObject *
2571PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002572 Py_ssize_t size,
2573 const char *encoding,
2574 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002575{
2576 PyObject *buffer = NULL, *unicode;
2577 Py_buffer info;
2578 char lower[11]; /* Enough for any encoding shortcut */
2579
2580 if (encoding == NULL)
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002581 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00002582
2583 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002584 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002585 if ((strcmp(lower, "utf-8") == 0) ||
2586 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002587 return PyUnicode_DecodeUTF8(s, size, errors);
2588 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002589 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002590 (strcmp(lower, "iso-8859-1") == 0))
2591 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002592#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002593 else if (strcmp(lower, "mbcs") == 0)
2594 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002595#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002596 else if (strcmp(lower, "ascii") == 0)
2597 return PyUnicode_DecodeASCII(s, size, errors);
2598 else if (strcmp(lower, "utf-16") == 0)
2599 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2600 else if (strcmp(lower, "utf-32") == 0)
2601 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2602 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002603
2604 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002605 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002606 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002607 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002608 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002609 if (buffer == NULL)
2610 goto onError;
2611 unicode = PyCodec_Decode(buffer, encoding, errors);
2612 if (unicode == NULL)
2613 goto onError;
2614 if (!PyUnicode_Check(unicode)) {
2615 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002616 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002617 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002618 Py_DECREF(unicode);
2619 goto onError;
2620 }
2621 Py_DECREF(buffer);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02002622 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002623 Py_DECREF(unicode);
2624 return NULL;
2625 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002626 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002627
Benjamin Peterson29060642009-01-31 22:14:21 +00002628 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002629 Py_XDECREF(buffer);
2630 return NULL;
2631}
2632
Alexander Belopolsky40018472011-02-26 01:02:56 +00002633PyObject *
2634PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002635 const char *encoding,
2636 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002637{
2638 PyObject *v;
2639
2640 if (!PyUnicode_Check(unicode)) {
2641 PyErr_BadArgument();
2642 goto onError;
2643 }
2644
2645 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002646 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002647
2648 /* Decode via the codec registry */
2649 v = PyCodec_Decode(unicode, encoding, errors);
2650 if (v == NULL)
2651 goto onError;
2652 return v;
2653
Benjamin Peterson29060642009-01-31 22:14:21 +00002654 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002655 return NULL;
2656}
2657
Alexander Belopolsky40018472011-02-26 01:02:56 +00002658PyObject *
2659PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002660 const char *encoding,
2661 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002662{
2663 PyObject *v;
2664
2665 if (!PyUnicode_Check(unicode)) {
2666 PyErr_BadArgument();
2667 goto onError;
2668 }
2669
2670 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002671 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002672
2673 /* Decode via the codec registry */
2674 v = PyCodec_Decode(unicode, encoding, errors);
2675 if (v == NULL)
2676 goto onError;
2677 if (!PyUnicode_Check(v)) {
2678 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002679 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002680 Py_TYPE(v)->tp_name);
2681 Py_DECREF(v);
2682 goto onError;
2683 }
2684 return v;
2685
Benjamin Peterson29060642009-01-31 22:14:21 +00002686 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002687 return NULL;
2688}
2689
Alexander Belopolsky40018472011-02-26 01:02:56 +00002690PyObject *
2691PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002692 Py_ssize_t size,
2693 const char *encoding,
2694 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002695{
2696 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002697
Guido van Rossumd57fd912000-03-10 22:53:23 +00002698 unicode = PyUnicode_FromUnicode(s, size);
2699 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002700 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002701 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
2702 Py_DECREF(unicode);
2703 return v;
2704}
2705
Alexander Belopolsky40018472011-02-26 01:02:56 +00002706PyObject *
2707PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002708 const char *encoding,
2709 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002710{
2711 PyObject *v;
2712
2713 if (!PyUnicode_Check(unicode)) {
2714 PyErr_BadArgument();
2715 goto onError;
2716 }
2717
2718 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002719 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002720
2721 /* Encode via the codec registry */
2722 v = PyCodec_Encode(unicode, encoding, errors);
2723 if (v == NULL)
2724 goto onError;
2725 return v;
2726
Benjamin Peterson29060642009-01-31 22:14:21 +00002727 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002728 return NULL;
2729}
2730
Victor Stinnerad158722010-10-27 00:25:46 +00002731PyObject *
2732PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00002733{
Victor Stinner99b95382011-07-04 14:23:54 +02002734#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002735 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2736 PyUnicode_GET_SIZE(unicode),
2737 NULL);
2738#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002739 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00002740#else
Victor Stinner793b5312011-04-27 00:24:21 +02002741 PyInterpreterState *interp = PyThreadState_GET()->interp;
2742 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2743 cannot use it to encode and decode filenames before it is loaded. Load
2744 the Python codec requires to encode at least its own filename. Use the C
2745 version of the locale codec until the codec registry is initialized and
2746 the Python codec is loaded.
2747
2748 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2749 cannot only rely on it: check also interp->fscodec_initialized for
2750 subinterpreters. */
2751 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00002752 return PyUnicode_AsEncodedString(unicode,
2753 Py_FileSystemDefaultEncoding,
2754 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00002755 }
2756 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002757 /* locale encoding with surrogateescape */
2758 wchar_t *wchar;
2759 char *bytes;
2760 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00002761 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002762
2763 wchar = PyUnicode_AsWideCharString(unicode, NULL);
2764 if (wchar == NULL)
2765 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002766 bytes = _Py_wchar2char(wchar, &error_pos);
2767 if (bytes == NULL) {
2768 if (error_pos != (size_t)-1) {
2769 char *errmsg = strerror(errno);
2770 PyObject *exc = NULL;
2771 if (errmsg == NULL)
2772 errmsg = "Py_wchar2char() failed";
2773 raise_encode_exception(&exc,
2774 "filesystemencoding",
2775 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
2776 error_pos, error_pos+1,
2777 errmsg);
2778 Py_XDECREF(exc);
2779 }
2780 else
2781 PyErr_NoMemory();
2782 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002783 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002784 }
2785 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002786
2787 bytes_obj = PyBytes_FromString(bytes);
2788 PyMem_Free(bytes);
2789 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00002790 }
Victor Stinnerad158722010-10-27 00:25:46 +00002791#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00002792}
2793
Alexander Belopolsky40018472011-02-26 01:02:56 +00002794PyObject *
2795PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002796 const char *encoding,
2797 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002798{
2799 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00002800 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00002801
Guido van Rossumd57fd912000-03-10 22:53:23 +00002802 if (!PyUnicode_Check(unicode)) {
2803 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002804 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002805 }
Fred Drakee4315f52000-05-09 19:53:39 +00002806
Victor Stinner2f283c22011-03-02 01:21:46 +00002807 if (encoding == NULL) {
2808 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002809 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002810 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002811 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner2f283c22011-03-02 01:21:46 +00002812 }
Fred Drakee4315f52000-05-09 19:53:39 +00002813
2814 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002815 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002816 if ((strcmp(lower, "utf-8") == 0) ||
2817 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00002818 {
Victor Stinner2f283c22011-03-02 01:21:46 +00002819 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002820 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002821 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002822 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00002823 }
Victor Stinner37296e82010-06-10 13:36:23 +00002824 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002825 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002826 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002827 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002828#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002829 else if (strcmp(lower, "mbcs") == 0)
2830 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2831 PyUnicode_GET_SIZE(unicode),
2832 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002833#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002834 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002835 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00002836 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002837
2838 /* Encode via the codec registry */
2839 v = PyCodec_Encode(unicode, encoding, errors);
2840 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002841 return NULL;
2842
2843 /* The normal path */
2844 if (PyBytes_Check(v))
2845 return v;
2846
2847 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002848 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002849 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002850 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002851
2852 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
2853 "encoder %s returned bytearray instead of bytes",
2854 encoding);
2855 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002856 Py_DECREF(v);
2857 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002858 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002859
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002860 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
2861 Py_DECREF(v);
2862 return b;
2863 }
2864
2865 PyErr_Format(PyExc_TypeError,
2866 "encoder did not return a bytes object (type=%.400s)",
2867 Py_TYPE(v)->tp_name);
2868 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002869 return NULL;
2870}
2871
Alexander Belopolsky40018472011-02-26 01:02:56 +00002872PyObject *
2873PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002874 const char *encoding,
2875 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002876{
2877 PyObject *v;
2878
2879 if (!PyUnicode_Check(unicode)) {
2880 PyErr_BadArgument();
2881 goto onError;
2882 }
2883
2884 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002885 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002886
2887 /* Encode via the codec registry */
2888 v = PyCodec_Encode(unicode, encoding, errors);
2889 if (v == NULL)
2890 goto onError;
2891 if (!PyUnicode_Check(v)) {
2892 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002893 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002894 Py_TYPE(v)->tp_name);
2895 Py_DECREF(v);
2896 goto onError;
2897 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002898 return v;
Tim Petersced69f82003-09-16 20:30:58 +00002899
Benjamin Peterson29060642009-01-31 22:14:21 +00002900 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002901 return NULL;
2902}
2903
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002904PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00002905PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002906 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00002907 return PyUnicode_DecodeFSDefaultAndSize(s, size);
2908}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002909
Christian Heimes5894ba72007-11-04 11:43:14 +00002910PyObject*
2911PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
2912{
Victor Stinner99b95382011-07-04 14:23:54 +02002913#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002914 return PyUnicode_DecodeMBCS(s, size, NULL);
2915#elif defined(__APPLE__)
2916 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
2917#else
Victor Stinner793b5312011-04-27 00:24:21 +02002918 PyInterpreterState *interp = PyThreadState_GET()->interp;
2919 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2920 cannot use it to encode and decode filenames before it is loaded. Load
2921 the Python codec requires to encode at least its own filename. Use the C
2922 version of the locale codec until the codec registry is initialized and
2923 the Python codec is loaded.
2924
2925 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2926 cannot only rely on it: check also interp->fscodec_initialized for
2927 subinterpreters. */
2928 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002929 return PyUnicode_Decode(s, size,
2930 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00002931 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002932 }
2933 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002934 /* locale encoding with surrogateescape */
2935 wchar_t *wchar;
2936 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00002937 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002938
2939 if (s[size] != '\0' || size != strlen(s)) {
2940 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
2941 return NULL;
2942 }
2943
Victor Stinner168e1172010-10-16 23:16:16 +00002944 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002945 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00002946 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002947
Victor Stinner168e1172010-10-16 23:16:16 +00002948 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002949 PyMem_Free(wchar);
2950 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002951 }
Victor Stinnerad158722010-10-27 00:25:46 +00002952#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002953}
2954
Martin v. Löwis011e8422009-05-05 04:43:17 +00002955
2956int
2957PyUnicode_FSConverter(PyObject* arg, void* addr)
2958{
2959 PyObject *output = NULL;
2960 Py_ssize_t size;
2961 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00002962 if (arg == NULL) {
2963 Py_DECREF(*(PyObject**)addr);
2964 return 1;
2965 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00002966 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00002967 output = arg;
2968 Py_INCREF(output);
2969 }
2970 else {
2971 arg = PyUnicode_FromObject(arg);
2972 if (!arg)
2973 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00002974 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00002975 Py_DECREF(arg);
2976 if (!output)
2977 return 0;
2978 if (!PyBytes_Check(output)) {
2979 Py_DECREF(output);
2980 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
2981 return 0;
2982 }
2983 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00002984 size = PyBytes_GET_SIZE(output);
2985 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00002986 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05002987 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00002988 Py_DECREF(output);
2989 return 0;
2990 }
2991 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00002992 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00002993}
2994
2995
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002996int
2997PyUnicode_FSDecoder(PyObject* arg, void* addr)
2998{
2999 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003000 if (arg == NULL) {
3001 Py_DECREF(*(PyObject**)addr);
3002 return 1;
3003 }
3004 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003005 if (PyUnicode_READY(arg))
3006 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003007 output = arg;
3008 Py_INCREF(output);
3009 }
3010 else {
3011 arg = PyBytes_FromObject(arg);
3012 if (!arg)
3013 return 0;
3014 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3015 PyBytes_GET_SIZE(arg));
3016 Py_DECREF(arg);
3017 if (!output)
3018 return 0;
3019 if (!PyUnicode_Check(output)) {
3020 Py_DECREF(output);
3021 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3022 return 0;
3023 }
3024 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003025 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
3026 PyUnicode_GET_LENGTH(output), 0, 1)) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003027 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3028 Py_DECREF(output);
3029 return 0;
3030 }
3031 *(PyObject**)addr = output;
3032 return Py_CLEANUP_SUPPORTED;
3033}
3034
3035
Martin v. Löwis5b222132007-06-10 09:51:05 +00003036char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003037PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003038{
Christian Heimesf3863112007-11-22 07:46:41 +00003039 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003040 PyUnicodeObject *u = (PyUnicodeObject *)unicode;
3041
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003042 if (!PyUnicode_Check(unicode)) {
3043 PyErr_BadArgument();
3044 return NULL;
3045 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003046 if (PyUnicode_READY(u) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003047 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003048
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003049 if (PyUnicode_UTF8(unicode) == NULL) {
3050 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003051 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3052 if (bytes == NULL)
3053 return NULL;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003054 _PyUnicode_UTF8(u) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3055 if (_PyUnicode_UTF8(u) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003056 Py_DECREF(bytes);
3057 return NULL;
3058 }
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003059 _PyUnicode_UTF8_LENGTH(u) = PyBytes_GET_SIZE(bytes);
3060 Py_MEMCPY(_PyUnicode_UTF8(u), PyBytes_AS_STRING(bytes), _PyUnicode_UTF8_LENGTH(u) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003061 Py_DECREF(bytes);
3062 }
3063
3064 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003065 *psize = PyUnicode_UTF8_LENGTH(unicode);
3066 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003067}
3068
3069char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003070PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003071{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003072 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3073}
3074
3075#ifdef Py_DEBUG
3076int unicode_as_unicode_calls = 0;
3077#endif
3078
3079
3080Py_UNICODE *
3081PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3082{
3083 PyUnicodeObject *u;
3084 const unsigned char *one_byte;
3085#if SIZEOF_WCHAR_T == 4
3086 const Py_UCS2 *two_bytes;
3087#else
3088 const Py_UCS4 *four_bytes;
3089 const Py_UCS4 *ucs4_end;
3090 Py_ssize_t num_surrogates;
3091#endif
3092 wchar_t *w;
3093 wchar_t *wchar_end;
3094
3095 if (!PyUnicode_Check(unicode)) {
3096 PyErr_BadArgument();
3097 return NULL;
3098 }
3099 u = (PyUnicodeObject*)unicode;
3100 if (_PyUnicode_WSTR(u) == NULL) {
3101 /* Non-ASCII compact unicode object */
3102 assert(_PyUnicode_KIND(u) != 0);
3103 assert(PyUnicode_IS_READY(u));
3104
3105#ifdef Py_DEBUG
3106 ++unicode_as_unicode_calls;
3107#endif
3108
3109 if (PyUnicode_KIND(u) == PyUnicode_4BYTE_KIND) {
3110#if SIZEOF_WCHAR_T == 2
3111 four_bytes = PyUnicode_4BYTE_DATA(u);
3112 ucs4_end = four_bytes + _PyUnicode_LENGTH(u);
3113 num_surrogates = 0;
3114
3115 for (; four_bytes < ucs4_end; ++four_bytes) {
3116 if (*four_bytes > 0xFFFF)
3117 ++num_surrogates;
3118 }
3119
3120 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(
3121 sizeof(wchar_t) * (_PyUnicode_LENGTH(u) + 1 + num_surrogates));
3122 if (!_PyUnicode_WSTR(u)) {
3123 PyErr_NoMemory();
3124 return NULL;
3125 }
3126 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u) + num_surrogates;
3127
3128 w = _PyUnicode_WSTR(u);
3129 wchar_end = w + _PyUnicode_WSTR_LENGTH(u);
3130 four_bytes = PyUnicode_4BYTE_DATA(u);
3131 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3132 if (*four_bytes > 0xFFFF) {
3133 /* encode surrogate pair in this case */
3134 *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10);
3135 *w = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF);
3136 }
3137 else
3138 *w = *four_bytes;
3139
3140 if (w > wchar_end) {
3141 assert(0 && "Miscalculated string end");
3142 }
3143 }
3144 *w = 0;
3145#else
3146 /* sizeof(wchar_t) == 4 */
3147 Py_FatalError("Impossible unicode object state, wstr and str "
3148 "should share memory already.");
3149 return NULL;
3150#endif
3151 }
3152 else {
3153 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3154 (_PyUnicode_LENGTH(u) + 1));
3155 if (!_PyUnicode_WSTR(u)) {
3156 PyErr_NoMemory();
3157 return NULL;
3158 }
3159 if (!PyUnicode_IS_COMPACT_ASCII(u))
3160 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u);
3161 w = _PyUnicode_WSTR(u);
3162 wchar_end = w + _PyUnicode_LENGTH(u);
3163
3164 if (PyUnicode_KIND(u) == PyUnicode_1BYTE_KIND) {
3165 one_byte = PyUnicode_1BYTE_DATA(u);
3166 for (; w < wchar_end; ++one_byte, ++w)
3167 *w = *one_byte;
3168 /* null-terminate the wstr */
3169 *w = 0;
3170 }
3171 else if (PyUnicode_KIND(u) == PyUnicode_2BYTE_KIND) {
3172#if SIZEOF_WCHAR_T == 4
3173 two_bytes = PyUnicode_2BYTE_DATA(u);
3174 for (; w < wchar_end; ++two_bytes, ++w)
3175 *w = *two_bytes;
3176 /* null-terminate the wstr */
3177 *w = 0;
3178#else
3179 /* sizeof(wchar_t) == 2 */
3180 PyObject_FREE(_PyUnicode_WSTR(u));
3181 _PyUnicode_WSTR(u) = NULL;
3182 Py_FatalError("Impossible unicode object state, wstr "
3183 "and str should share memory already.");
3184 return NULL;
3185#endif
3186 }
3187 else {
3188 assert(0 && "This should never happen.");
3189 }
3190 }
3191 }
3192 if (size != NULL)
3193 *size = PyUnicode_WSTR_LENGTH(u);
3194 return _PyUnicode_WSTR(u);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003195}
3196
Alexander Belopolsky40018472011-02-26 01:02:56 +00003197Py_UNICODE *
3198PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003199{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003200 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003201}
3202
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003203
Alexander Belopolsky40018472011-02-26 01:02:56 +00003204Py_ssize_t
3205PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003206{
3207 if (!PyUnicode_Check(unicode)) {
3208 PyErr_BadArgument();
3209 goto onError;
3210 }
3211 return PyUnicode_GET_SIZE(unicode);
3212
Benjamin Peterson29060642009-01-31 22:14:21 +00003213 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003214 return -1;
3215}
3216
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003217Py_ssize_t
3218PyUnicode_GetLength(PyObject *unicode)
3219{
Victor Stinner5a706cf2011-10-02 00:36:53 +02003220 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003221 PyErr_BadArgument();
3222 return -1;
3223 }
3224
3225 return PyUnicode_GET_LENGTH(unicode);
3226}
3227
3228Py_UCS4
3229PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3230{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003231 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3232 PyErr_BadArgument();
3233 return (Py_UCS4)-1;
3234 }
3235 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3236 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003237 return (Py_UCS4)-1;
3238 }
3239 return PyUnicode_READ_CHAR(unicode, index);
3240}
3241
3242int
3243PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3244{
3245 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003246 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003247 return -1;
3248 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02003249 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3250 PyErr_SetString(PyExc_IndexError, "string index out of range");
3251 return -1;
3252 }
3253 if (_PyUnicode_Dirty(unicode))
3254 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003255 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3256 index, ch);
3257 return 0;
3258}
3259
/* Return the name of the default encoding, which is always "utf-8".
   The returned string is a constant and must not be modified or freed. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3265
Victor Stinner554f3f02010-06-16 23:33:54 +00003266/* create or adjust a UnicodeDecodeError */
3267static void
3268make_decode_exception(PyObject **exceptionObject,
3269 const char *encoding,
3270 const char *input, Py_ssize_t length,
3271 Py_ssize_t startpos, Py_ssize_t endpos,
3272 const char *reason)
3273{
3274 if (*exceptionObject == NULL) {
3275 *exceptionObject = PyUnicodeDecodeError_Create(
3276 encoding, input, length, startpos, endpos, reason);
3277 }
3278 else {
3279 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3280 goto onError;
3281 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3282 goto onError;
3283 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3284 goto onError;
3285 }
3286 return;
3287
3288onError:
3289 Py_DECREF(*exceptionObject);
3290 *exceptionObject = NULL;
3291}
3292
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003293/* error handling callback helper:
3294 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00003295 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003296 and adjust various state variables.
3297 return 0 on success, -1 on error
3298*/
3299
Alexander Belopolsky40018472011-02-26 01:02:56 +00003300static int
3301unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003302 const char *encoding, const char *reason,
3303 const char **input, const char **inend, Py_ssize_t *startinpos,
3304 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
3305 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003306{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003307 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003308
3309 PyObject *restuple = NULL;
3310 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003311 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003312 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003313 Py_ssize_t requiredsize;
3314 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003315 const Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003316 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003317 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003318 int res = -1;
3319
3320 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003321 *errorHandler = PyCodec_LookupError(errors);
3322 if (*errorHandler == NULL)
3323 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003324 }
3325
Victor Stinner554f3f02010-06-16 23:33:54 +00003326 make_decode_exception(exceptionObject,
3327 encoding,
3328 *input, *inend - *input,
3329 *startinpos, *endinpos,
3330 reason);
3331 if (*exceptionObject == NULL)
3332 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003333
3334 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3335 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003336 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003337 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00003338 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00003339 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003340 }
3341 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003342 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003343
3344 /* Copy back the bytes variables, which might have been modified by the
3345 callback */
3346 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3347 if (!inputobj)
3348 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00003349 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003350 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00003351 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003352 *input = PyBytes_AS_STRING(inputobj);
3353 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003354 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00003355 /* we can DECREF safely, as the exception has another reference,
3356 so the object won't go away. */
3357 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003358
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003359 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003360 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003361 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003362 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3363 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003364 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003365
3366 /* need more space? (at least enough for what we
3367 have+the replacement+the rest of the string (starting
3368 at the new input position), so we won't have to check space
3369 when there are no errors in the rest of the string) */
3370 repptr = PyUnicode_AS_UNICODE(repunicode);
3371 repsize = PyUnicode_GET_SIZE(repunicode);
3372 requiredsize = *outpos + repsize + insize-newpos;
3373 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003374 if (requiredsize<2*outsize)
3375 requiredsize = 2*outsize;
Victor Stinnerfe226c02011-10-03 03:52:20 +02003376 if (PyUnicode_Resize((PyObject**)output, requiredsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003377 goto onError;
3378 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003379 }
3380 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003381 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003382 Py_UNICODE_COPY(*outptr, repptr, repsize);
3383 *outptr += repsize;
3384 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003385
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003386 /* we made it! */
3387 res = 0;
3388
Benjamin Peterson29060642009-01-31 22:14:21 +00003389 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003390 Py_XDECREF(restuple);
3391 return res;
3392}
3393
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003394/* --- UTF-7 Codec -------------------------------------------------------- */
3395
Antoine Pitrou244651a2009-05-04 18:56:13 +00003396/* See RFC2152 for details. We encode conservatively and decode liberally. */
3397
/* Three simple macros defining base-64. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')

/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

/* Read-only lookup table, indexed by ASCII code (0..127). */
static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * directO and directWS are parenthesized in the expansion so that
 * compound arguments (e.g. `a || b`) are evaluated as a unit. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003474
Alexander Belopolsky40018472011-02-26 01:02:56 +00003475PyObject *
3476PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003477 Py_ssize_t size,
3478 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003479{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003480 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3481}
3482
Antoine Pitrou244651a2009-05-04 18:56:13 +00003483/* The decoder. The only state we preserve is our read position,
3484 * i.e. how many characters we have consumed. So if we end in the
3485 * middle of a shift sequence we have to back off the read position
3486 * and the output to the beginning of the sequence, otherwise we lose
3487 * all the shift state (seen bits, number of bits seen, high
3488 * surrogate). */
3489
Alexander Belopolsky40018472011-02-26 01:02:56 +00003490PyObject *
3491PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003492 Py_ssize_t size,
3493 const char *errors,
3494 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003495{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003496 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003497 Py_ssize_t startinpos;
3498 Py_ssize_t endinpos;
3499 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003500 const char *e;
3501 PyUnicodeObject *unicode;
3502 Py_UNICODE *p;
3503 const char *errmsg = "";
3504 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003505 Py_UNICODE *shiftOutStart;
3506 unsigned int base64bits = 0;
3507 unsigned long base64buffer = 0;
3508 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003509 PyObject *errorHandler = NULL;
3510 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003511
3512 unicode = _PyUnicode_New(size);
3513 if (!unicode)
3514 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003515 if (size == 0) {
3516 if (consumed)
3517 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003518 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003519 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003520
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003521 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitrou244651a2009-05-04 18:56:13 +00003522 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003523 e = s + size;
3524
3525 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003526 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00003527 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00003528 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003529
Antoine Pitrou244651a2009-05-04 18:56:13 +00003530 if (inShift) { /* in a base-64 section */
3531 if (IS_BASE64(ch)) { /* consume a base-64 character */
3532 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3533 base64bits += 6;
3534 s++;
3535 if (base64bits >= 16) {
3536 /* we have enough bits for a UTF-16 value */
3537 Py_UNICODE outCh = (Py_UNICODE)
3538 (base64buffer >> (base64bits-16));
3539 base64bits -= 16;
3540 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3541 if (surrogate) {
3542 /* expecting a second surrogate */
3543 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3544#ifdef Py_UNICODE_WIDE
3545 *p++ = (((surrogate & 0x3FF)<<10)
3546 | (outCh & 0x3FF)) + 0x10000;
3547#else
3548 *p++ = surrogate;
3549 *p++ = outCh;
3550#endif
3551 surrogate = 0;
3552 }
3553 else {
3554 surrogate = 0;
3555 errmsg = "second surrogate missing";
3556 goto utf7Error;
3557 }
3558 }
3559 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
3560 /* first surrogate */
3561 surrogate = outCh;
3562 }
3563 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3564 errmsg = "unexpected second surrogate";
3565 goto utf7Error;
3566 }
3567 else {
3568 *p++ = outCh;
3569 }
3570 }
3571 }
3572 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003573 inShift = 0;
3574 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003575 if (surrogate) {
3576 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00003577 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003578 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003579 if (base64bits > 0) { /* left-over bits */
3580 if (base64bits >= 6) {
3581 /* We've seen at least one base-64 character */
3582 errmsg = "partial character in shift sequence";
3583 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003584 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003585 else {
3586 /* Some bits remain; they should be zero */
3587 if (base64buffer != 0) {
3588 errmsg = "non-zero padding bits in shift sequence";
3589 goto utf7Error;
3590 }
3591 }
3592 }
3593 if (ch != '-') {
3594 /* '-' is absorbed; other terminating
3595 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003596 *p++ = ch;
3597 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003598 }
3599 }
3600 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003601 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003602 s++; /* consume '+' */
3603 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003604 s++;
3605 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00003606 }
3607 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003608 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003609 shiftOutStart = p;
3610 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003611 }
3612 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003613 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003614 *p++ = ch;
3615 s++;
3616 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003617 else {
3618 startinpos = s-starts;
3619 s++;
3620 errmsg = "unexpected special character";
3621 goto utf7Error;
3622 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003623 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003624utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003625 outpos = p-PyUnicode_AS_UNICODE(unicode);
3626 endinpos = s-starts;
3627 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003628 errors, &errorHandler,
3629 "utf7", errmsg,
3630 &starts, &e, &startinpos, &endinpos, &exc, &s,
3631 &unicode, &outpos, &p))
3632 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003633 }
3634
Antoine Pitrou244651a2009-05-04 18:56:13 +00003635 /* end of string */
3636
3637 if (inShift && !consumed) { /* in shift sequence, no more to follow */
3638 /* if we're in an inconsistent state, that's an error */
3639 if (surrogate ||
3640 (base64bits >= 6) ||
3641 (base64bits > 0 && base64buffer != 0)) {
3642 outpos = p-PyUnicode_AS_UNICODE(unicode);
3643 endinpos = size;
3644 if (unicode_decode_call_errorhandler(
3645 errors, &errorHandler,
3646 "utf7", "unterminated shift sequence",
3647 &starts, &e, &startinpos, &endinpos, &exc, &s,
3648 &unicode, &outpos, &p))
3649 goto onError;
3650 if (s < e)
3651 goto restart;
3652 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003653 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003654
3655 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003656 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003657 if (inShift) {
3658 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003659 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003660 }
3661 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003662 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003663 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003664 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003665
Victor Stinnerfe226c02011-10-03 03:52:20 +02003666 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003667 goto onError;
3668
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003669 Py_XDECREF(errorHandler);
3670 Py_XDECREF(exc);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02003671 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003672 Py_DECREF(unicode);
3673 return NULL;
3674 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003675 return (PyObject *)unicode;
3676
Benjamin Peterson29060642009-01-31 22:14:21 +00003677 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003678 Py_XDECREF(errorHandler);
3679 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003680 Py_DECREF(unicode);
3681 return NULL;
3682}
3683
3684
Alexander Belopolsky40018472011-02-26 01:02:56 +00003685PyObject *
3686PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003687 Py_ssize_t size,
3688 int base64SetO,
3689 int base64WhiteSpace,
3690 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003691{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003692 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003693 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003694 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003695 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003696 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003697 unsigned int base64bits = 0;
3698 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003699 char * out;
3700 char * start;
3701
3702 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003703 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003704
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003705 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003706 return PyErr_NoMemory();
3707
Antoine Pitrou244651a2009-05-04 18:56:13 +00003708 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003709 if (v == NULL)
3710 return NULL;
3711
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003712 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003713 for (;i < size; ++i) {
3714 Py_UNICODE ch = s[i];
3715
Antoine Pitrou244651a2009-05-04 18:56:13 +00003716 if (inShift) {
3717 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3718 /* shifting out */
3719 if (base64bits) { /* output remaining bits */
3720 *out++ = TO_BASE64(base64buffer << (6-base64bits));
3721 base64buffer = 0;
3722 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003723 }
3724 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003725 /* Characters not in the BASE64 set implicitly unshift the sequence
3726 so no '-' is required, except if the character is itself a '-' */
3727 if (IS_BASE64(ch) || ch == '-') {
3728 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003729 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003730 *out++ = (char) ch;
3731 }
3732 else {
3733 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00003734 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003735 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003736 else { /* not in a shift sequence */
3737 if (ch == '+') {
3738 *out++ = '+';
3739 *out++ = '-';
3740 }
3741 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3742 *out++ = (char) ch;
3743 }
3744 else {
3745 *out++ = '+';
3746 inShift = 1;
3747 goto encode_char;
3748 }
3749 }
3750 continue;
3751encode_char:
3752#ifdef Py_UNICODE_WIDE
3753 if (ch >= 0x10000) {
3754 /* code first surrogate */
3755 base64bits += 16;
3756 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
3757 while (base64bits >= 6) {
3758 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3759 base64bits -= 6;
3760 }
3761 /* prepare second surrogate */
3762 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
3763 }
3764#endif
3765 base64bits += 16;
3766 base64buffer = (base64buffer << 16) | ch;
3767 while (base64bits >= 6) {
3768 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3769 base64bits -= 6;
3770 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00003771 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003772 if (base64bits)
3773 *out++= TO_BASE64(base64buffer << (6-base64bits) );
3774 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003775 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003776 if (_PyBytes_Resize(&v, out - start) < 0)
3777 return NULL;
3778 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003779}
3780
Antoine Pitrou244651a2009-05-04 18:56:13 +00003781#undef IS_BASE64
3782#undef FROM_BASE64
3783#undef TO_BASE64
3784#undef DECODE_DIRECT
3785#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003786
Guido van Rossumd57fd912000-03-10 22:53:23 +00003787/* --- UTF-8 Codec -------------------------------------------------------- */
3788
/* Map a UTF-8 lead (prefix) byte to the total length in bytes of the
   sequence it starts.  Zero means the byte is not a legal prefix.
   See RFC 3629 for details.  The table is only ever read, hence const. */
static const
char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10-1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 20-2F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 30-3F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 40-4F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 50-5F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 60-6F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 90-9F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* A0-AF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
3810
Alexander Belopolsky40018472011-02-26 01:02:56 +00003811PyObject *
3812PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003813 Py_ssize_t size,
3814 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003815{
Walter Dörwald69652032004-09-07 20:24:22 +00003816 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3817}
3818
Antoine Pitrouab868312009-01-10 15:40:25 +00003819/* Mask to check or force alignment of a pointer to C 'long' boundaries */
3820#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
3821
3822/* Mask to quickly check whether a C 'long' contains a
3823 non-ASCII, UTF8-encoded char. */
3824#if (SIZEOF_LONG == 8)
3825# define ASCII_CHAR_MASK 0x8080808080808080L
3826#elif (SIZEOF_LONG == 4)
3827# define ASCII_CHAR_MASK 0x80808080L
3828#else
3829# error C 'long' size should be either 4 or 8!
3830#endif
3831
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003832/* Scans a UTF-8 string and returns the maximum character to be expected,
3833 the size of the decoded unicode string and if any major errors were
3834 encountered.
3835
3836 This function does check basic UTF-8 sanity, it does however NOT CHECK
3837 if the string contains surrogates, and if all continuation bytes are
3838 within the correct ranges, these checks are performed in
3839 PyUnicode_DecodeUTF8Stateful.
3840
3841 If it sets has_errors to 1, it means the value of unicode_size and max_char
3842 will be bogus and you should not rely on useful information in them.
3843 */
3844static Py_UCS4
3845utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
3846 Py_ssize_t *unicode_size, Py_ssize_t* consumed,
3847 int *has_errors)
3848{
3849 Py_ssize_t n;
3850 Py_ssize_t char_count = 0;
3851 Py_UCS4 max_char = 127, new_max;
3852 Py_UCS4 upper_bound;
3853 const unsigned char *p = (const unsigned char *)s;
3854 const unsigned char *end = p + string_size;
3855 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
3856 int err = 0;
3857
3858 for (; p < end && !err; ++p, ++char_count) {
3859 /* Only check value if it's not a ASCII char... */
3860 if (*p < 0x80) {
3861 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
3862 an explanation. */
3863 if (!((size_t) p & LONG_PTR_MASK)) {
3864 /* Help register allocation */
3865 register const unsigned char *_p = p;
3866 while (_p < aligned_end) {
3867 unsigned long value = *(unsigned long *) _p;
3868 if (value & ASCII_CHAR_MASK)
3869 break;
3870 _p += SIZEOF_LONG;
3871 char_count += SIZEOF_LONG;
3872 }
3873 p = _p;
3874 if (p == end)
3875 break;
3876 }
3877 }
3878 if (*p >= 0x80) {
3879 n = utf8_code_length[*p];
3880 new_max = max_char;
3881 switch (n) {
3882 /* invalid start byte */
3883 case 0:
3884 err = 1;
3885 break;
3886 case 2:
3887 /* Code points between 0x00FF and 0x07FF inclusive.
3888 Approximate the upper bound of the code point,
3889 if this flips over 255 we can be sure it will be more
3890 than 255 and the string will need 2 bytes per code coint,
3891 if it stays under or equal to 255, we can be sure 1 byte
3892 is enough.
3893 ((*p & 0b00011111) << 6) | 0b00111111 */
3894 upper_bound = ((*p & 0x1F) << 6) | 0x3F;
3895 if (max_char < upper_bound)
3896 new_max = upper_bound;
3897 /* Ensure we track at least that we left ASCII space. */
3898 if (new_max < 128)
3899 new_max = 128;
3900 break;
3901 case 3:
3902 /* Between 0x0FFF and 0xFFFF inclusive, so values are
3903 always > 255 and <= 65535 and will always need 2 bytes. */
3904 if (max_char < 65535)
3905 new_max = 65535;
3906 break;
3907 case 4:
3908 /* Code point will be above 0xFFFF for sure in this case. */
3909 new_max = 65537;
3910 break;
3911 /* Internal error, this should be caught by the first if */
3912 case 1:
3913 default:
3914 assert(0 && "Impossible case in utf8_max_char_and_size");
3915 err = 1;
3916 }
3917 /* Instead of number of overall bytes for this code point,
3918 n containts the number of following bytes: */
3919 --n;
3920 /* Check if the follow up chars are all valid continuation bytes */
3921 if (n >= 1) {
3922 const unsigned char *cont;
3923 if ((p + n) >= end) {
3924 if (consumed == 0)
3925 /* incomplete data, non-incremental decoding */
3926 err = 1;
3927 break;
3928 }
3929 for (cont = p + 1; cont < (p + n); ++cont) {
3930 if ((*cont & 0xc0) != 0x80) {
3931 err = 1;
3932 break;
3933 }
3934 }
3935 p += n;
3936 }
3937 else
3938 err = 1;
3939 max_char = new_max;
3940 }
3941 }
3942
3943 if (unicode_size)
3944 *unicode_size = char_count;
3945 if (has_errors)
3946 *has_errors = err;
3947 return max_char;
3948}
3949
/* Store `value` at position `index` of `buf`, picking the element type
   from `kind`.  Works like PyUnicode_WRITE but additionally understands
   PyUnicode_WCHAR_KIND, i.e. the wstr field of the legacy unicode
   representation.  Any kind that is not WCHAR, 1BYTE or 2BYTE is stored
   as Py_UCS4.  Each argument is evaluated exactly once. */
#define WRITE_FLEXIBLE_OR_WSTR(kind, buf, index, value)                 \
    do {                                                                \
        const int k_ = (kind);                                          \
        switch (k_) {                                                   \
        case PyUnicode_WCHAR_KIND:                                      \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value);       \
            break;                                                      \
        case PyUnicode_1BYTE_KIND:                                      \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
            break;                                                      \
        case PyUnicode_2BYTE_KIND:                                      \
            ((Py_UCS2 *)(buf))[(index)] = (Py_UCS2)(value);             \
            break;                                                      \
        default:                                                        \
            ((Py_UCS4 *)(buf))[(index)] = (Py_UCS4)(value);             \
            break;                                                      \
        }                                                               \
    } while (0)
3964
Alexander Belopolsky40018472011-02-26 01:02:56 +00003965PyObject *
3966PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003967 Py_ssize_t size,
3968 const char *errors,
3969 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00003970{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003971 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003972 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00003973 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003974 Py_ssize_t startinpos;
3975 Py_ssize_t endinpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00003976 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003977 PyUnicodeObject *unicode;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003978 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003979 PyObject *errorHandler = NULL;
3980 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003981 Py_UCS4 maxchar = 0;
3982 Py_ssize_t unicode_size;
3983 Py_ssize_t i;
3984 int kind;
3985 void *data;
3986 int has_errors;
3987 Py_UNICODE *error_outptr;
3988#if SIZEOF_WCHAR_T == 2
3989 Py_ssize_t wchar_offset = 0;
3990#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003991
Walter Dörwald69652032004-09-07 20:24:22 +00003992 if (size == 0) {
3993 if (consumed)
3994 *consumed = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003995 return (PyObject *)PyUnicode_New(0, 0);
Walter Dörwald69652032004-09-07 20:24:22 +00003996 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003997 maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size,
3998 consumed, &has_errors);
3999 if (has_errors) {
4000 unicode = _PyUnicode_New(size);
4001 if (!unicode)
4002 return NULL;
4003 kind = PyUnicode_WCHAR_KIND;
4004 data = PyUnicode_AS_UNICODE(unicode);
4005 assert(data != NULL);
4006 }
4007 else {
4008 unicode = (PyUnicodeObject *)PyUnicode_New(unicode_size, maxchar);
4009 if (!unicode)
4010 return NULL;
4011 /* When the string is ASCII only, just use memcpy and return.
4012 unicode_size may be != size if there is an incomplete UTF-8
4013 sequence at the end of the ASCII block. */
4014 if (maxchar < 128 && size == unicode_size) {
4015 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
4016 return (PyObject *)unicode;
4017 }
4018 kind = PyUnicode_KIND(unicode);
4019 data = PyUnicode_DATA(unicode);
4020 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004021 /* Unpack UTF-8 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004022 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004023 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00004024 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004025
4026 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004027 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004028
4029 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00004030 /* Fast path for runs of ASCII characters. Given that common UTF-8
4031 input will consist of an overwhelming majority of ASCII
4032 characters, we try to optimize for this case by checking
4033 as many characters as a C 'long' can contain.
4034 First, check if we can do an aligned read, as most CPUs have
4035 a penalty for unaligned reads.
4036 */
4037 if (!((size_t) s & LONG_PTR_MASK)) {
4038 /* Help register allocation */
4039 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004040 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004041 while (_s < aligned_end) {
4042 /* Read a whole long at a time (either 4 or 8 bytes),
4043 and do a fast unrolled copy if it only contains ASCII
4044 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004045 unsigned long value = *(unsigned long *) _s;
4046 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00004047 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004048 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+0, _s[0]);
4049 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+1, _s[1]);
4050 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+2, _s[2]);
4051 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004052#if (SIZEOF_LONG == 8)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004053 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+4, _s[4]);
4054 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+5, _s[5]);
4055 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+6, _s[6]);
4056 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004057#endif
4058 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004059 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00004060 }
4061 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004062 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004063 if (s == e)
4064 break;
4065 ch = (unsigned char)*s;
4066 }
4067 }
4068
4069 if (ch < 0x80) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004070 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004071 s++;
4072 continue;
4073 }
4074
4075 n = utf8_code_length[ch];
4076
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004077 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004078 if (consumed)
4079 break;
4080 else {
4081 errmsg = "unexpected end of data";
4082 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004083 endinpos = startinpos+1;
4084 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
4085 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004086 goto utf8Error;
4087 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004088 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004089
4090 switch (n) {
4091
4092 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00004093 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004094 startinpos = s-starts;
4095 endinpos = startinpos+1;
4096 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004097
4098 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004099 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00004100 startinpos = s-starts;
4101 endinpos = startinpos+1;
4102 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004103
4104 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004105 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00004106 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004107 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004108 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00004109 goto utf8Error;
4110 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004111 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004112 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004113 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004114 break;
4115
4116 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00004117 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4118 will result in surrogates in range d800-dfff. Surrogates are
4119 not valid UTF-8 so they are rejected.
4120 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4121 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00004122 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004123 (s[2] & 0xc0) != 0x80 ||
4124 ((unsigned char)s[0] == 0xE0 &&
4125 (unsigned char)s[1] < 0xA0) ||
4126 ((unsigned char)s[0] == 0xED &&
4127 (unsigned char)s[1] > 0x9F)) {
4128 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004129 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004130 endinpos = startinpos + 1;
4131
4132 /* if s[1] first two bits are 1 and 0, then the invalid
4133 continuation byte is s[2], so increment endinpos by 1,
4134 if not, s[1] is invalid and endinpos doesn't need to
4135 be incremented. */
4136 if ((s[1] & 0xC0) == 0x80)
4137 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004138 goto utf8Error;
4139 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004140 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004141 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004142 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004143 break;
4144
4145 case 4:
4146 if ((s[1] & 0xc0) != 0x80 ||
4147 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004148 (s[3] & 0xc0) != 0x80 ||
4149 ((unsigned char)s[0] == 0xF0 &&
4150 (unsigned char)s[1] < 0x90) ||
4151 ((unsigned char)s[0] == 0xF4 &&
4152 (unsigned char)s[1] > 0x8F)) {
4153 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004154 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004155 endinpos = startinpos + 1;
4156 if ((s[1] & 0xC0) == 0x80) {
4157 endinpos++;
4158 if ((s[2] & 0xC0) == 0x80)
4159 endinpos++;
4160 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004161 goto utf8Error;
4162 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004163 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00004164 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4165 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4166
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004167 /* If the string is flexible or we have native UCS-4, write
4168 directly.. */
4169 if (sizeof(Py_UNICODE) > 2 || kind != PyUnicode_WCHAR_KIND)
4170 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Tim Petersced69f82003-09-16 20:30:58 +00004171
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004172 else {
4173 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00004174
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004175 /* translate from 10000..10FFFF to 0..FFFF */
4176 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00004177
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004178 /* high surrogate = top 10 bits added to D800 */
4179 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4180 (Py_UNICODE)(0xD800 + (ch >> 10)));
4181
4182 /* low surrogate = bottom 10 bits added to DC00 */
4183 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4184 (Py_UNICODE)(0xDC00 + (ch & 0x03FF)));
4185 }
4186#if SIZEOF_WCHAR_T == 2
4187 wchar_offset++;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004188#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00004189 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004190 }
4191 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00004192 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004193
Benjamin Peterson29060642009-01-31 22:14:21 +00004194 utf8Error:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004195 /* If this is not yet a resizable string, make it one.. */
4196 if (kind != PyUnicode_WCHAR_KIND) {
4197 const Py_UNICODE *u;
4198 PyUnicodeObject *new_unicode = _PyUnicode_New(size);
4199 if (!new_unicode)
4200 goto onError;
4201 u = PyUnicode_AsUnicode((PyObject *)unicode);
4202 if (!u)
4203 goto onError;
4204#if SIZEOF_WCHAR_T == 2
4205 i += wchar_offset;
4206#endif
4207 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(new_unicode), u, i);
4208 Py_DECREF(unicode);
4209 unicode = new_unicode;
4210 kind = 0;
4211 data = PyUnicode_AS_UNICODE(new_unicode);
4212 assert(data != NULL);
4213 }
4214 error_outptr = PyUnicode_AS_UNICODE(unicode) + i;
Benjamin Peterson29060642009-01-31 22:14:21 +00004215 if (unicode_decode_call_errorhandler(
4216 errors, &errorHandler,
4217 "utf8", errmsg,
4218 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004219 &unicode, &i, &error_outptr))
Benjamin Peterson29060642009-01-31 22:14:21 +00004220 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004221 /* Update data because unicode_decode_call_errorhandler might have
4222 re-created or resized the unicode object. */
4223 data = PyUnicode_AS_UNICODE(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00004224 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004225 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004226 /* Ensure the unicode_size calculation above was correct: */
4227 assert(kind == PyUnicode_WCHAR_KIND || i == unicode_size);
4228
Walter Dörwald69652032004-09-07 20:24:22 +00004229 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004230 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004231
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004232 /* Adjust length and ready string when it contained errors and
4233 is of the old resizable kind. */
4234 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004235 if (PyUnicode_Resize((PyObject**)&unicode, i) < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004236 goto onError;
4237 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004238
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004239 Py_XDECREF(errorHandler);
4240 Py_XDECREF(exc);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004241 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004242 Py_DECREF(unicode);
4243 return NULL;
4244 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004245 return (PyObject *)unicode;
4246
Benjamin Peterson29060642009-01-31 22:14:21 +00004247 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004248 Py_XDECREF(errorHandler);
4249 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004250 Py_DECREF(unicode);
4251 return NULL;
4252}
4253
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004254#undef WRITE_FLEXIBLE_OR_WSTR
Antoine Pitrouab868312009-01-10 15:40:25 +00004255
#ifdef __APPLE__

/* Stripped-down UTF-8 decoder with the surrogateescape error handler
   built in; used to decode the command line arguments on Mac OS X.

   Decodes `size` bytes from `s` into a freshly PyMem_Malloc'ed,
   NUL-terminated wchar_t array.  A byte that does not start a valid
   sequence is stored as 0xDC00 + byte and decoding resumes at the next
   byte.  Returns NULL if the buffer size would overflow (after calling
   PyErr_NoMemory()) or if the allocation fails. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    const char *end;
    wchar_t *buffer, *out;
    int seqlen;

    /* Note: size will always be longer than the resulting Unicode
       character count */
    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
        PyErr_NoMemory();
        return NULL;
    }
    buffer = PyMem_Malloc((size + 1) * sizeof(wchar_t));
    if (!buffer)
        return NULL;

    /* Unpack UTF-8 encoded data */
    out = buffer;
    for (end = s + size; s < end; ) {
        Py_UCS4 ch = (unsigned char)*s;
        int escape;

        if (ch < 0x80) {
            /* ASCII is copied through unchanged */
            *out++ = (wchar_t)ch;
            s++;
            continue;
        }

        seqlen = utf8_code_length[ch];

        /* Decide first whether the bytes at s form an acceptable sequence;
           `ch` still holds the lead byte while this is being checked. */
        if (s + seqlen > end) {
            escape = 1;
        }
        else if (seqlen == 0 || seqlen == 1) {
            escape = 1;
        }
        else if (seqlen == 2) {
            escape = ((s[1] & 0xc0) != 0x80);
        }
        else if (seqlen == 3) {
            /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
               will result in surrogates in range d800-dfff. Surrogates are
               not valid UTF-8 so they are rejected.
               See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
            escape = ((s[1] & 0xc0) != 0x80 ||
                      (s[2] & 0xc0) != 0x80 ||
                      ((unsigned char)s[0] == 0xE0 &&
                       (unsigned char)s[1] < 0xA0) ||
                      ((unsigned char)s[0] == 0xED &&
                       (unsigned char)s[1] > 0x9F));
        }
        else if (seqlen == 4) {
            escape = ((s[1] & 0xc0) != 0x80 ||
                      (s[2] & 0xc0) != 0x80 ||
                      (s[3] & 0xc0) != 0x80 ||
                      ((unsigned char)s[0] == 0xF0 &&
                       (unsigned char)s[1] < 0x90) ||
                      ((unsigned char)s[0] == 0xF4 &&
                       (unsigned char)s[1] > 0x8F));
        }
        else {
            escape = 0;
        }

        if (escape) {
            /* surrogateescape: smuggle the offending byte through as a
               lone surrogate and retry from the following byte */
            *out++ = 0xDC00 + ch;
            s++;
            continue;
        }

        if (seqlen == 2) {
            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
            assert ((ch > 0x007F) && (ch <= 0x07FF));
            *out++ = (wchar_t)ch;
        }
        else if (seqlen == 3) {
            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
            *out++ = (wchar_t)ch;
        }
        else if (seqlen == 4) {
            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
            assert ((ch > 0xFFFF) && (ch <= 0x10ffff));

#if SIZEOF_WCHAR_T == 4
            *out++ = (wchar_t)ch;
#else
            /* compute and append the two surrogates: */

            /* translate from 10000..10FFFF to 0..FFFF */
            ch -= 0x10000;

            /* high surrogate = top 10 bits added to D800 */
            *out++ = (wchar_t)(0xD800 + (ch >> 10));

            /* low surrogate = bottom 10 bits added to DC00 */
            *out++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
#endif
        }
        s += seqlen;
    }
    *out = L'\0';
    return buffer;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004370
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004371/* Primary internal function which creates utf8 encoded bytes objects.
4372
4373 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004374 and allocate exactly as much space needed at the end. Else allocate the
4375 maximum possible needed (4 result bytes per Unicode character), and return
4376 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004377*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004378PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004379_PyUnicode_AsUTF8String(PyObject *obj, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004380{
Tim Peters602f7402002-04-27 18:03:26 +00004381#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00004382
Guido van Rossum98297ee2007-11-06 21:34:58 +00004383 Py_ssize_t i; /* index into s of next input byte */
4384 PyObject *result; /* result string object */
4385 char *p; /* next free byte in output buffer */
4386 Py_ssize_t nallocated; /* number of result bytes allocated */
4387 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00004388 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004389 PyObject *errorHandler = NULL;
4390 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004391 int kind;
4392 void *data;
4393 Py_ssize_t size;
4394 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
4395#if SIZEOF_WCHAR_T == 2
4396 Py_ssize_t wchar_offset = 0;
4397#endif
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004398
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004399 if (!PyUnicode_Check(unicode)) {
4400 PyErr_BadArgument();
4401 return NULL;
4402 }
4403
4404 if (PyUnicode_READY(unicode) == -1)
4405 return NULL;
4406
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004407 if (PyUnicode_UTF8(unicode))
4408 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4409 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004410
4411 kind = PyUnicode_KIND(unicode);
4412 data = PyUnicode_DATA(unicode);
4413 size = PyUnicode_GET_LENGTH(unicode);
4414
Tim Peters602f7402002-04-27 18:03:26 +00004415 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004416
Tim Peters602f7402002-04-27 18:03:26 +00004417 if (size <= MAX_SHORT_UNICHARS) {
4418 /* Write into the stack buffer; nallocated can't overflow.
4419 * At the end, we'll allocate exactly as much heap space as it
4420 * turns out we need.
4421 */
4422 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004423 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00004424 p = stackbuf;
4425 }
4426 else {
4427 /* Overallocate on the heap, and give the excess back at the end. */
4428 nallocated = size * 4;
4429 if (nallocated / 4 != size) /* overflow! */
4430 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00004431 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004432 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00004433 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00004434 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004435 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004436
Tim Peters602f7402002-04-27 18:03:26 +00004437 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004438 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004439
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004440 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00004441 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004442 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004443
Guido van Rossumd57fd912000-03-10 22:53:23 +00004444 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00004445 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00004446 *p++ = (char)(0xc0 | (ch >> 6));
4447 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00004448 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004449 Py_ssize_t newpos;
4450 PyObject *rep;
4451 Py_ssize_t repsize, k, startpos;
4452 startpos = i-1;
4453#if SIZEOF_WCHAR_T == 2
4454 startpos += wchar_offset;
Victor Stinner445a6232010-04-22 20:01:57 +00004455#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004456 rep = unicode_encode_call_errorhandler(
4457 errors, &errorHandler, "utf-8", "surrogates not allowed",
4458 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
4459 &exc, startpos, startpos+1, &newpos);
4460 if (!rep)
4461 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004462
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004463 if (PyBytes_Check(rep))
4464 repsize = PyBytes_GET_SIZE(rep);
4465 else
4466 repsize = PyUnicode_GET_SIZE(rep);
4467
4468 if (repsize > 4) {
4469 Py_ssize_t offset;
4470
4471 if (result == NULL)
4472 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004473 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004474 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004475
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004476 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4477 /* integer overflow */
4478 PyErr_NoMemory();
4479 goto error;
4480 }
4481 nallocated += repsize - 4;
4482 if (result != NULL) {
4483 if (_PyBytes_Resize(&result, nallocated) < 0)
4484 goto error;
4485 } else {
4486 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004487 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004488 goto error;
4489 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4490 }
4491 p = PyBytes_AS_STRING(result) + offset;
4492 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004493
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004494 if (PyBytes_Check(rep)) {
4495 char *prep = PyBytes_AS_STRING(rep);
4496 for(k = repsize; k > 0; k--)
4497 *p++ = *prep++;
4498 } else /* rep is unicode */ {
4499 const Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
4500 Py_UNICODE c;
4501
4502 for(k=0; k<repsize; k++) {
4503 c = prep[k];
4504 if (0x80 <= c) {
4505 raise_encode_exception(&exc, "utf-8",
4506 PyUnicode_AS_UNICODE(unicode),
4507 size, i-1, i,
4508 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004509 goto error;
4510 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004511 *p++ = (char)prep[k];
Victor Stinner31be90b2010-04-22 19:38:16 +00004512 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004513 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004514 Py_DECREF(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004515 } else if (ch < 0x10000) {
4516 *p++ = (char)(0xe0 | (ch >> 12));
4517 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4518 *p++ = (char)(0x80 | (ch & 0x3f));
4519 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00004520 /* Encode UCS4 Unicode ordinals */
4521 *p++ = (char)(0xf0 | (ch >> 18));
4522 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4523 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4524 *p++ = (char)(0x80 | (ch & 0x3f));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004525#if SIZEOF_WCHAR_T == 2
4526 wchar_offset++;
4527#endif
Tim Peters602f7402002-04-27 18:03:26 +00004528 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004529 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004530
Guido van Rossum98297ee2007-11-06 21:34:58 +00004531 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004532 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004533 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004534 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004535 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004536 }
4537 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004538 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004539 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004540 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004541 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004542 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004543
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004544 Py_XDECREF(errorHandler);
4545 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004546 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004547 error:
4548 Py_XDECREF(errorHandler);
4549 Py_XDECREF(exc);
4550 Py_XDECREF(result);
4551 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004552
Tim Peters602f7402002-04-27 18:03:26 +00004553#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004554}
4555
Alexander Belopolsky40018472011-02-26 01:02:56 +00004556PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004557PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4558 Py_ssize_t size,
4559 const char *errors)
4560{
4561 PyObject *v, *unicode;
4562
4563 unicode = PyUnicode_FromUnicode(s, size);
4564 if (unicode == NULL)
4565 return NULL;
4566 v = _PyUnicode_AsUTF8String(unicode, errors);
4567 Py_DECREF(unicode);
4568 return v;
4569}
4570
4571PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004572PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004573{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004574 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004575}
4576
Walter Dörwald41980ca2007-08-16 21:55:45 +00004577/* --- UTF-32 Codec ------------------------------------------------------- */
4578
4579PyObject *
4580PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004581 Py_ssize_t size,
4582 const char *errors,
4583 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004584{
4585 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4586}
4587
4588PyObject *
4589PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004590 Py_ssize_t size,
4591 const char *errors,
4592 int *byteorder,
4593 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004594{
4595 const char *starts = s;
4596 Py_ssize_t startinpos;
4597 Py_ssize_t endinpos;
4598 Py_ssize_t outpos;
4599 PyUnicodeObject *unicode;
4600 Py_UNICODE *p;
4601#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004602 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004603 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004604#else
4605 const int pairs = 0;
4606#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00004607 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004608 int bo = 0; /* assume native ordering by default */
4609 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004610 /* Offsets from q for retrieving bytes in the right order. */
4611#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4612 int iorder[] = {0, 1, 2, 3};
4613#else
4614 int iorder[] = {3, 2, 1, 0};
4615#endif
4616 PyObject *errorHandler = NULL;
4617 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004618
Walter Dörwald41980ca2007-08-16 21:55:45 +00004619 q = (unsigned char *)s;
4620 e = q + size;
4621
4622 if (byteorder)
4623 bo = *byteorder;
4624
4625 /* Check for BOM marks (U+FEFF) in the input and adjust current
4626 byte order setting accordingly. In native mode, the leading BOM
4627 mark is skipped, in all other modes, it is copied to the output
4628 stream as-is (giving a ZWNBSP character). */
4629 if (bo == 0) {
4630 if (size >= 4) {
4631 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004632 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004633#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004634 if (bom == 0x0000FEFF) {
4635 q += 4;
4636 bo = -1;
4637 }
4638 else if (bom == 0xFFFE0000) {
4639 q += 4;
4640 bo = 1;
4641 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004642#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004643 if (bom == 0x0000FEFF) {
4644 q += 4;
4645 bo = 1;
4646 }
4647 else if (bom == 0xFFFE0000) {
4648 q += 4;
4649 bo = -1;
4650 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004651#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004652 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004653 }
4654
4655 if (bo == -1) {
4656 /* force LE */
4657 iorder[0] = 0;
4658 iorder[1] = 1;
4659 iorder[2] = 2;
4660 iorder[3] = 3;
4661 }
4662 else if (bo == 1) {
4663 /* force BE */
4664 iorder[0] = 3;
4665 iorder[1] = 2;
4666 iorder[2] = 1;
4667 iorder[3] = 0;
4668 }
4669
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004670 /* On narrow builds we split characters outside the BMP into two
4671 codepoints => count how much extra space we need. */
4672#ifndef Py_UNICODE_WIDE
4673 for (qq = q; qq < e; qq += 4)
4674 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
4675 pairs++;
4676#endif
4677
4678 /* This might be one to much, because of a BOM */
4679 unicode = _PyUnicode_New((size+3)/4+pairs);
4680 if (!unicode)
4681 return NULL;
4682 if (size == 0)
4683 return (PyObject *)unicode;
4684
4685 /* Unpack UTF-32 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004686 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004687
Walter Dörwald41980ca2007-08-16 21:55:45 +00004688 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004689 Py_UCS4 ch;
4690 /* remaining bytes at the end? (size should be divisible by 4) */
4691 if (e-q<4) {
4692 if (consumed)
4693 break;
4694 errmsg = "truncated data";
4695 startinpos = ((const char *)q)-starts;
4696 endinpos = ((const char *)e)-starts;
4697 goto utf32Error;
4698 /* The remaining input chars are ignored if the callback
4699 chooses to skip the input */
4700 }
4701 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4702 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004703
Benjamin Peterson29060642009-01-31 22:14:21 +00004704 if (ch >= 0x110000)
4705 {
4706 errmsg = "codepoint not in range(0x110000)";
4707 startinpos = ((const char *)q)-starts;
4708 endinpos = startinpos+4;
4709 goto utf32Error;
4710 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004711#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004712 if (ch >= 0x10000)
4713 {
4714 *p++ = 0xD800 | ((ch-0x10000) >> 10);
4715 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
4716 }
4717 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00004718#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004719 *p++ = ch;
4720 q += 4;
4721 continue;
4722 utf32Error:
4723 outpos = p-PyUnicode_AS_UNICODE(unicode);
4724 if (unicode_decode_call_errorhandler(
4725 errors, &errorHandler,
4726 "utf32", errmsg,
4727 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
4728 &unicode, &outpos, &p))
4729 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004730 }
4731
4732 if (byteorder)
4733 *byteorder = bo;
4734
4735 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004736 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004737
4738 /* Adjust length */
Victor Stinnerfe226c02011-10-03 03:52:20 +02004739 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004740 goto onError;
4741
4742 Py_XDECREF(errorHandler);
4743 Py_XDECREF(exc);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004744 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004745 Py_DECREF(unicode);
4746 return NULL;
4747 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004748 return (PyObject *)unicode;
4749
Benjamin Peterson29060642009-01-31 22:14:21 +00004750 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00004751 Py_DECREF(unicode);
4752 Py_XDECREF(errorHandler);
4753 Py_XDECREF(exc);
4754 return NULL;
4755}
4756
4757PyObject *
4758PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004759 Py_ssize_t size,
4760 const char *errors,
4761 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004762{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004763 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004764 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004765 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004766#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004767 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004768#else
4769 const int pairs = 0;
4770#endif
4771 /* Offsets from p for storing byte pairs in the right order. */
4772#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4773 int iorder[] = {0, 1, 2, 3};
4774#else
4775 int iorder[] = {3, 2, 1, 0};
4776#endif
4777
Benjamin Peterson29060642009-01-31 22:14:21 +00004778#define STORECHAR(CH) \
4779 do { \
4780 p[iorder[3]] = ((CH) >> 24) & 0xff; \
4781 p[iorder[2]] = ((CH) >> 16) & 0xff; \
4782 p[iorder[1]] = ((CH) >> 8) & 0xff; \
4783 p[iorder[0]] = (CH) & 0xff; \
4784 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00004785 } while(0)
4786
4787 /* In narrow builds we can output surrogate pairs as one codepoint,
4788 so we need less space. */
4789#ifndef Py_UNICODE_WIDE
4790 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00004791 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
4792 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
4793 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004794#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004795 nsize = (size - pairs + (byteorder == 0));
4796 bytesize = nsize * 4;
4797 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004798 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004799 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004800 if (v == NULL)
4801 return NULL;
4802
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004803 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004804 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004805 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004806 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004807 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004808
4809 if (byteorder == -1) {
4810 /* force LE */
4811 iorder[0] = 0;
4812 iorder[1] = 1;
4813 iorder[2] = 2;
4814 iorder[3] = 3;
4815 }
4816 else if (byteorder == 1) {
4817 /* force BE */
4818 iorder[0] = 3;
4819 iorder[1] = 2;
4820 iorder[2] = 1;
4821 iorder[3] = 0;
4822 }
4823
4824 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004825 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004826#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004827 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
4828 Py_UCS4 ch2 = *s;
4829 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
4830 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
4831 s++;
4832 size--;
4833 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004834 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004835#endif
4836 STORECHAR(ch);
4837 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004838
4839 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004840 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004841#undef STORECHAR
4842}
4843
Alexander Belopolsky40018472011-02-26 01:02:56 +00004844PyObject *
4845PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004846{
4847 if (!PyUnicode_Check(unicode)) {
4848 PyErr_BadArgument();
4849 return NULL;
4850 }
4851 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004852 PyUnicode_GET_SIZE(unicode),
4853 NULL,
4854 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004855}
4856
Guido van Rossumd57fd912000-03-10 22:53:23 +00004857/* --- UTF-16 Codec ------------------------------------------------------- */
4858
Tim Peters772747b2001-08-09 22:21:55 +00004859PyObject *
4860PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004861 Py_ssize_t size,
4862 const char *errors,
4863 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004864{
Walter Dörwald69652032004-09-07 20:24:22 +00004865 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
4866}
4867
Antoine Pitrouab868312009-01-10 15:40:25 +00004868/* Two masks for fast checking of whether a C 'long' may contain
4869 UTF16-encoded surrogate characters. This is an efficient heuristic,
4870 assuming that non-surrogate characters with a code point >= 0x8000 are
4871 rare in most input.
4872 FAST_CHAR_MASK is used when the input is in native byte ordering,
4873 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00004874*/
Antoine Pitrouab868312009-01-10 15:40:25 +00004875#if (SIZEOF_LONG == 8)
4876# define FAST_CHAR_MASK 0x8000800080008000L
4877# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
4878#elif (SIZEOF_LONG == 4)
4879# define FAST_CHAR_MASK 0x80008000L
4880# define SWAPPED_FAST_CHAR_MASK 0x00800080L
4881#else
4882# error C 'long' size should be either 4 or 8!
4883#endif
4884
Walter Dörwald69652032004-09-07 20:24:22 +00004885PyObject *
4886PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004887 Py_ssize_t size,
4888 const char *errors,
4889 int *byteorder,
4890 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00004891{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004892 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004893 Py_ssize_t startinpos;
4894 Py_ssize_t endinpos;
4895 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004896 PyUnicodeObject *unicode;
4897 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004898 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00004899 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00004900 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004901 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00004902 /* Offsets from q for retrieving byte pairs in the right order. */
4903#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4904 int ihi = 1, ilo = 0;
4905#else
4906 int ihi = 0, ilo = 1;
4907#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004908 PyObject *errorHandler = NULL;
4909 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004910
4911 /* Note: size will always be longer than the resulting Unicode
4912 character count */
4913 unicode = _PyUnicode_New(size);
4914 if (!unicode)
4915 return NULL;
4916 if (size == 0)
4917 return (PyObject *)unicode;
4918
4919 /* Unpack UTF-16 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004920 p = PyUnicode_AS_UNICODE(unicode);
Tim Peters772747b2001-08-09 22:21:55 +00004921 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00004922 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004923
4924 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00004925 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004926
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004927 /* Check for BOM marks (U+FEFF) in the input and adjust current
4928 byte order setting accordingly. In native mode, the leading BOM
4929 mark is skipped, in all other modes, it is copied to the output
4930 stream as-is (giving a ZWNBSP character). */
4931 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00004932 if (size >= 2) {
4933 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004934#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004935 if (bom == 0xFEFF) {
4936 q += 2;
4937 bo = -1;
4938 }
4939 else if (bom == 0xFFFE) {
4940 q += 2;
4941 bo = 1;
4942 }
Tim Petersced69f82003-09-16 20:30:58 +00004943#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004944 if (bom == 0xFEFF) {
4945 q += 2;
4946 bo = 1;
4947 }
4948 else if (bom == 0xFFFE) {
4949 q += 2;
4950 bo = -1;
4951 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004952#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004953 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004954 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004955
Tim Peters772747b2001-08-09 22:21:55 +00004956 if (bo == -1) {
4957 /* force LE */
4958 ihi = 1;
4959 ilo = 0;
4960 }
4961 else if (bo == 1) {
4962 /* force BE */
4963 ihi = 0;
4964 ilo = 1;
4965 }
Antoine Pitrouab868312009-01-10 15:40:25 +00004966#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4967 native_ordering = ilo < ihi;
4968#else
4969 native_ordering = ilo > ihi;
4970#endif
Tim Peters772747b2001-08-09 22:21:55 +00004971
Antoine Pitrouab868312009-01-10 15:40:25 +00004972 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00004973 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004974 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00004975 /* First check for possible aligned read of a C 'long'. Unaligned
4976 reads are more expensive, better to defer to another iteration. */
4977 if (!((size_t) q & LONG_PTR_MASK)) {
4978 /* Fast path for runs of non-surrogate chars. */
4979 register const unsigned char *_q = q;
4980 Py_UNICODE *_p = p;
4981 if (native_ordering) {
4982 /* Native ordering is simple: as long as the input cannot
4983 possibly contain a surrogate char, do an unrolled copy
4984 of several 16-bit code points to the target object.
4985 The non-surrogate check is done on several input bytes
4986 at a time (as many as a C 'long' can contain). */
4987 while (_q < aligned_end) {
4988 unsigned long data = * (unsigned long *) _q;
4989 if (data & FAST_CHAR_MASK)
4990 break;
4991 _p[0] = ((unsigned short *) _q)[0];
4992 _p[1] = ((unsigned short *) _q)[1];
4993#if (SIZEOF_LONG == 8)
4994 _p[2] = ((unsigned short *) _q)[2];
4995 _p[3] = ((unsigned short *) _q)[3];
4996#endif
4997 _q += SIZEOF_LONG;
4998 _p += SIZEOF_LONG / 2;
4999 }
5000 }
5001 else {
5002 /* Byteswapped ordering is similar, but we must decompose
5003 the copy bytewise, and take care of zero'ing out the
5004 upper bytes if the target object is in 32-bit units
5005 (that is, in UCS-4 builds). */
5006 while (_q < aligned_end) {
5007 unsigned long data = * (unsigned long *) _q;
5008 if (data & SWAPPED_FAST_CHAR_MASK)
5009 break;
5010 /* Zero upper bytes in UCS-4 builds */
5011#if (Py_UNICODE_SIZE > 2)
5012 _p[0] = 0;
5013 _p[1] = 0;
5014#if (SIZEOF_LONG == 8)
5015 _p[2] = 0;
5016 _p[3] = 0;
5017#endif
5018#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00005019 /* Issue #4916; UCS-4 builds on big endian machines must
5020 fill the two last bytes of each 4-byte unit. */
5021#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
5022# define OFF 2
5023#else
5024# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00005025#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00005026 ((unsigned char *) _p)[OFF + 1] = _q[0];
5027 ((unsigned char *) _p)[OFF + 0] = _q[1];
5028 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
5029 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
5030#if (SIZEOF_LONG == 8)
5031 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
5032 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
5033 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
5034 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
5035#endif
5036#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00005037 _q += SIZEOF_LONG;
5038 _p += SIZEOF_LONG / 2;
5039 }
5040 }
5041 p = _p;
5042 q = _q;
5043 if (q >= e)
5044 break;
5045 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005046 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005047
Benjamin Peterson14339b62009-01-31 16:36:08 +00005048 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00005049
5050 if (ch < 0xD800 || ch > 0xDFFF) {
5051 *p++ = ch;
5052 continue;
5053 }
5054
5055 /* UTF-16 code pair: */
5056 if (q > e) {
5057 errmsg = "unexpected end of data";
5058 startinpos = (((const char *)q) - 2) - starts;
5059 endinpos = ((const char *)e) + 1 - starts;
5060 goto utf16Error;
5061 }
5062 if (0xD800 <= ch && ch <= 0xDBFF) {
5063 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
5064 q += 2;
5065 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00005066#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005067 *p++ = ch;
5068 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005069#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005070 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005071#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005072 continue;
5073 }
5074 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005075 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00005076 startinpos = (((const char *)q)-4)-starts;
5077 endinpos = startinpos+2;
5078 goto utf16Error;
5079 }
5080
Benjamin Peterson14339b62009-01-31 16:36:08 +00005081 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005082 errmsg = "illegal encoding";
5083 startinpos = (((const char *)q)-2)-starts;
5084 endinpos = startinpos+2;
5085 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005086
Benjamin Peterson29060642009-01-31 22:14:21 +00005087 utf16Error:
5088 outpos = p - PyUnicode_AS_UNICODE(unicode);
5089 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005090 errors,
5091 &errorHandler,
5092 "utf16", errmsg,
5093 &starts,
5094 (const char **)&e,
5095 &startinpos,
5096 &endinpos,
5097 &exc,
5098 (const char **)&q,
5099 &unicode,
5100 &outpos,
5101 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00005102 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005103 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005104 /* remaining byte at the end? (size should be even) */
5105 if (e == q) {
5106 if (!consumed) {
5107 errmsg = "truncated data";
5108 startinpos = ((const char *)q) - starts;
5109 endinpos = ((const char *)e) + 1 - starts;
5110 outpos = p - PyUnicode_AS_UNICODE(unicode);
5111 if (unicode_decode_call_errorhandler(
5112 errors,
5113 &errorHandler,
5114 "utf16", errmsg,
5115 &starts,
5116 (const char **)&e,
5117 &startinpos,
5118 &endinpos,
5119 &exc,
5120 (const char **)&q,
5121 &unicode,
5122 &outpos,
5123 &p))
5124 goto onError;
5125 /* The remaining input chars are ignored if the callback
5126 chooses to skip the input */
5127 }
5128 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005129
5130 if (byteorder)
5131 *byteorder = bo;
5132
Walter Dörwald69652032004-09-07 20:24:22 +00005133 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005134 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005135
Guido van Rossumd57fd912000-03-10 22:53:23 +00005136 /* Adjust length */
Victor Stinnerfe226c02011-10-03 03:52:20 +02005137 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005138 goto onError;
5139
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005140 Py_XDECREF(errorHandler);
5141 Py_XDECREF(exc);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005142 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005143 Py_DECREF(unicode);
5144 return NULL;
5145 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005146 return (PyObject *)unicode;
5147
Benjamin Peterson29060642009-01-31 22:14:21 +00005148 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005149 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005150 Py_XDECREF(errorHandler);
5151 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005152 return NULL;
5153}
5154
Antoine Pitrouab868312009-01-10 15:40:25 +00005155#undef FAST_CHAR_MASK
5156#undef SWAPPED_FAST_CHAR_MASK
5157
Tim Peters772747b2001-08-09 22:21:55 +00005158PyObject *
5159PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005160 Py_ssize_t size,
5161 const char *errors,
5162 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005163{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005164 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00005165 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005166 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005167#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005168 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005169#else
5170 const int pairs = 0;
5171#endif
Tim Peters772747b2001-08-09 22:21:55 +00005172 /* Offsets from p for storing byte pairs in the right order. */
5173#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5174 int ihi = 1, ilo = 0;
5175#else
5176 int ihi = 0, ilo = 1;
5177#endif
5178
Benjamin Peterson29060642009-01-31 22:14:21 +00005179#define STORECHAR(CH) \
5180 do { \
5181 p[ihi] = ((CH) >> 8) & 0xff; \
5182 p[ilo] = (CH) & 0xff; \
5183 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00005184 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005185
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005186#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005187 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00005188 if (s[i] >= 0x10000)
5189 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005190#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005191 /* 2 * (size + pairs + (byteorder == 0)) */
5192 if (size > PY_SSIZE_T_MAX ||
5193 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005194 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005195 nsize = size + pairs + (byteorder == 0);
5196 bytesize = nsize * 2;
5197 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005198 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005199 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005200 if (v == NULL)
5201 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005202
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005203 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005204 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005205 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005206 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005207 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005208
5209 if (byteorder == -1) {
5210 /* force LE */
5211 ihi = 1;
5212 ilo = 0;
5213 }
5214 else if (byteorder == 1) {
5215 /* force BE */
5216 ihi = 0;
5217 ilo = 1;
5218 }
5219
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005220 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005221 Py_UNICODE ch = *s++;
5222 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005223#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005224 if (ch >= 0x10000) {
5225 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
5226 ch = 0xD800 | ((ch-0x10000) >> 10);
5227 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005228#endif
Tim Peters772747b2001-08-09 22:21:55 +00005229 STORECHAR(ch);
5230 if (ch2)
5231 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005232 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005233
5234 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005235 return v;
Tim Peters772747b2001-08-09 22:21:55 +00005236#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005237}
5238
Alexander Belopolsky40018472011-02-26 01:02:56 +00005239PyObject *
5240PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005241{
5242 if (!PyUnicode_Check(unicode)) {
5243 PyErr_BadArgument();
5244 return NULL;
5245 }
5246 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005247 PyUnicode_GET_SIZE(unicode),
5248 NULL,
5249 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005250}
5251
5252/* --- Unicode Escape Codec ----------------------------------------------- */
5253
/* Helper function for PyUnicode_DecodeUnicodeEscape: determines whether
   the escaped input decodes to a pure ASCII string and, if so, how many
   characters the decoded string contains.

   s    -- input bytes; only the first `size` bytes are examined
   size -- number of bytes in s

   Returns the length of the buffer required to hold the decoded string,
   or -1 when the fast ASCII path cannot be used:
     - size is negative,
     - a byte above 127 is present,
     - the input ends directly after a backslash,
     - an octal, \x, \u, \U or \N escape is present (these do not
       guarantee ASCII characters).
 */
Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *p;
    const unsigned char *end;
    Py_ssize_t length = 0;

    /* Reject a negative size before forming `end`: computing s + size
       with a negative size is out-of-range pointer arithmetic. */
    if (size < 0)
        return -1;

    p = (const unsigned char *)s;
    end = p + size;

    for (; p < end; ++p) {
        if (*p > 127) {
            /* Non-ASCII */
            return -1;
        }
        else if (*p != '\\') {
            /* Normal character */
            ++length;
        }
        else {
            /* Backslash-escape, check next char */
            ++p;
            /* Escape sequence reaches till end of string or
               non-ASCII follow-up. */
            if (p >= end || *p > 127)
                return -1;
            switch (*p) {
            case '\n':
                /* backslash + \n result in zero characters */
                break;
            case '\\': case '\'': case '\"':
            case 'b': case 'f': case 't':
            case 'n': case 'r': case 'v': case 'a':
                /* two input bytes collapse into one character */
                ++length;
                break;
            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7':
            case 'x': case 'u': case 'U': case 'N':
                /* these do not guarantee ASCII characters */
                return -1;
            default:
                /* unknown escape: count the backslash + the other
                   character, both are kept verbatim */
                length += 2;
                break;
            }
        }
    }
    return length;
}
5308
/* Similar to PyUnicode_WRITE but either write into wstr field
   or treat string as ASCII.

   kind  -- PyUnicode_WCHAR_KIND selects a Py_UNICODE store; any other
            value selects a one-byte (unsigned char) store
   buf   -- destination buffer, cast according to `kind`
   index -- element index into buf
   value -- value to store, cast to the element type

   `index` and `value` are each expanded once per branch, so arguments
   with side effects (e.g. i++) are evaluated exactly once. */
#define WRITE_ASCII_OR_WSTR(kind, buf, index, value) \
    do { \
        if ((kind) != PyUnicode_WCHAR_KIND) \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
        else \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \
    } while (0)

/* Store `value` into a Py_UNICODE buffer at `index`.

   NOTE: this macro does not take `kind` as an argument; it refers to a
   variable named `kind` in the scope of the expansion site and asserts
   that it equals PyUnicode_WCHAR_KIND.  The expansion is a comma
   expression (not a do/while statement). */
#define WRITE_WSTR(buf, index, value) \
    assert(kind == PyUnicode_WCHAR_KIND), \
    ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value)
5322
5323
Fredrik Lundh06d12682001-01-24 07:59:11 +00005324static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005325
Alexander Belopolsky40018472011-02-26 01:02:56 +00005326PyObject *
5327PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005328 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005329 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005330{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005331 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005332 Py_ssize_t startinpos;
5333 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005334 int j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005335 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005336 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005337 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005338 char* message;
5339 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005340 PyObject *errorHandler = NULL;
5341 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005342 Py_ssize_t ascii_length;
5343 Py_ssize_t i;
5344 int kind;
5345 void *data;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005346
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005347 ascii_length = length_of_escaped_ascii_string(s, size);
5348
5349 /* After length_of_escaped_ascii_string() there are two alternatives,
5350 either the string is pure ASCII with named escapes like \n, etc.
5351 and we determined it's exact size (common case)
5352 or it contains \x, \u, ... escape sequences. then we create a
5353 legacy wchar string and resize it at the end of this function. */
5354 if (ascii_length >= 0) {
5355 v = (PyUnicodeObject *)PyUnicode_New(ascii_length, 127);
5356 if (!v)
5357 goto onError;
5358 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
5359 kind = PyUnicode_1BYTE_KIND;
5360 data = PyUnicode_DATA(v);
5361 }
5362 else {
5363 /* Escaped strings will always be longer than the resulting
5364 Unicode string, so we start with size here and then reduce the
5365 length after conversion to the true value.
5366 (but if the error callback returns a long replacement string
5367 we'll have to allocate more space) */
5368 v = _PyUnicode_New(size);
5369 if (!v)
5370 goto onError;
5371 kind = PyUnicode_WCHAR_KIND;
5372 data = PyUnicode_AS_UNICODE(v);
5373 }
5374
Guido van Rossumd57fd912000-03-10 22:53:23 +00005375 if (size == 0)
5376 return (PyObject *)v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005377 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005378 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005379
Guido van Rossumd57fd912000-03-10 22:53:23 +00005380 while (s < end) {
5381 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005382 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005383 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005384
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005385 if (kind == PyUnicode_WCHAR_KIND) {
5386 assert(i < _PyUnicode_WSTR_LENGTH(v));
5387 }
5388 else {
5389 /* The only case in which i == ascii_length is a backslash
5390 followed by a newline. */
5391 assert(i <= ascii_length);
5392 }
5393
Guido van Rossumd57fd912000-03-10 22:53:23 +00005394 /* Non-escape characters are interpreted as Unicode ordinals */
5395 if (*s != '\\') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005396 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char) *s++);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005397 continue;
5398 }
5399
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005400 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005401 /* \ - Escapes */
5402 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005403 c = *s++;
5404 if (s > end)
5405 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005406
5407 if (kind == PyUnicode_WCHAR_KIND) {
5408 assert(i < _PyUnicode_WSTR_LENGTH(v));
5409 }
5410 else {
5411 /* The only case in which i == ascii_length is a backslash
5412 followed by a newline. */
5413 assert(i < ascii_length || (i == ascii_length && c == '\n'));
5414 }
5415
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005416 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005417
Benjamin Peterson29060642009-01-31 22:14:21 +00005418 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005419 case '\n': break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005420 case '\\': WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); break;
5421 case '\'': WRITE_ASCII_OR_WSTR(kind, data, i++, '\''); break;
5422 case '\"': WRITE_ASCII_OR_WSTR(kind, data, i++, '\"'); break;
5423 case 'b': WRITE_ASCII_OR_WSTR(kind, data, i++, '\b'); break;
5424 /* FF */
5425 case 'f': WRITE_ASCII_OR_WSTR(kind, data, i++, '\014'); break;
5426 case 't': WRITE_ASCII_OR_WSTR(kind, data, i++, '\t'); break;
5427 case 'n': WRITE_ASCII_OR_WSTR(kind, data, i++, '\n'); break;
5428 case 'r': WRITE_ASCII_OR_WSTR(kind, data, i++, '\r'); break;
5429 /* VT */
5430 case 'v': WRITE_ASCII_OR_WSTR(kind, data, i++, '\013'); break;
5431 /* BEL, not classic C */
5432 case 'a': WRITE_ASCII_OR_WSTR(kind, data, i++, '\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005433
Benjamin Peterson29060642009-01-31 22:14:21 +00005434 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005435 case '0': case '1': case '2': case '3':
5436 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005437 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005438 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005439 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005440 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005441 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005442 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005443 WRITE_WSTR(data, i++, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005444 break;
5445
Benjamin Peterson29060642009-01-31 22:14:21 +00005446 /* hex escapes */
5447 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005448 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005449 digits = 2;
5450 message = "truncated \\xXX escape";
5451 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005452
Benjamin Peterson29060642009-01-31 22:14:21 +00005453 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005454 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005455 digits = 4;
5456 message = "truncated \\uXXXX escape";
5457 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005458
Benjamin Peterson29060642009-01-31 22:14:21 +00005459 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005460 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005461 digits = 8;
5462 message = "truncated \\UXXXXXXXX escape";
5463 hexescape:
5464 chr = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005465 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005466 if (s+digits>end) {
5467 endinpos = size;
5468 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005469 errors, &errorHandler,
5470 "unicodeescape", "end of string in escape sequence",
5471 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005472 &v, &i, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005473 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005474 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005475 goto nextByte;
5476 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005477 for (j = 0; j < digits; ++j) {
5478 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005479 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005480 endinpos = (s+j+1)-starts;
5481 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005482 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005483 errors, &errorHandler,
5484 "unicodeescape", message,
5485 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005486 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005487 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005488 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005489 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005490 }
5491 chr = (chr<<4) & ~0xF;
5492 if (c >= '0' && c <= '9')
5493 chr += c - '0';
5494 else if (c >= 'a' && c <= 'f')
5495 chr += 10 + c - 'a';
5496 else
5497 chr += 10 + c - 'A';
5498 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005499 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005500 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005501 /* _decoding_error will have already written into the
5502 target buffer. */
5503 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005504 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005505 /* when we get here, chr is a 32-bit unicode character */
5506 if (chr <= 0xffff)
5507 /* UCS-2 character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005508 WRITE_WSTR(data, i++, chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005509 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005510 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00005511 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00005512#ifdef Py_UNICODE_WIDE
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005513 WRITE_WSTR(data, i++, chr);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005514#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00005515 chr -= 0x10000L;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005516 WRITE_WSTR(data, i++, 0xD800 + (Py_UNICODE) (chr >> 10));
5517 WRITE_WSTR(data, i++, 0xDC00 + (Py_UNICODE) (chr & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005518#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00005519 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005520 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005521 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005522 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005523 errors, &errorHandler,
5524 "unicodeescape", "illegal Unicode character",
5525 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005526 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005527 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005528 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005529 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005530 break;
5531
Benjamin Peterson29060642009-01-31 22:14:21 +00005532 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005533 case 'N':
5534 message = "malformed \\N character escape";
5535 if (ucnhash_CAPI == NULL) {
5536 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005537 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5538 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005539 if (ucnhash_CAPI == NULL)
5540 goto ucnhashError;
5541 }
5542 if (*s == '{') {
5543 const char *start = s+1;
5544 /* look for the closing brace */
5545 while (*s != '}' && s < end)
5546 s++;
5547 if (s > start && s < end && *s == '}') {
5548 /* found a name. look it up in the unicode database */
5549 message = "unknown Unicode character name";
5550 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005551 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
5552 &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005553 goto store;
5554 }
5555 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005556 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005557 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005558 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005559 errors, &errorHandler,
5560 "unicodeescape", message,
5561 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005562 &v, &i, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005563 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005564 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005565 break;
5566
5567 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005568 if (s > end) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005569 assert(kind == PyUnicode_WCHAR_KIND);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005570 message = "\\ at end of string";
5571 s--;
5572 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005573 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005574 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005575 errors, &errorHandler,
5576 "unicodeescape", message,
5577 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005578 &v, &i, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00005579 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005580 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005581 }
5582 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005583 WRITE_ASCII_OR_WSTR(kind, data, i++, '\\');
5584 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005585 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005586 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005587 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005588 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005589 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005590 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005591 /* Ensure the length prediction worked in case of ASCII strings */
5592 assert(kind == PyUnicode_WCHAR_KIND || i == ascii_length);
5593
Victor Stinnerfe226c02011-10-03 03:52:20 +02005594 if (kind == PyUnicode_WCHAR_KIND)
5595 {
5596 if (PyUnicode_Resize((PyObject**)&v, i) < 0)
5597 goto onError;
Victor Stinnerfe226c02011-10-03 03:52:20 +02005598 }
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005599 Py_XDECREF(errorHandler);
5600 Py_XDECREF(exc);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005601 if (_PyUnicode_READY_REPLACE(&v)) {
5602 Py_DECREF(v);
5603 return NULL;
5604 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005605 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00005606
Benjamin Peterson29060642009-01-31 22:14:21 +00005607 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005608 PyErr_SetString(
5609 PyExc_UnicodeError,
5610 "\\N escapes not supported (can't load unicodedata module)"
5611 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005612 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005613 Py_XDECREF(errorHandler);
5614 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005615 return NULL;
5616
Benjamin Peterson29060642009-01-31 22:14:21 +00005617 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005618 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005619 Py_XDECREF(errorHandler);
5620 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005621 return NULL;
5622}
5623
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005624#undef WRITE_ASCII_OR_WSTR
5625#undef WRITE_WSTR
5626
Guido van Rossumd57fd912000-03-10 22:53:23 +00005627/* Return a Unicode-Escape string version of the Unicode object.
5628
5629 If quotes is true, the string is enclosed in u"" or u'' quotes as
5630 appropriate.
5631
5632*/
5633
Walter Dörwald79e913e2007-05-12 11:08:06 +00005634static const char *hexdigits = "0123456789abcdef";
5635
Alexander Belopolsky40018472011-02-26 01:02:56 +00005636PyObject *
5637PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005638 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005639{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005640 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005641 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005642
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005643#ifdef Py_UNICODE_WIDE
5644 const Py_ssize_t expandsize = 10;
5645#else
5646 const Py_ssize_t expandsize = 6;
5647#endif
5648
Thomas Wouters89f507f2006-12-13 04:49:30 +00005649 /* XXX(nnorwitz): rather than over-allocating, it would be
5650 better to choose a different scheme. Perhaps scan the
5651 first N-chars of the string and allocate based on that size.
5652 */
5653 /* Initial allocation is based on the longest-possible unichr
5654 escape.
5655
5656 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5657 unichr, so in this case it's the longest unichr escape. In
5658 narrow (UTF-16) builds this is five chars per source unichr
5659 since there are two unichrs in the surrogate pair, so in narrow
5660 (UTF-16) builds it's not the longest unichr escape.
5661
5662 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5663 so in the narrow (UTF-16) build case it's the longest unichr
5664 escape.
5665 */
5666
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005667 if (size == 0)
5668 return PyBytes_FromStringAndSize(NULL, 0);
5669
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005670 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005671 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005672
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005673 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005674 2
5675 + expandsize*size
5676 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005677 if (repr == NULL)
5678 return NULL;
5679
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005680 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005681
Guido van Rossumd57fd912000-03-10 22:53:23 +00005682 while (size-- > 0) {
5683 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005684
Walter Dörwald79e913e2007-05-12 11:08:06 +00005685 /* Escape backslashes */
5686 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005687 *p++ = '\\';
5688 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005689 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005690 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005691
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00005692#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005693 /* Map 21-bit characters to '\U00xxxxxx' */
5694 else if (ch >= 0x10000) {
5695 *p++ = '\\';
5696 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005697 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
5698 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
5699 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
5700 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
5701 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
5702 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
5703 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
5704 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005705 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005706 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005707#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005708 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5709 else if (ch >= 0xD800 && ch < 0xDC00) {
5710 Py_UNICODE ch2;
5711 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00005712
Benjamin Peterson29060642009-01-31 22:14:21 +00005713 ch2 = *s++;
5714 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005715 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005716 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5717 *p++ = '\\';
5718 *p++ = 'U';
5719 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
5720 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
5721 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
5722 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
5723 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
5724 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
5725 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
5726 *p++ = hexdigits[ucs & 0x0000000F];
5727 continue;
5728 }
5729 /* Fall through: isolated surrogates are copied as-is */
5730 s--;
5731 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005732 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005733#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005734
Guido van Rossumd57fd912000-03-10 22:53:23 +00005735 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005736 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005737 *p++ = '\\';
5738 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005739 *p++ = hexdigits[(ch >> 12) & 0x000F];
5740 *p++ = hexdigits[(ch >> 8) & 0x000F];
5741 *p++ = hexdigits[(ch >> 4) & 0x000F];
5742 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005743 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005744
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005745 /* Map special whitespace to '\t', \n', '\r' */
5746 else if (ch == '\t') {
5747 *p++ = '\\';
5748 *p++ = 't';
5749 }
5750 else if (ch == '\n') {
5751 *p++ = '\\';
5752 *p++ = 'n';
5753 }
5754 else if (ch == '\r') {
5755 *p++ = '\\';
5756 *p++ = 'r';
5757 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005758
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005759 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005760 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005761 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005762 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005763 *p++ = hexdigits[(ch >> 4) & 0x000F];
5764 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005765 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005766
Guido van Rossumd57fd912000-03-10 22:53:23 +00005767 /* Copy everything else as-is */
5768 else
5769 *p++ = (char) ch;
5770 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005771
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005772 assert(p - PyBytes_AS_STRING(repr) > 0);
5773 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5774 return NULL;
5775 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005776}
5777
Alexander Belopolsky40018472011-02-26 01:02:56 +00005778PyObject *
5779PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005780{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005781 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005782 if (!PyUnicode_Check(unicode)) {
5783 PyErr_BadArgument();
5784 return NULL;
5785 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00005786 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5787 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005788 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005789}
5790
5791/* --- Raw Unicode Escape Codec ------------------------------------------- */
5792
Alexander Belopolsky40018472011-02-26 01:02:56 +00005793PyObject *
5794PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005795 Py_ssize_t size,
5796 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005797{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005798 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005799 Py_ssize_t startinpos;
5800 Py_ssize_t endinpos;
5801 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005802 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005803 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005804 const char *end;
5805 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005806 PyObject *errorHandler = NULL;
5807 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005808
Guido van Rossumd57fd912000-03-10 22:53:23 +00005809 /* Escaped strings will always be longer than the resulting
5810 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005811 length after conversion to the true value. (But decoding error
5812 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005813 v = _PyUnicode_New(size);
5814 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005815 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005816 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005817 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005818 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005819 end = s + size;
5820 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005821 unsigned char c;
5822 Py_UCS4 x;
5823 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005824 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005825
Benjamin Peterson29060642009-01-31 22:14:21 +00005826 /* Non-escape characters are interpreted as Unicode ordinals */
5827 if (*s != '\\') {
5828 *p++ = (unsigned char)*s++;
5829 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005830 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005831 startinpos = s-starts;
5832
5833 /* \u-escapes are only interpreted iff the number of leading
5834 backslashes if odd */
5835 bs = s;
5836 for (;s < end;) {
5837 if (*s != '\\')
5838 break;
5839 *p++ = (unsigned char)*s++;
5840 }
5841 if (((s - bs) & 1) == 0 ||
5842 s >= end ||
5843 (*s != 'u' && *s != 'U')) {
5844 continue;
5845 }
5846 p--;
5847 count = *s=='u' ? 4 : 8;
5848 s++;
5849
5850 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
5851 outpos = p-PyUnicode_AS_UNICODE(v);
5852 for (x = 0, i = 0; i < count; ++i, ++s) {
5853 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00005854 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005855 endinpos = s-starts;
5856 if (unicode_decode_call_errorhandler(
5857 errors, &errorHandler,
5858 "rawunicodeescape", "truncated \\uXXXX",
5859 &starts, &end, &startinpos, &endinpos, &exc, &s,
5860 &v, &outpos, &p))
5861 goto onError;
5862 goto nextByte;
5863 }
5864 x = (x<<4) & ~0xF;
5865 if (c >= '0' && c <= '9')
5866 x += c - '0';
5867 else if (c >= 'a' && c <= 'f')
5868 x += 10 + c - 'a';
5869 else
5870 x += 10 + c - 'A';
5871 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00005872 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00005873 /* UCS-2 character */
5874 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005875 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005876 /* UCS-4 character. Either store directly, or as
5877 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00005878#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005879 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005880#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005881 x -= 0x10000L;
5882 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
5883 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00005884#endif
5885 } else {
5886 endinpos = s-starts;
5887 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005888 if (unicode_decode_call_errorhandler(
5889 errors, &errorHandler,
5890 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00005891 &starts, &end, &startinpos, &endinpos, &exc, &s,
5892 &v, &outpos, &p))
5893 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005894 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005895 nextByte:
5896 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005897 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02005898 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005899 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005900 Py_XDECREF(errorHandler);
5901 Py_XDECREF(exc);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005902 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005903 Py_DECREF(v);
5904 return NULL;
5905 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005906 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00005907
Benjamin Peterson29060642009-01-31 22:14:21 +00005908 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005909 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005910 Py_XDECREF(errorHandler);
5911 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005912 return NULL;
5913}
5914
Alexander Belopolsky40018472011-02-26 01:02:56 +00005915PyObject *
5916PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005917 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005918{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005919 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005920 char *p;
5921 char *q;
5922
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005923#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005924 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005925#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005926 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005927#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00005928
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005929 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005930 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00005931
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005932 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005933 if (repr == NULL)
5934 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005935 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005936 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005937
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005938 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005939 while (size-- > 0) {
5940 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005941#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005942 /* Map 32-bit characters to '\Uxxxxxxxx' */
5943 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005944 *p++ = '\\';
5945 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00005946 *p++ = hexdigits[(ch >> 28) & 0xf];
5947 *p++ = hexdigits[(ch >> 24) & 0xf];
5948 *p++ = hexdigits[(ch >> 20) & 0xf];
5949 *p++ = hexdigits[(ch >> 16) & 0xf];
5950 *p++ = hexdigits[(ch >> 12) & 0xf];
5951 *p++ = hexdigits[(ch >> 8) & 0xf];
5952 *p++ = hexdigits[(ch >> 4) & 0xf];
5953 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00005954 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005955 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00005956#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005957 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5958 if (ch >= 0xD800 && ch < 0xDC00) {
5959 Py_UNICODE ch2;
5960 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005961
Benjamin Peterson29060642009-01-31 22:14:21 +00005962 ch2 = *s++;
5963 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005964 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005965 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5966 *p++ = '\\';
5967 *p++ = 'U';
5968 *p++ = hexdigits[(ucs >> 28) & 0xf];
5969 *p++ = hexdigits[(ucs >> 24) & 0xf];
5970 *p++ = hexdigits[(ucs >> 20) & 0xf];
5971 *p++ = hexdigits[(ucs >> 16) & 0xf];
5972 *p++ = hexdigits[(ucs >> 12) & 0xf];
5973 *p++ = hexdigits[(ucs >> 8) & 0xf];
5974 *p++ = hexdigits[(ucs >> 4) & 0xf];
5975 *p++ = hexdigits[ucs & 0xf];
5976 continue;
5977 }
5978 /* Fall through: isolated surrogates are copied as-is */
5979 s--;
5980 size++;
5981 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005982#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005983 /* Map 16-bit characters to '\uxxxx' */
5984 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005985 *p++ = '\\';
5986 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00005987 *p++ = hexdigits[(ch >> 12) & 0xf];
5988 *p++ = hexdigits[(ch >> 8) & 0xf];
5989 *p++ = hexdigits[(ch >> 4) & 0xf];
5990 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005991 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005992 /* Copy everything else as-is */
5993 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00005994 *p++ = (char) ch;
5995 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005996 size = p - q;
5997
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005998 assert(size > 0);
5999 if (_PyBytes_Resize(&repr, size) < 0)
6000 return NULL;
6001 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006002}
6003
Alexander Belopolsky40018472011-02-26 01:02:56 +00006004PyObject *
6005PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006006{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006007 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006008 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00006009 PyErr_BadArgument();
6010 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006011 }
Walter Dörwald711005d2007-05-12 12:03:26 +00006012 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
6013 PyUnicode_GET_SIZE(unicode));
6014
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006015 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006016}
6017
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006018/* --- Unicode Internal Codec ------------------------------------------- */
6019
Alexander Belopolsky40018472011-02-26 01:02:56 +00006020PyObject *
6021_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006022 Py_ssize_t size,
6023 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006024{
6025 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006026 Py_ssize_t startinpos;
6027 Py_ssize_t endinpos;
6028 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006029 PyUnicodeObject *v;
6030 Py_UNICODE *p;
6031 const char *end;
6032 const char *reason;
6033 PyObject *errorHandler = NULL;
6034 PyObject *exc = NULL;
6035
Neal Norwitzd43069c2006-01-08 01:12:10 +00006036#ifdef Py_UNICODE_WIDE
6037 Py_UNICODE unimax = PyUnicode_GetMax();
6038#endif
6039
Thomas Wouters89f507f2006-12-13 04:49:30 +00006040 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006041 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
6042 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006043 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006044 /* Intentionally PyUnicode_GET_SIZE instead of PyUnicode_GET_LENGTH
6045 as string was created with the old API. */
6046 if (PyUnicode_GET_SIZE(v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006047 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006048 p = PyUnicode_AS_UNICODE(v);
6049 end = s + size;
6050
6051 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006052 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006053 /* We have to sanity check the raw data, otherwise doom looms for
6054 some malformed UCS-4 data. */
6055 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006056#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006057 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006058#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006059 end-s < Py_UNICODE_SIZE
6060 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006061 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006062 startinpos = s - starts;
6063 if (end-s < Py_UNICODE_SIZE) {
6064 endinpos = end-starts;
6065 reason = "truncated input";
6066 }
6067 else {
6068 endinpos = s - starts + Py_UNICODE_SIZE;
6069 reason = "illegal code point (> 0x10FFFF)";
6070 }
6071 outpos = p - PyUnicode_AS_UNICODE(v);
6072 if (unicode_decode_call_errorhandler(
6073 errors, &errorHandler,
6074 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006075 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00006076 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006077 goto onError;
6078 }
6079 }
6080 else {
6081 p++;
6082 s += Py_UNICODE_SIZE;
6083 }
6084 }
6085
Victor Stinnerfe226c02011-10-03 03:52:20 +02006086 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006087 goto onError;
6088 Py_XDECREF(errorHandler);
6089 Py_XDECREF(exc);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006090 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006091 Py_DECREF(v);
6092 return NULL;
6093 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006094 return (PyObject *)v;
6095
Benjamin Peterson29060642009-01-31 22:14:21 +00006096 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006097 Py_XDECREF(v);
6098 Py_XDECREF(errorHandler);
6099 Py_XDECREF(exc);
6100 return NULL;
6101}
6102
Guido van Rossumd57fd912000-03-10 22:53:23 +00006103/* --- Latin-1 Codec ------------------------------------------------------ */
6104
Alexander Belopolsky40018472011-02-26 01:02:56 +00006105PyObject *
6106PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006107 Py_ssize_t size,
6108 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006109{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006110 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006111 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006112}
6113
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006114/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006115static void
6116make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006117 const char *encoding,
6118 const Py_UNICODE *unicode, Py_ssize_t size,
6119 Py_ssize_t startpos, Py_ssize_t endpos,
6120 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006121{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006122 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006123 *exceptionObject = PyUnicodeEncodeError_Create(
6124 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006125 }
6126 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006127 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6128 goto onError;
6129 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6130 goto onError;
6131 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6132 goto onError;
6133 return;
6134 onError:
6135 Py_DECREF(*exceptionObject);
6136 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006137 }
6138}
6139
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006140/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006141static void
6142raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006143 const char *encoding,
6144 const Py_UNICODE *unicode, Py_ssize_t size,
6145 Py_ssize_t startpos, Py_ssize_t endpos,
6146 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006147{
6148 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006149 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006150 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006151 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006152}
6153
6154/* error handling callback helper:
6155 build arguments, call the callback and check the arguments,
6156 put the result into newpos and return the replacement string, which
6157 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006158static PyObject *
6159unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006160 PyObject **errorHandler,
6161 const char *encoding, const char *reason,
6162 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
6163 Py_ssize_t startpos, Py_ssize_t endpos,
6164 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006165{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006166 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006167
6168 PyObject *restuple;
6169 PyObject *resunicode;
6170
6171 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006172 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006173 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006174 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006175 }
6176
6177 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006178 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006179 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006180 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006181
6182 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006183 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006184 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006185 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006186 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006187 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006188 Py_DECREF(restuple);
6189 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006190 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006191 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006192 &resunicode, newpos)) {
6193 Py_DECREF(restuple);
6194 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006195 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006196 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6197 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6198 Py_DECREF(restuple);
6199 return NULL;
6200 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006201 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006202 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006203 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006204 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6205 Py_DECREF(restuple);
6206 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006207 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006208 Py_INCREF(resunicode);
6209 Py_DECREF(restuple);
6210 return resunicode;
6211}
6212
Alexander Belopolsky40018472011-02-26 01:02:56 +00006213static PyObject *
6214unicode_encode_ucs1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006215 Py_ssize_t size,
6216 const char *errors,
6217 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006218{
6219 /* output object */
6220 PyObject *res;
6221 /* pointers to the beginning and end+1 of input */
6222 const Py_UNICODE *startp = p;
6223 const Py_UNICODE *endp = p + size;
6224 /* pointer to the beginning of the unencodable characters */
6225 /* const Py_UNICODE *badp = NULL; */
6226 /* pointer into the output */
6227 char *str;
6228 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006229 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006230 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6231 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006232 PyObject *errorHandler = NULL;
6233 PyObject *exc = NULL;
6234 /* the following variable is used for caching string comparisons
6235 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6236 int known_errorHandler = -1;
6237
6238 /* allocate enough for a simple encoding without
6239 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006240 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006241 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006242 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006243 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006244 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006245 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006246 ressize = size;
6247
6248 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006249 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006250
Benjamin Peterson29060642009-01-31 22:14:21 +00006251 /* can we encode this? */
6252 if (c<limit) {
6253 /* no overflow check, because we know that the space is enough */
6254 *str++ = (char)c;
6255 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006256 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006257 else {
6258 Py_ssize_t unicodepos = p-startp;
6259 Py_ssize_t requiredsize;
6260 PyObject *repunicode;
6261 Py_ssize_t repsize;
6262 Py_ssize_t newpos;
6263 Py_ssize_t respos;
6264 Py_UNICODE *uni2;
6265 /* startpos for collecting unencodable chars */
6266 const Py_UNICODE *collstart = p;
6267 const Py_UNICODE *collend = p;
6268 /* find all unecodable characters */
6269 while ((collend < endp) && ((*collend)>=limit))
6270 ++collend;
6271 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6272 if (known_errorHandler==-1) {
6273 if ((errors==NULL) || (!strcmp(errors, "strict")))
6274 known_errorHandler = 1;
6275 else if (!strcmp(errors, "replace"))
6276 known_errorHandler = 2;
6277 else if (!strcmp(errors, "ignore"))
6278 known_errorHandler = 3;
6279 else if (!strcmp(errors, "xmlcharrefreplace"))
6280 known_errorHandler = 4;
6281 else
6282 known_errorHandler = 0;
6283 }
6284 switch (known_errorHandler) {
6285 case 1: /* strict */
6286 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
6287 goto onError;
6288 case 2: /* replace */
6289 while (collstart++<collend)
6290 *str++ = '?'; /* fall through */
6291 case 3: /* ignore */
6292 p = collend;
6293 break;
6294 case 4: /* xmlcharrefreplace */
6295 respos = str - PyBytes_AS_STRING(res);
6296 /* determine replacement size (temporarily (mis)uses p) */
6297 for (p = collstart, repsize = 0; p < collend; ++p) {
6298 if (*p<10)
6299 repsize += 2+1+1;
6300 else if (*p<100)
6301 repsize += 2+2+1;
6302 else if (*p<1000)
6303 repsize += 2+3+1;
6304 else if (*p<10000)
6305 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006306#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006307 else
6308 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006309#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006310 else if (*p<100000)
6311 repsize += 2+5+1;
6312 else if (*p<1000000)
6313 repsize += 2+6+1;
6314 else
6315 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00006316#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00006317 }
6318 requiredsize = respos+repsize+(endp-collend);
6319 if (requiredsize > ressize) {
6320 if (requiredsize<2*ressize)
6321 requiredsize = 2*ressize;
6322 if (_PyBytes_Resize(&res, requiredsize))
6323 goto onError;
6324 str = PyBytes_AS_STRING(res) + respos;
6325 ressize = requiredsize;
6326 }
6327 /* generate replacement (temporarily (mis)uses p) */
6328 for (p = collstart; p < collend; ++p) {
6329 str += sprintf(str, "&#%d;", (int)*p);
6330 }
6331 p = collend;
6332 break;
6333 default:
6334 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6335 encoding, reason, startp, size, &exc,
6336 collstart-startp, collend-startp, &newpos);
6337 if (repunicode == NULL)
6338 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006339 if (PyBytes_Check(repunicode)) {
6340 /* Directly copy bytes result to output. */
6341 repsize = PyBytes_Size(repunicode);
6342 if (repsize > 1) {
6343 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006344 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006345 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6346 Py_DECREF(repunicode);
6347 goto onError;
6348 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006349 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006350 ressize += repsize-1;
6351 }
6352 memcpy(str, PyBytes_AsString(repunicode), repsize);
6353 str += repsize;
6354 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006355 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006356 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006357 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006358 /* need more space? (at least enough for what we
6359 have+the replacement+the rest of the string, so
6360 we won't have to check space for encodable characters) */
6361 respos = str - PyBytes_AS_STRING(res);
6362 repsize = PyUnicode_GET_SIZE(repunicode);
6363 requiredsize = respos+repsize+(endp-collend);
6364 if (requiredsize > ressize) {
6365 if (requiredsize<2*ressize)
6366 requiredsize = 2*ressize;
6367 if (_PyBytes_Resize(&res, requiredsize)) {
6368 Py_DECREF(repunicode);
6369 goto onError;
6370 }
6371 str = PyBytes_AS_STRING(res) + respos;
6372 ressize = requiredsize;
6373 }
6374 /* check if there is anything unencodable in the replacement
6375 and copy it to the output */
6376 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
6377 c = *uni2;
6378 if (c >= limit) {
6379 raise_encode_exception(&exc, encoding, startp, size,
6380 unicodepos, unicodepos+1, reason);
6381 Py_DECREF(repunicode);
6382 goto onError;
6383 }
6384 *str = (char)c;
6385 }
6386 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006387 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006388 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006389 }
6390 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006391 /* Resize if we allocated to much */
6392 size = str - PyBytes_AS_STRING(res);
6393 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006394 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006395 if (_PyBytes_Resize(&res, size) < 0)
6396 goto onError;
6397 }
6398
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006399 Py_XDECREF(errorHandler);
6400 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006401 return res;
6402
6403 onError:
6404 Py_XDECREF(res);
6405 Py_XDECREF(errorHandler);
6406 Py_XDECREF(exc);
6407 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006408}
6409
Alexander Belopolsky40018472011-02-26 01:02:56 +00006410PyObject *
6411PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006412 Py_ssize_t size,
6413 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006414{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006415 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006416}
6417
Alexander Belopolsky40018472011-02-26 01:02:56 +00006418PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006419_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006420{
6421 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006422 PyErr_BadArgument();
6423 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006424 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006425 if (PyUnicode_READY(unicode) == -1)
6426 return NULL;
6427 /* Fast path: if it is a one-byte string, construct
6428 bytes object directly. */
6429 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6430 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6431 PyUnicode_GET_LENGTH(unicode));
6432 /* Non-Latin-1 characters present. Defer to above function to
6433 raise the exception. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006434 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006435 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006436 errors);
6437}
6438
6439PyObject*
6440PyUnicode_AsLatin1String(PyObject *unicode)
6441{
6442 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006443}
6444
6445/* --- 7-bit ASCII Codec -------------------------------------------------- */
6446
Alexander Belopolsky40018472011-02-26 01:02:56 +00006447PyObject *
6448PyUnicode_DecodeASCII(const char *s,
6449 Py_ssize_t size,
6450 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006451{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006452 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006453 PyUnicodeObject *v;
6454 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006455 Py_ssize_t startinpos;
6456 Py_ssize_t endinpos;
6457 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006458 const char *e;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006459 unsigned char* d;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006460 PyObject *errorHandler = NULL;
6461 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006462 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00006463
Guido van Rossumd57fd912000-03-10 22:53:23 +00006464 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006465 if (size == 1 && *(unsigned char*)s < 128)
6466 return PyUnicode_FromOrdinal(*(unsigned char*)s);
6467
6468 /* Fast path. Assume the input actually *is* ASCII, and allocate
6469 a single-block Unicode object with that assumption. If there is
6470 an error, drop the object and start over. */
6471 v = (PyUnicodeObject*)PyUnicode_New(size, 127);
6472 if (v == NULL)
6473 goto onError;
6474 d = PyUnicode_1BYTE_DATA(v);
6475 for (i = 0; i < size; i++) {
6476 unsigned char ch = ((unsigned char*)s)[i];
6477 if (ch < 128)
6478 d[i] = ch;
6479 else
6480 break;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006481 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006482 if (i == size)
6483 return (PyObject*)v;
6484 Py_DECREF(v); /* start over */
Tim Petersced69f82003-09-16 20:30:58 +00006485
Guido van Rossumd57fd912000-03-10 22:53:23 +00006486 v = _PyUnicode_New(size);
6487 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006488 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006489 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006490 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006491 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006492 e = s + size;
6493 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006494 register unsigned char c = (unsigned char)*s;
6495 if (c < 128) {
6496 *p++ = c;
6497 ++s;
6498 }
6499 else {
6500 startinpos = s-starts;
6501 endinpos = startinpos + 1;
6502 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
6503 if (unicode_decode_call_errorhandler(
6504 errors, &errorHandler,
6505 "ascii", "ordinal not in range(128)",
6506 &starts, &e, &startinpos, &endinpos, &exc, &s,
6507 &v, &outpos, &p))
6508 goto onError;
6509 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006510 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00006511 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Victor Stinnerfe226c02011-10-03 03:52:20 +02006512 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006513 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006514 Py_XDECREF(errorHandler);
6515 Py_XDECREF(exc);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006516 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006517 Py_DECREF(v);
6518 return NULL;
6519 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006520 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006521
Benjamin Peterson29060642009-01-31 22:14:21 +00006522 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006523 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006524 Py_XDECREF(errorHandler);
6525 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006526 return NULL;
6527}
6528
Alexander Belopolsky40018472011-02-26 01:02:56 +00006529PyObject *
6530PyUnicode_EncodeASCII(const Py_UNICODE *p,
6531 Py_ssize_t size,
6532 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006533{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006534 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006535}
6536
Alexander Belopolsky40018472011-02-26 01:02:56 +00006537PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006538_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006539{
6540 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006541 PyErr_BadArgument();
6542 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006543 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006544 if (PyUnicode_READY(unicode) == -1)
6545 return NULL;
6546 /* Fast path: if it is an ASCII-only string, construct bytes object
6547 directly. Else defer to above function to raise the exception. */
6548 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6549 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6550 PyUnicode_GET_LENGTH(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006551 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006552 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006553 errors);
6554}
6555
6556PyObject *
6557PyUnicode_AsASCIIString(PyObject *unicode)
6558{
6559 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006560}
6561
Victor Stinner99b95382011-07-04 14:23:54 +02006562#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006563
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006564/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006565
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006566#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006567#define NEED_RETRY
6568#endif
6569
6570/* XXX This code is limited to "true" double-byte encodings, as
6571 a) it assumes an incomplete character consists of a single byte, and
6572 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00006573 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006574
/* Return non-zero if the byte at s[offset] is a DBCS lead byte.

   A byte that IsDBCSLeadByte() accepts only counts when the character
   before it, as located by CharPrev(), does not absorb it: either there
   is no previous character, the previous character does not start with a
   lead byte, or the previous character is a complete two-byte one. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    return (prev == curr) || !IsDBCSLeadByte(*prev) || (curr - prev == 2);
}
6586
6587/*
6588 * Decode MBCS string into unicode object. If 'final' is set, converts
6589 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
6590 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006591static int
6592decode_mbcs(PyUnicodeObject **v,
6593 const char *s, /* MBCS string */
6594 int size, /* sizeof MBCS string */
6595 int final,
6596 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006597{
6598 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00006599 Py_ssize_t n;
6600 DWORD usize;
6601 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006602
6603 assert(size >= 0);
6604
Victor Stinner554f3f02010-06-16 23:33:54 +00006605 /* check and handle 'errors' arg */
6606 if (errors==NULL || strcmp(errors, "strict")==0)
6607 flags = MB_ERR_INVALID_CHARS;
6608 else if (strcmp(errors, "ignore")==0)
6609 flags = 0;
6610 else {
6611 PyErr_Format(PyExc_ValueError,
6612 "mbcs encoding does not support errors='%s'",
6613 errors);
6614 return -1;
6615 }
6616
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006617 /* Skip trailing lead-byte unless 'final' is set */
6618 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006619 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006620
6621 /* First get the size of the result */
6622 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006623 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
6624 if (usize==0)
6625 goto mbcs_decode_error;
6626 } else
6627 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006628
6629 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006630 /* Create unicode object */
6631 *v = _PyUnicode_New(usize);
6632 if (*v == NULL)
6633 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006634 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006635 }
6636 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006637 /* Extend unicode object */
6638 n = PyUnicode_GET_SIZE(*v);
Victor Stinner2fd82272011-10-03 04:06:05 +02006639 if (PyUnicode_Resize((PyObject**)v, n + usize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006640 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006641 }
6642
6643 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00006644 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006645 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006646 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
6647 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00006648 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006649 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006650 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00006651
6652mbcs_decode_error:
6653 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
6654 we raise a UnicodeDecodeError - else it is a 'generic'
6655 windows error
6656 */
6657 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
6658 /* Ideally, we should get reason from FormatMessage - this
6659 is the Windows 2000 English version of the message
6660 */
6661 PyObject *exc = NULL;
6662 const char *reason = "No mapping for the Unicode character exists "
6663 "in the target multi-byte code page.";
6664 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
6665 if (exc != NULL) {
6666 PyCodec_StrictErrors(exc);
6667 Py_DECREF(exc);
6668 }
6669 } else {
6670 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6671 }
6672 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006673}
6674
Alexander Belopolsky40018472011-02-26 01:02:56 +00006675PyObject *
6676PyUnicode_DecodeMBCSStateful(const char *s,
6677 Py_ssize_t size,
6678 const char *errors,
6679 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006680{
6681 PyUnicodeObject *v = NULL;
6682 int done;
6683
6684 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006685 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006686
6687#ifdef NEED_RETRY
6688 retry:
6689 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006690 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006691 else
6692#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006693 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006694
6695 if (done < 0) {
6696 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006697 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006698 }
6699
6700 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006701 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006702
6703#ifdef NEED_RETRY
6704 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006705 s += done;
6706 size -= done;
6707 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006708 }
6709#endif
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006710 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006711 Py_DECREF(v);
6712 return NULL;
6713 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006714 return (PyObject *)v;
6715}
6716
Alexander Belopolsky40018472011-02-26 01:02:56 +00006717PyObject *
6718PyUnicode_DecodeMBCS(const char *s,
6719 Py_ssize_t size,
6720 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006721{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006722 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
6723}
6724
6725/*
6726 * Convert unicode into string object (MBCS).
6727 * Returns 0 if succeed, -1 otherwise.
6728 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006729static int
6730encode_mbcs(PyObject **repr,
6731 const Py_UNICODE *p, /* unicode */
6732 int size, /* size of unicode */
6733 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006734{
Victor Stinner554f3f02010-06-16 23:33:54 +00006735 BOOL usedDefaultChar = FALSE;
6736 BOOL *pusedDefaultChar;
6737 int mbcssize;
6738 Py_ssize_t n;
6739 PyObject *exc = NULL;
6740 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006741
6742 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006743
Victor Stinner554f3f02010-06-16 23:33:54 +00006744 /* check and handle 'errors' arg */
6745 if (errors==NULL || strcmp(errors, "strict")==0) {
6746 flags = WC_NO_BEST_FIT_CHARS;
6747 pusedDefaultChar = &usedDefaultChar;
6748 } else if (strcmp(errors, "replace")==0) {
6749 flags = 0;
6750 pusedDefaultChar = NULL;
6751 } else {
6752 PyErr_Format(PyExc_ValueError,
6753 "mbcs encoding does not support errors='%s'",
6754 errors);
6755 return -1;
6756 }
6757
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006758 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006759 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006760 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
6761 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00006762 if (mbcssize == 0) {
6763 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6764 return -1;
6765 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006766 /* If we used a default char, then we failed! */
6767 if (pusedDefaultChar && *pusedDefaultChar)
6768 goto mbcs_encode_error;
6769 } else {
6770 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006771 }
6772
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006773 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006774 /* Create string object */
6775 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
6776 if (*repr == NULL)
6777 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006778 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006779 }
6780 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006781 /* Extend string object */
6782 n = PyBytes_Size(*repr);
6783 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
6784 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006785 }
6786
6787 /* Do the conversion */
6788 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006789 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006790 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
6791 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006792 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6793 return -1;
6794 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006795 if (pusedDefaultChar && *pusedDefaultChar)
6796 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006797 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006798 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00006799
6800mbcs_encode_error:
6801 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
6802 Py_XDECREF(exc);
6803 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006804}
6805
Alexander Belopolsky40018472011-02-26 01:02:56 +00006806PyObject *
6807PyUnicode_EncodeMBCS(const Py_UNICODE *p,
6808 Py_ssize_t size,
6809 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006810{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006811 PyObject *repr = NULL;
6812 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00006813
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006814#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00006815 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006816 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006817 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006818 else
6819#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006820 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006821
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006822 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006823 Py_XDECREF(repr);
6824 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006825 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006826
6827#ifdef NEED_RETRY
6828 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006829 p += INT_MAX;
6830 size -= INT_MAX;
6831 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006832 }
6833#endif
6834
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006835 return repr;
6836}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006837
Alexander Belopolsky40018472011-02-26 01:02:56 +00006838PyObject *
6839PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006840{
6841 if (!PyUnicode_Check(unicode)) {
6842 PyErr_BadArgument();
6843 return NULL;
6844 }
6845 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006846 PyUnicode_GET_SIZE(unicode),
6847 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006848}
6849
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006850#undef NEED_RETRY
6851
Victor Stinner99b95382011-07-04 14:23:54 +02006852#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006853
Guido van Rossumd57fd912000-03-10 22:53:23 +00006854/* --- Character Mapping Codec -------------------------------------------- */
6855
Alexander Belopolsky40018472011-02-26 01:02:56 +00006856PyObject *
6857PyUnicode_DecodeCharmap(const char *s,
6858 Py_ssize_t size,
6859 PyObject *mapping,
6860 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006861{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006862 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006863 Py_ssize_t startinpos;
6864 Py_ssize_t endinpos;
6865 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006866 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006867 PyUnicodeObject *v;
6868 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006869 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006870 PyObject *errorHandler = NULL;
6871 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006872 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006873 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006874
Guido van Rossumd57fd912000-03-10 22:53:23 +00006875 /* Default to Latin-1 */
6876 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006877 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006878
6879 v = _PyUnicode_New(size);
6880 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006881 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006882 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006883 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006884 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006885 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006886 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006887 mapstring = PyUnicode_AS_UNICODE(mapping);
6888 maplen = PyUnicode_GET_SIZE(mapping);
6889 while (s < e) {
6890 unsigned char ch = *s;
6891 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006892
Benjamin Peterson29060642009-01-31 22:14:21 +00006893 if (ch < maplen)
6894 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006895
Benjamin Peterson29060642009-01-31 22:14:21 +00006896 if (x == 0xfffe) {
6897 /* undefined mapping */
6898 outpos = p-PyUnicode_AS_UNICODE(v);
6899 startinpos = s-starts;
6900 endinpos = startinpos+1;
6901 if (unicode_decode_call_errorhandler(
6902 errors, &errorHandler,
6903 "charmap", "character maps to <undefined>",
6904 &starts, &e, &startinpos, &endinpos, &exc, &s,
6905 &v, &outpos, &p)) {
6906 goto onError;
6907 }
6908 continue;
6909 }
6910 *p++ = x;
6911 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006912 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006913 }
6914 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006915 while (s < e) {
6916 unsigned char ch = *s;
6917 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006918
Benjamin Peterson29060642009-01-31 22:14:21 +00006919 /* Get mapping (char ordinal -> integer, Unicode char or None) */
6920 w = PyLong_FromLong((long)ch);
6921 if (w == NULL)
6922 goto onError;
6923 x = PyObject_GetItem(mapping, w);
6924 Py_DECREF(w);
6925 if (x == NULL) {
6926 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
6927 /* No mapping found means: mapping is undefined. */
6928 PyErr_Clear();
6929 x = Py_None;
6930 Py_INCREF(x);
6931 } else
6932 goto onError;
6933 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006934
Benjamin Peterson29060642009-01-31 22:14:21 +00006935 /* Apply mapping */
6936 if (PyLong_Check(x)) {
6937 long value = PyLong_AS_LONG(x);
6938 if (value < 0 || value > 65535) {
6939 PyErr_SetString(PyExc_TypeError,
6940 "character mapping must be in range(65536)");
6941 Py_DECREF(x);
6942 goto onError;
6943 }
6944 *p++ = (Py_UNICODE)value;
6945 }
6946 else if (x == Py_None) {
6947 /* undefined mapping */
6948 outpos = p-PyUnicode_AS_UNICODE(v);
6949 startinpos = s-starts;
6950 endinpos = startinpos+1;
6951 if (unicode_decode_call_errorhandler(
6952 errors, &errorHandler,
6953 "charmap", "character maps to <undefined>",
6954 &starts, &e, &startinpos, &endinpos, &exc, &s,
6955 &v, &outpos, &p)) {
6956 Py_DECREF(x);
6957 goto onError;
6958 }
6959 Py_DECREF(x);
6960 continue;
6961 }
6962 else if (PyUnicode_Check(x)) {
6963 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006964
Benjamin Peterson29060642009-01-31 22:14:21 +00006965 if (targetsize == 1)
6966 /* 1-1 mapping */
6967 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006968
Benjamin Peterson29060642009-01-31 22:14:21 +00006969 else if (targetsize > 1) {
6970 /* 1-n mapping */
6971 if (targetsize > extrachars) {
6972 /* resize first */
6973 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
6974 Py_ssize_t needed = (targetsize - extrachars) + \
6975 (targetsize << 2);
6976 extrachars += needed;
6977 /* XXX overflow detection missing */
Victor Stinnerfe226c02011-10-03 03:52:20 +02006978 if (PyUnicode_Resize((PyObject**)&v,
Benjamin Peterson29060642009-01-31 22:14:21 +00006979 PyUnicode_GET_SIZE(v) + needed) < 0) {
6980 Py_DECREF(x);
6981 goto onError;
6982 }
6983 p = PyUnicode_AS_UNICODE(v) + oldpos;
6984 }
6985 Py_UNICODE_COPY(p,
6986 PyUnicode_AS_UNICODE(x),
6987 targetsize);
6988 p += targetsize;
6989 extrachars -= targetsize;
6990 }
6991 /* 1-0 mapping: skip the character */
6992 }
6993 else {
6994 /* wrong return value */
6995 PyErr_SetString(PyExc_TypeError,
6996 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006997 Py_DECREF(x);
6998 goto onError;
6999 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007000 Py_DECREF(x);
7001 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007002 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007003 }
7004 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Victor Stinnerfe226c02011-10-03 03:52:20 +02007005 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007006 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007007 Py_XDECREF(errorHandler);
7008 Py_XDECREF(exc);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02007009 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007010 Py_DECREF(v);
7011 return NULL;
7012 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007013 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00007014
Benjamin Peterson29060642009-01-31 22:14:21 +00007015 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007016 Py_XDECREF(errorHandler);
7017 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007018 Py_XDECREF(v);
7019 return NULL;
7020}
7021
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007022/* Charmap encoding: the lookup table */
7023
Alexander Belopolsky40018472011-02-26 01:02:56 +00007024struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007025 PyObject_HEAD
7026 unsigned char level1[32];
7027 int count2, count3;
7028 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007029};
7030
7031static PyObject*
7032encoding_map_size(PyObject *obj, PyObject* args)
7033{
7034 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007035 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007036 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007037}
7038
7039static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007040 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007041 PyDoc_STR("Return the size (in bytes) of this object") },
7042 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007043};
7044
7045static void
7046encoding_map_dealloc(PyObject* o)
7047{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007048 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007049}
7050
7051static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007052 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007053 "EncodingMap", /*tp_name*/
7054 sizeof(struct encoding_map), /*tp_basicsize*/
7055 0, /*tp_itemsize*/
7056 /* methods */
7057 encoding_map_dealloc, /*tp_dealloc*/
7058 0, /*tp_print*/
7059 0, /*tp_getattr*/
7060 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007061 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007062 0, /*tp_repr*/
7063 0, /*tp_as_number*/
7064 0, /*tp_as_sequence*/
7065 0, /*tp_as_mapping*/
7066 0, /*tp_hash*/
7067 0, /*tp_call*/
7068 0, /*tp_str*/
7069 0, /*tp_getattro*/
7070 0, /*tp_setattro*/
7071 0, /*tp_as_buffer*/
7072 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7073 0, /*tp_doc*/
7074 0, /*tp_traverse*/
7075 0, /*tp_clear*/
7076 0, /*tp_richcompare*/
7077 0, /*tp_weaklistoffset*/
7078 0, /*tp_iter*/
7079 0, /*tp_iternext*/
7080 encoding_map_methods, /*tp_methods*/
7081 0, /*tp_members*/
7082 0, /*tp_getset*/
7083 0, /*tp_base*/
7084 0, /*tp_dict*/
7085 0, /*tp_descr_get*/
7086 0, /*tp_descr_set*/
7087 0, /*tp_dictoffset*/
7088 0, /*tp_init*/
7089 0, /*tp_alloc*/
7090 0, /*tp_new*/
7091 0, /*tp_free*/
7092 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007093};
7094
7095PyObject*
7096PyUnicode_BuildEncodingMap(PyObject* string)
7097{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007098 PyObject *result;
7099 struct encoding_map *mresult;
7100 int i;
7101 int need_dict = 0;
7102 unsigned char level1[32];
7103 unsigned char level2[512];
7104 unsigned char *mlevel1, *mlevel2, *mlevel3;
7105 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007106 int kind;
7107 void *data;
7108 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007109
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007110 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007111 PyErr_BadArgument();
7112 return NULL;
7113 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007114 kind = PyUnicode_KIND(string);
7115 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007116 memset(level1, 0xFF, sizeof level1);
7117 memset(level2, 0xFF, sizeof level2);
7118
7119 /* If there isn't a one-to-one mapping of NULL to \0,
7120 or if there are non-BMP characters, we need to use
7121 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007122 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007123 need_dict = 1;
7124 for (i = 1; i < 256; i++) {
7125 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007126 ch = PyUnicode_READ(kind, data, i);
7127 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007128 need_dict = 1;
7129 break;
7130 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007131 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007132 /* unmapped character */
7133 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007134 l1 = ch >> 11;
7135 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007136 if (level1[l1] == 0xFF)
7137 level1[l1] = count2++;
7138 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007139 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007140 }
7141
7142 if (count2 >= 0xFF || count3 >= 0xFF)
7143 need_dict = 1;
7144
7145 if (need_dict) {
7146 PyObject *result = PyDict_New();
7147 PyObject *key, *value;
7148 if (!result)
7149 return NULL;
7150 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007151 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007152 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007153 if (!key || !value)
7154 goto failed1;
7155 if (PyDict_SetItem(result, key, value) == -1)
7156 goto failed1;
7157 Py_DECREF(key);
7158 Py_DECREF(value);
7159 }
7160 return result;
7161 failed1:
7162 Py_XDECREF(key);
7163 Py_XDECREF(value);
7164 Py_DECREF(result);
7165 return NULL;
7166 }
7167
7168 /* Create a three-level trie */
7169 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7170 16*count2 + 128*count3 - 1);
7171 if (!result)
7172 return PyErr_NoMemory();
7173 PyObject_Init(result, &EncodingMapType);
7174 mresult = (struct encoding_map*)result;
7175 mresult->count2 = count2;
7176 mresult->count3 = count3;
7177 mlevel1 = mresult->level1;
7178 mlevel2 = mresult->level23;
7179 mlevel3 = mresult->level23 + 16*count2;
7180 memcpy(mlevel1, level1, 32);
7181 memset(mlevel2, 0xFF, 16*count2);
7182 memset(mlevel3, 0, 128*count3);
7183 count3 = 0;
7184 for (i = 1; i < 256; i++) {
7185 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007186 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007187 /* unmapped character */
7188 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007189 o1 = PyUnicode_READ(kind, data, i)>>11;
7190 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007191 i2 = 16*mlevel1[o1] + o2;
7192 if (mlevel2[i2] == 0xFF)
7193 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007194 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007195 i3 = 128*mlevel2[i2] + o3;
7196 mlevel3[i3] = i;
7197 }
7198 return result;
7199}
7200
7201static int
7202encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
7203{
7204 struct encoding_map *map = (struct encoding_map*)mapping;
7205 int l1 = c>>11;
7206 int l2 = (c>>7) & 0xF;
7207 int l3 = c & 0x7F;
7208 int i;
7209
7210#ifdef Py_UNICODE_WIDE
7211 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007212 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007213 }
7214#endif
7215 if (c == 0)
7216 return 0;
7217 /* level 1*/
7218 i = map->level1[l1];
7219 if (i == 0xFF) {
7220 return -1;
7221 }
7222 /* level 2*/
7223 i = map->level23[16*i+l2];
7224 if (i == 0xFF) {
7225 return -1;
7226 }
7227 /* level 3 */
7228 i = map->level23[16*map->count2 + 128*i + l3];
7229 if (i == 0) {
7230 return -1;
7231 }
7232 return i;
7233}
7234
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007235/* Lookup the character ch in the mapping. If the character
7236 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007237 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007238static PyObject *
7239charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007240{
Christian Heimes217cfd12007-12-02 14:31:20 +00007241 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007242 PyObject *x;
7243
7244 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007245 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007246 x = PyObject_GetItem(mapping, w);
7247 Py_DECREF(w);
7248 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007249 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7250 /* No mapping found means: mapping is undefined. */
7251 PyErr_Clear();
7252 x = Py_None;
7253 Py_INCREF(x);
7254 return x;
7255 } else
7256 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007257 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007258 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007259 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007260 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007261 long value = PyLong_AS_LONG(x);
7262 if (value < 0 || value > 255) {
7263 PyErr_SetString(PyExc_TypeError,
7264 "character mapping must be in range(256)");
7265 Py_DECREF(x);
7266 return NULL;
7267 }
7268 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007269 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007270 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007271 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007272 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007273 /* wrong return value */
7274 PyErr_Format(PyExc_TypeError,
7275 "character mapping must return integer, bytes or None, not %.400s",
7276 x->ob_type->tp_name);
7277 Py_DECREF(x);
7278 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007279 }
7280}
7281
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007282static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00007283charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007284{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007285 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7286 /* exponentially overallocate to minimize reallocations */
7287 if (requiredsize < 2*outsize)
7288 requiredsize = 2*outsize;
7289 if (_PyBytes_Resize(outobj, requiredsize))
7290 return -1;
7291 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007292}
7293
/* Outcome of charmapencode_output() */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping is undefined; nothing was written */
    enc_EXCEPTION   /* lookup or resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007297/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00007298 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007299 space is available. Return a new reference to the object that
7300 was put in the output buffer, or Py_None, if the mapping was undefined
7301 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00007302 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007303static charmapencode_result
7304charmapencode_output(Py_UNICODE c, PyObject *mapping,
7305 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007306{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007307 PyObject *rep;
7308 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00007309 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007310
Christian Heimes90aa7642007-12-19 02:45:37 +00007311 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007312 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007313 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007314 if (res == -1)
7315 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00007316 if (outsize<requiredsize)
7317 if (charmapencode_resize(outobj, outpos, requiredsize))
7318 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00007319 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007320 outstart[(*outpos)++] = (char)res;
7321 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007322 }
7323
7324 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007325 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007326 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007327 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007328 Py_DECREF(rep);
7329 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007330 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007331 if (PyLong_Check(rep)) {
7332 Py_ssize_t requiredsize = *outpos+1;
7333 if (outsize<requiredsize)
7334 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7335 Py_DECREF(rep);
7336 return enc_EXCEPTION;
7337 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007338 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007339 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007340 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007341 else {
7342 const char *repchars = PyBytes_AS_STRING(rep);
7343 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
7344 Py_ssize_t requiredsize = *outpos+repsize;
7345 if (outsize<requiredsize)
7346 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7347 Py_DECREF(rep);
7348 return enc_EXCEPTION;
7349 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007350 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007351 memcpy(outstart + *outpos, repchars, repsize);
7352 *outpos += repsize;
7353 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007354 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007355 Py_DECREF(rep);
7356 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007357}
7358
7359/* handle an error in PyUnicode_EncodeCharmap
7360 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007361static int
7362charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00007363 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007364 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00007365 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00007366 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007367{
7368 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007369 Py_ssize_t repsize;
7370 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007371 Py_UNICODE *uni2;
7372 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007373 Py_ssize_t collstartpos = *inpos;
7374 Py_ssize_t collendpos = *inpos+1;
7375 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007376 char *encoding = "charmap";
7377 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007378 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007379
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007380 /* find all unencodable characters */
7381 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007382 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00007383 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007384 int res = encoding_map_lookup(p[collendpos], mapping);
7385 if (res != -1)
7386 break;
7387 ++collendpos;
7388 continue;
7389 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007390
Benjamin Peterson29060642009-01-31 22:14:21 +00007391 rep = charmapencode_lookup(p[collendpos], mapping);
7392 if (rep==NULL)
7393 return -1;
7394 else if (rep!=Py_None) {
7395 Py_DECREF(rep);
7396 break;
7397 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007398 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00007399 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007400 }
7401 /* cache callback name lookup
7402 * (if not done yet, i.e. it's the first error) */
7403 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007404 if ((errors==NULL) || (!strcmp(errors, "strict")))
7405 *known_errorHandler = 1;
7406 else if (!strcmp(errors, "replace"))
7407 *known_errorHandler = 2;
7408 else if (!strcmp(errors, "ignore"))
7409 *known_errorHandler = 3;
7410 else if (!strcmp(errors, "xmlcharrefreplace"))
7411 *known_errorHandler = 4;
7412 else
7413 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007414 }
7415 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007416 case 1: /* strict */
7417 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7418 return -1;
7419 case 2: /* replace */
7420 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007421 x = charmapencode_output('?', mapping, res, respos);
7422 if (x==enc_EXCEPTION) {
7423 return -1;
7424 }
7425 else if (x==enc_FAILED) {
7426 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7427 return -1;
7428 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007429 }
7430 /* fall through */
7431 case 3: /* ignore */
7432 *inpos = collendpos;
7433 break;
7434 case 4: /* xmlcharrefreplace */
7435 /* generate replacement (temporarily (mis)uses p) */
7436 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007437 char buffer[2+29+1+1];
7438 char *cp;
7439 sprintf(buffer, "&#%d;", (int)p[collpos]);
7440 for (cp = buffer; *cp; ++cp) {
7441 x = charmapencode_output(*cp, mapping, res, respos);
7442 if (x==enc_EXCEPTION)
7443 return -1;
7444 else if (x==enc_FAILED) {
7445 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7446 return -1;
7447 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007448 }
7449 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007450 *inpos = collendpos;
7451 break;
7452 default:
7453 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00007454 encoding, reason, p, size, exceptionObject,
7455 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007456 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007457 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00007458 if (PyBytes_Check(repunicode)) {
7459 /* Directly copy bytes result to output. */
7460 Py_ssize_t outsize = PyBytes_Size(*res);
7461 Py_ssize_t requiredsize;
7462 repsize = PyBytes_Size(repunicode);
7463 requiredsize = *respos + repsize;
7464 if (requiredsize > outsize)
7465 /* Make room for all additional bytes. */
7466 if (charmapencode_resize(res, respos, requiredsize)) {
7467 Py_DECREF(repunicode);
7468 return -1;
7469 }
7470 memcpy(PyBytes_AsString(*res) + *respos,
7471 PyBytes_AsString(repunicode), repsize);
7472 *respos += repsize;
7473 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007474 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00007475 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007476 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007477 /* generate replacement */
7478 repsize = PyUnicode_GET_SIZE(repunicode);
7479 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007480 x = charmapencode_output(*uni2, mapping, res, respos);
7481 if (x==enc_EXCEPTION) {
7482 return -1;
7483 }
7484 else if (x==enc_FAILED) {
7485 Py_DECREF(repunicode);
7486 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7487 return -1;
7488 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007489 }
7490 *inpos = newpos;
7491 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007492 }
7493 return 0;
7494}
7495
Alexander Belopolsky40018472011-02-26 01:02:56 +00007496PyObject *
7497PyUnicode_EncodeCharmap(const Py_UNICODE *p,
7498 Py_ssize_t size,
7499 PyObject *mapping,
7500 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007501{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007502 /* output object */
7503 PyObject *res = NULL;
7504 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007505 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007506 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007507 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007508 PyObject *errorHandler = NULL;
7509 PyObject *exc = NULL;
7510 /* the following variable is used for caching string comparisons
7511 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7512 * 3=ignore, 4=xmlcharrefreplace */
7513 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007514
7515 /* Default to Latin-1 */
7516 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007517 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007518
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007519 /* allocate enough for a simple encoding without
7520 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00007521 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007522 if (res == NULL)
7523 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00007524 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007525 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007526
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007527 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007528 /* try to encode it */
7529 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
7530 if (x==enc_EXCEPTION) /* error */
7531 goto onError;
7532 if (x==enc_FAILED) { /* unencodable character */
7533 if (charmap_encoding_error(p, size, &inpos, mapping,
7534 &exc,
7535 &known_errorHandler, &errorHandler, errors,
7536 &res, &respos)) {
7537 goto onError;
7538 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007539 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007540 else
7541 /* done with this character => adjust input position */
7542 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007543 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007544
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007545 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00007546 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007547 if (_PyBytes_Resize(&res, respos) < 0)
7548 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00007549
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007550 Py_XDECREF(exc);
7551 Py_XDECREF(errorHandler);
7552 return res;
7553
Benjamin Peterson29060642009-01-31 22:14:21 +00007554 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007555 Py_XDECREF(res);
7556 Py_XDECREF(exc);
7557 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007558 return NULL;
7559}
7560
Alexander Belopolsky40018472011-02-26 01:02:56 +00007561PyObject *
7562PyUnicode_AsCharmapString(PyObject *unicode,
7563 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007564{
7565 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007566 PyErr_BadArgument();
7567 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007568 }
7569 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00007570 PyUnicode_GET_SIZE(unicode),
7571 mapping,
7572 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007573}
7574
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007575/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007576static void
7577make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007578 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007579 Py_ssize_t startpos, Py_ssize_t endpos,
7580 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007581{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007582 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007583 *exceptionObject = _PyUnicodeTranslateError_Create(
7584 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007585 }
7586 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007587 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
7588 goto onError;
7589 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
7590 goto onError;
7591 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
7592 goto onError;
7593 return;
7594 onError:
7595 Py_DECREF(*exceptionObject);
7596 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007597 }
7598}
7599
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007600/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007601static void
7602raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007603 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007604 Py_ssize_t startpos, Py_ssize_t endpos,
7605 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007606{
7607 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007608 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007609 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007610 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007611}
7612
7613/* error handling callback helper:
7614 build arguments, call the callback and check the arguments,
7615 put the result into newpos and return the replacement string, which
7616 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007617static PyObject *
7618unicode_translate_call_errorhandler(const char *errors,
7619 PyObject **errorHandler,
7620 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007621 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007622 Py_ssize_t startpos, Py_ssize_t endpos,
7623 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007624{
Benjamin Peterson142957c2008-07-04 19:55:29 +00007625 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007626
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007627 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007628 PyObject *restuple;
7629 PyObject *resunicode;
7630
7631 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007632 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007633 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007634 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007635 }
7636
7637 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007638 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007639 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007640 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007641
7642 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00007643 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007644 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007645 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007646 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00007647 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00007648 Py_DECREF(restuple);
7649 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007650 }
7651 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00007652 &resunicode, &i_newpos)) {
7653 Py_DECREF(restuple);
7654 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007655 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007656 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007657 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007658 else
7659 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007660 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007661 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7662 Py_DECREF(restuple);
7663 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00007664 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007665 Py_INCREF(resunicode);
7666 Py_DECREF(restuple);
7667 return resunicode;
7668}
7669
7670/* Lookup the character ch in the mapping and put the result in result,
7671 which must be decrefed by the caller.
7672 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007673static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007674charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007675{
Christian Heimes217cfd12007-12-02 14:31:20 +00007676 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007677 PyObject *x;
7678
7679 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007680 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007681 x = PyObject_GetItem(mapping, w);
7682 Py_DECREF(w);
7683 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007684 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7685 /* No mapping found means: use 1:1 mapping. */
7686 PyErr_Clear();
7687 *result = NULL;
7688 return 0;
7689 } else
7690 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007691 }
7692 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007693 *result = x;
7694 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007695 }
Christian Heimes217cfd12007-12-02 14:31:20 +00007696 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007697 long value = PyLong_AS_LONG(x);
7698 long max = PyUnicode_GetMax();
7699 if (value < 0 || value > max) {
7700 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00007701 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00007702 Py_DECREF(x);
7703 return -1;
7704 }
7705 *result = x;
7706 return 0;
7707 }
7708 else if (PyUnicode_Check(x)) {
7709 *result = x;
7710 return 0;
7711 }
7712 else {
7713 /* wrong return value */
7714 PyErr_SetString(PyExc_TypeError,
7715 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007716 Py_DECREF(x);
7717 return -1;
7718 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007719}
7720/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00007721 if not reallocate and adjust various state variables.
7722 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007723static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007724charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00007725 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007726{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007727 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00007728 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007729 /* exponentially overallocate to minimize reallocations */
7730 if (requiredsize < 2 * oldsize)
7731 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007732 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
7733 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007734 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007735 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007736 }
7737 return 0;
7738}
7739/* lookup the character, put the result in the output string and adjust
7740 various state variables. Return a new reference to the object that
7741 was put in the output buffer in *result, or Py_None, if the mapping was
7742 undefined (in which case no character was written).
7743 The called must decref result.
7744 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007745static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007746charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
7747 PyObject *mapping, Py_UCS4 **output,
7748 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007749 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007750{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007751 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
7752 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00007753 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007754 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007755 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007756 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007757 }
7758 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007759 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00007760 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007761 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007762 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007763 }
7764 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007765 Py_ssize_t repsize;
7766 if (PyUnicode_READY(*res) == -1)
7767 return -1;
7768 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00007769 if (repsize==1) {
7770 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007771 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00007772 }
7773 else if (repsize!=0) {
7774 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007775 Py_ssize_t requiredsize = *opos +
7776 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00007777 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007778 Py_ssize_t i;
7779 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007780 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007781 for(i = 0; i < repsize; i++)
7782 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00007783 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007784 }
7785 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007786 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007787 return 0;
7788}
7789
Alexander Belopolsky40018472011-02-26 01:02:56 +00007790PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007791_PyUnicode_TranslateCharmap(PyObject *input,
7792 PyObject *mapping,
7793 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007794{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007795 /* input object */
7796 char *idata;
7797 Py_ssize_t size, i;
7798 int kind;
7799 /* output buffer */
7800 Py_UCS4 *output = NULL;
7801 Py_ssize_t osize;
7802 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007803 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007804 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007805 char *reason = "character maps to <undefined>";
7806 PyObject *errorHandler = NULL;
7807 PyObject *exc = NULL;
7808 /* the following variable is used for caching string comparisons
7809 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7810 * 3=ignore, 4=xmlcharrefreplace */
7811 int known_errorHandler = -1;
7812
Guido van Rossumd57fd912000-03-10 22:53:23 +00007813 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007814 PyErr_BadArgument();
7815 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007816 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007817
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007818 if (PyUnicode_READY(input) == -1)
7819 return NULL;
7820 idata = (char*)PyUnicode_DATA(input);
7821 kind = PyUnicode_KIND(input);
7822 size = PyUnicode_GET_LENGTH(input);
7823 i = 0;
7824
7825 if (size == 0) {
7826 Py_INCREF(input);
7827 return input;
7828 }
7829
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007830 /* allocate enough for a simple 1:1 translation without
7831 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007832 osize = size;
7833 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
7834 opos = 0;
7835 if (output == NULL) {
7836 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00007837 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007838 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007839
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007840 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007841 /* try to encode it */
7842 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007843 if (charmaptranslate_output(input, i, mapping,
7844 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007845 Py_XDECREF(x);
7846 goto onError;
7847 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007848 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00007849 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007850 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00007851 else { /* untranslatable character */
7852 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
7853 Py_ssize_t repsize;
7854 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007855 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00007856 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007857 Py_ssize_t collstart = i;
7858 Py_ssize_t collend = i+1;
7859 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007860
Benjamin Peterson29060642009-01-31 22:14:21 +00007861 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007862 while (collend < size) {
7863 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007864 goto onError;
7865 Py_XDECREF(x);
7866 if (x!=Py_None)
7867 break;
7868 ++collend;
7869 }
7870 /* cache callback name lookup
7871 * (if not done yet, i.e. it's the first error) */
7872 if (known_errorHandler==-1) {
7873 if ((errors==NULL) || (!strcmp(errors, "strict")))
7874 known_errorHandler = 1;
7875 else if (!strcmp(errors, "replace"))
7876 known_errorHandler = 2;
7877 else if (!strcmp(errors, "ignore"))
7878 known_errorHandler = 3;
7879 else if (!strcmp(errors, "xmlcharrefreplace"))
7880 known_errorHandler = 4;
7881 else
7882 known_errorHandler = 0;
7883 }
7884 switch (known_errorHandler) {
7885 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007886 raise_translate_exception(&exc, input, collstart,
7887 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007888 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007889 case 2: /* replace */
7890 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007891 for (coll = collstart; coll<collend; coll++)
7892 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00007893 /* fall through */
7894 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007895 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007896 break;
7897 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007898 /* generate replacement (temporarily (mis)uses i) */
7899 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007900 char buffer[2+29+1+1];
7901 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007902 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
7903 if (charmaptranslate_makespace(&output, &osize,
7904 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00007905 goto onError;
7906 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007907 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00007908 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007909 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007910 break;
7911 default:
7912 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007913 reason, input, &exc,
7914 collstart, collend, &newpos);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02007915 if (repunicode == NULL || _PyUnicode_READY_REPLACE(&repunicode))
Benjamin Peterson29060642009-01-31 22:14:21 +00007916 goto onError;
7917 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007918 repsize = PyUnicode_GET_LENGTH(repunicode);
7919 if (charmaptranslate_makespace(&output, &osize,
7920 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007921 Py_DECREF(repunicode);
7922 goto onError;
7923 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007924 for (uni2 = 0; repsize-->0; ++uni2)
7925 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
7926 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00007927 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007928 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007929 }
7930 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007931 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
7932 if (!res)
7933 goto onError;
7934 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007935 Py_XDECREF(exc);
7936 Py_XDECREF(errorHandler);
7937 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007938
Benjamin Peterson29060642009-01-31 22:14:21 +00007939 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007940 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007941 Py_XDECREF(exc);
7942 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007943 return NULL;
7944}
7945
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007946/* Deprecated. Use PyUnicode_Translate instead. */
7947PyObject *
7948PyUnicode_TranslateCharmap(const Py_UNICODE *p,
7949 Py_ssize_t size,
7950 PyObject *mapping,
7951 const char *errors)
7952{
7953 PyObject *unicode = PyUnicode_FromUnicode(p, size);
7954 if (!unicode)
7955 return NULL;
7956 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
7957}
7958
Alexander Belopolsky40018472011-02-26 01:02:56 +00007959PyObject *
7960PyUnicode_Translate(PyObject *str,
7961 PyObject *mapping,
7962 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007963{
7964 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007965
Guido van Rossumd57fd912000-03-10 22:53:23 +00007966 str = PyUnicode_FromObject(str);
7967 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007968 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007969 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007970 Py_DECREF(str);
7971 return result;
Tim Petersced69f82003-09-16 20:30:58 +00007972
Benjamin Peterson29060642009-01-31 22:14:21 +00007973 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007974 Py_XDECREF(str);
7975 return NULL;
7976}
Tim Petersced69f82003-09-16 20:30:58 +00007977
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007978static Py_UCS4
7979fix_decimal_and_space_to_ascii(PyUnicodeObject *self)
7980{
7981 /* No need to call PyUnicode_READY(self) because this function is only
7982 called as a callback from fixup() which does it already. */
7983 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
7984 const int kind = PyUnicode_KIND(self);
7985 void *data = PyUnicode_DATA(self);
7986 Py_UCS4 maxchar = 0, ch, fixed;
7987 Py_ssize_t i;
7988
7989 for (i = 0; i < len; ++i) {
7990 ch = PyUnicode_READ(kind, data, i);
7991 fixed = 0;
7992 if (ch > 127) {
7993 if (Py_UNICODE_ISSPACE(ch))
7994 fixed = ' ';
7995 else {
7996 const int decimal = Py_UNICODE_TODECIMAL(ch);
7997 if (decimal >= 0)
7998 fixed = '0' + decimal;
7999 }
8000 if (fixed != 0) {
8001 if (fixed > maxchar)
8002 maxchar = fixed;
8003 PyUnicode_WRITE(kind, data, i, fixed);
8004 }
8005 else if (ch > maxchar)
8006 maxchar = ch;
8007 }
8008 else if (ch > maxchar)
8009 maxchar = ch;
8010 }
8011
8012 return maxchar;
8013}
8014
8015PyObject *
8016_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8017{
8018 if (!PyUnicode_Check(unicode)) {
8019 PyErr_BadInternalCall();
8020 return NULL;
8021 }
8022 if (PyUnicode_READY(unicode) == -1)
8023 return NULL;
8024 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8025 /* If the string is already ASCII, just return the same string */
8026 Py_INCREF(unicode);
8027 return unicode;
8028 }
8029 return fixup((PyUnicodeObject *)unicode, fix_decimal_and_space_to_ascii);
8030}
8031
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008032PyObject *
8033PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8034 Py_ssize_t length)
8035{
8036 PyObject *result;
8037 Py_UNICODE *p; /* write pointer into result */
8038 Py_ssize_t i;
8039 /* Copy to a new string */
8040 result = (PyObject *)_PyUnicode_New(length);
8041 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
8042 if (result == NULL)
8043 return result;
8044 p = PyUnicode_AS_UNICODE(result);
8045 /* Iterate over code points */
8046 for (i = 0; i < length; i++) {
8047 Py_UNICODE ch =s[i];
8048 if (ch > 127) {
8049 int decimal = Py_UNICODE_TODECIMAL(ch);
8050 if (decimal >= 0)
8051 p[i] = '0' + decimal;
8052 }
8053 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008054 if (PyUnicode_READY((PyUnicodeObject*)result) == -1) {
8055 Py_DECREF(result);
8056 return NULL;
8057 }
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008058 return result;
8059}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008060/* --- Decimal Encoder ---------------------------------------------------- */
8061
Alexander Belopolsky40018472011-02-26 01:02:56 +00008062int
8063PyUnicode_EncodeDecimal(Py_UNICODE *s,
8064 Py_ssize_t length,
8065 char *output,
8066 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008067{
8068 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008069 PyObject *errorHandler = NULL;
8070 PyObject *exc = NULL;
8071 const char *encoding = "decimal";
8072 const char *reason = "invalid decimal Unicode string";
8073 /* the following variable is used for caching string comparisons
8074 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
8075 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008076
8077 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008078 PyErr_BadArgument();
8079 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008080 }
8081
8082 p = s;
8083 end = s + length;
8084 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008085 register Py_UNICODE ch = *p;
8086 int decimal;
8087 PyObject *repunicode;
8088 Py_ssize_t repsize;
8089 Py_ssize_t newpos;
8090 Py_UNICODE *uni2;
8091 Py_UNICODE *collstart;
8092 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00008093
Benjamin Peterson29060642009-01-31 22:14:21 +00008094 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008095 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00008096 ++p;
8097 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008098 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008099 decimal = Py_UNICODE_TODECIMAL(ch);
8100 if (decimal >= 0) {
8101 *output++ = '0' + decimal;
8102 ++p;
8103 continue;
8104 }
8105 if (0 < ch && ch < 256) {
8106 *output++ = (char)ch;
8107 ++p;
8108 continue;
8109 }
8110 /* All other characters are considered unencodable */
8111 collstart = p;
8112 collend = p+1;
8113 while (collend < end) {
8114 if ((0 < *collend && *collend < 256) ||
8115 !Py_UNICODE_ISSPACE(*collend) ||
8116 Py_UNICODE_TODECIMAL(*collend))
8117 break;
8118 }
8119 /* cache callback name lookup
8120 * (if not done yet, i.e. it's the first error) */
8121 if (known_errorHandler==-1) {
8122 if ((errors==NULL) || (!strcmp(errors, "strict")))
8123 known_errorHandler = 1;
8124 else if (!strcmp(errors, "replace"))
8125 known_errorHandler = 2;
8126 else if (!strcmp(errors, "ignore"))
8127 known_errorHandler = 3;
8128 else if (!strcmp(errors, "xmlcharrefreplace"))
8129 known_errorHandler = 4;
8130 else
8131 known_errorHandler = 0;
8132 }
8133 switch (known_errorHandler) {
8134 case 1: /* strict */
8135 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
8136 goto onError;
8137 case 2: /* replace */
8138 for (p = collstart; p < collend; ++p)
8139 *output++ = '?';
8140 /* fall through */
8141 case 3: /* ignore */
8142 p = collend;
8143 break;
8144 case 4: /* xmlcharrefreplace */
8145 /* generate replacement (temporarily (mis)uses p) */
8146 for (p = collstart; p < collend; ++p)
8147 output += sprintf(output, "&#%d;", (int)*p);
8148 p = collend;
8149 break;
8150 default:
8151 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
8152 encoding, reason, s, length, &exc,
8153 collstart-s, collend-s, &newpos);
8154 if (repunicode == NULL)
8155 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008156 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00008157 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008158 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
8159 Py_DECREF(repunicode);
8160 goto onError;
8161 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008162 /* generate replacement */
8163 repsize = PyUnicode_GET_SIZE(repunicode);
8164 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
8165 Py_UNICODE ch = *uni2;
8166 if (Py_UNICODE_ISSPACE(ch))
8167 *output++ = ' ';
8168 else {
8169 decimal = Py_UNICODE_TODECIMAL(ch);
8170 if (decimal >= 0)
8171 *output++ = '0' + decimal;
8172 else if (0 < ch && ch < 256)
8173 *output++ = (char)ch;
8174 else {
8175 Py_DECREF(repunicode);
8176 raise_encode_exception(&exc, encoding,
8177 s, length, collstart-s, collend-s, reason);
8178 goto onError;
8179 }
8180 }
8181 }
8182 p = s + newpos;
8183 Py_DECREF(repunicode);
8184 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00008185 }
8186 /* 0-terminate the output string */
8187 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008188 Py_XDECREF(exc);
8189 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008190 return 0;
8191
Benjamin Peterson29060642009-01-31 22:14:21 +00008192 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008193 Py_XDECREF(exc);
8194 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008195 return -1;
8196}
8197
Guido van Rossumd57fd912000-03-10 22:53:23 +00008198/* --- Helpers ------------------------------------------------------------ */
8199
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008200#include "stringlib/ucs1lib.h"
8201#include "stringlib/fastsearch.h"
8202#include "stringlib/partition.h"
8203#include "stringlib/split.h"
8204#include "stringlib/count.h"
8205#include "stringlib/find.h"
8206#include "stringlib/localeutil.h"
8207#include "stringlib/undef.h"
8208
8209#include "stringlib/ucs2lib.h"
8210#include "stringlib/fastsearch.h"
8211#include "stringlib/partition.h"
8212#include "stringlib/split.h"
8213#include "stringlib/count.h"
8214#include "stringlib/find.h"
8215#include "stringlib/localeutil.h"
8216#include "stringlib/undef.h"
8217
8218#include "stringlib/ucs4lib.h"
8219#include "stringlib/fastsearch.h"
8220#include "stringlib/partition.h"
8221#include "stringlib/split.h"
8222#include "stringlib/count.h"
8223#include "stringlib/find.h"
8224#include "stringlib/localeutil.h"
8225#include "stringlib/undef.h"
8226
8227static Py_ssize_t
8228any_find_slice(Py_ssize_t Py_LOCAL_CALLBACK(ucs1)(const Py_UCS1*, Py_ssize_t,
8229 const Py_UCS1*, Py_ssize_t,
8230 Py_ssize_t, Py_ssize_t),
8231 Py_ssize_t Py_LOCAL_CALLBACK(ucs2)(const Py_UCS2*, Py_ssize_t,
8232 const Py_UCS2*, Py_ssize_t,
8233 Py_ssize_t, Py_ssize_t),
8234 Py_ssize_t Py_LOCAL_CALLBACK(ucs4)(const Py_UCS4*, Py_ssize_t,
8235 const Py_UCS4*, Py_ssize_t,
8236 Py_ssize_t, Py_ssize_t),
8237 PyObject* s1, PyObject* s2,
8238 Py_ssize_t start,
8239 Py_ssize_t end)
8240{
8241 int kind1, kind2, kind;
8242 void *buf1, *buf2;
8243 Py_ssize_t len1, len2, result;
8244
8245 kind1 = PyUnicode_KIND(s1);
8246 kind2 = PyUnicode_KIND(s2);
8247 kind = kind1 > kind2 ? kind1 : kind2;
8248 buf1 = PyUnicode_DATA(s1);
8249 buf2 = PyUnicode_DATA(s2);
8250 if (kind1 != kind)
8251 buf1 = _PyUnicode_AsKind(s1, kind);
8252 if (!buf1)
8253 return -2;
8254 if (kind2 != kind)
8255 buf2 = _PyUnicode_AsKind(s2, kind);
8256 if (!buf2) {
8257 if (kind1 != kind) PyMem_Free(buf1);
8258 return -2;
8259 }
8260 len1 = PyUnicode_GET_LENGTH(s1);
8261 len2 = PyUnicode_GET_LENGTH(s2);
8262
8263 switch(kind) {
8264 case PyUnicode_1BYTE_KIND:
8265 result = ucs1(buf1, len1, buf2, len2, start, end);
8266 break;
8267 case PyUnicode_2BYTE_KIND:
8268 result = ucs2(buf1, len1, buf2, len2, start, end);
8269 break;
8270 case PyUnicode_4BYTE_KIND:
8271 result = ucs4(buf1, len1, buf2, len2, start, end);
8272 break;
8273 default:
8274 assert(0); result = -2;
8275 }
8276
8277 if (kind1 != kind)
8278 PyMem_Free(buf1);
8279 if (kind2 != kind)
8280 PyMem_Free(buf2);
8281
8282 return result;
8283}
8284
8285Py_ssize_t
8286_PyUnicode_InsertThousandsGrouping(int kind, void *data,
8287 Py_ssize_t n_buffer,
8288 void *digits, Py_ssize_t n_digits,
8289 Py_ssize_t min_width,
8290 const char *grouping,
8291 const char *thousands_sep)
8292{
8293 switch(kind) {
8294 case PyUnicode_1BYTE_KIND:
8295 return _PyUnicode_ucs1_InsertThousandsGrouping(
8296 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8297 min_width, grouping, thousands_sep);
8298 case PyUnicode_2BYTE_KIND:
8299 return _PyUnicode_ucs2_InsertThousandsGrouping(
8300 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
8301 min_width, grouping, thousands_sep);
8302 case PyUnicode_4BYTE_KIND:
8303 return _PyUnicode_ucs4_InsertThousandsGrouping(
8304 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
8305 min_width, grouping, thousands_sep);
8306 }
8307 assert(0);
8308 return -1;
8309}
8310
8311
Eric Smith8c663262007-08-25 02:26:07 +00008312#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00008313#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008314
Thomas Wouters477c8d52006-05-27 19:21:47 +00008315#include "stringlib/count.h"
8316#include "stringlib/find.h"
Eric Smith5807c412008-05-11 21:00:57 +00008317
/* Helper macro to fix up start/end slice values against a length `len`:
     - `end` greater than `len` is capped at `len`;
     - a negative `end` or `start` is offset by `len` and floored at 0.
   `start` and `end` must be modifiable lvalues.  Every argument may be
   evaluated more than once, so pass expressions without side effects.
   The body is wrapped in do { } while (0) so the macro behaves as a single
   statement (safe under an unbraced if/else); invoke it with a trailing
   semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00008332
Alexander Belopolsky40018472011-02-26 01:02:56 +00008333Py_ssize_t
8334PyUnicode_Count(PyObject *str,
8335 PyObject *substr,
8336 Py_ssize_t start,
8337 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008338{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008339 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008340 PyUnicodeObject* str_obj;
8341 PyUnicodeObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008342 int kind1, kind2, kind;
8343 void *buf1 = NULL, *buf2 = NULL;
8344 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00008345
Thomas Wouters477c8d52006-05-27 19:21:47 +00008346 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008347 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008348 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008349 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
Victor Stinnere9a29352011-10-01 02:14:59 +02008350 if (!sub_obj || PyUnicode_READY(sub_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008351 Py_DECREF(str_obj);
8352 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008353 }
Tim Petersced69f82003-09-16 20:30:58 +00008354
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008355 kind1 = PyUnicode_KIND(str_obj);
8356 kind2 = PyUnicode_KIND(sub_obj);
8357 kind = kind1 > kind2 ? kind1 : kind2;
8358 buf1 = PyUnicode_DATA(str_obj);
8359 if (kind1 != kind)
8360 buf1 = _PyUnicode_AsKind((PyObject*)str_obj, kind);
8361 if (!buf1)
8362 goto onError;
8363 buf2 = PyUnicode_DATA(sub_obj);
8364 if (kind2 != kind)
8365 buf2 = _PyUnicode_AsKind((PyObject*)sub_obj, kind);
8366 if (!buf2)
8367 goto onError;
8368 len1 = PyUnicode_GET_LENGTH(str_obj);
8369 len2 = PyUnicode_GET_LENGTH(sub_obj);
8370
8371 ADJUST_INDICES(start, end, len1);
8372 switch(kind) {
8373 case PyUnicode_1BYTE_KIND:
8374 result = ucs1lib_count(
8375 ((Py_UCS1*)buf1) + start, end - start,
8376 buf2, len2, PY_SSIZE_T_MAX
8377 );
8378 break;
8379 case PyUnicode_2BYTE_KIND:
8380 result = ucs2lib_count(
8381 ((Py_UCS2*)buf1) + start, end - start,
8382 buf2, len2, PY_SSIZE_T_MAX
8383 );
8384 break;
8385 case PyUnicode_4BYTE_KIND:
8386 result = ucs4lib_count(
8387 ((Py_UCS4*)buf1) + start, end - start,
8388 buf2, len2, PY_SSIZE_T_MAX
8389 );
8390 break;
8391 default:
8392 assert(0); result = 0;
8393 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00008394
8395 Py_DECREF(sub_obj);
8396 Py_DECREF(str_obj);
8397
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008398 if (kind1 != kind)
8399 PyMem_Free(buf1);
8400 if (kind2 != kind)
8401 PyMem_Free(buf2);
8402
Guido van Rossumd57fd912000-03-10 22:53:23 +00008403 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008404 onError:
8405 Py_DECREF(sub_obj);
8406 Py_DECREF(str_obj);
8407 if (kind1 != kind && buf1)
8408 PyMem_Free(buf1);
8409 if (kind2 != kind && buf2)
8410 PyMem_Free(buf2);
8411 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008412}
8413
Alexander Belopolsky40018472011-02-26 01:02:56 +00008414Py_ssize_t
8415PyUnicode_Find(PyObject *str,
8416 PyObject *sub,
8417 Py_ssize_t start,
8418 Py_ssize_t end,
8419 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008420{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008421 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008422
Guido van Rossumd57fd912000-03-10 22:53:23 +00008423 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008424 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008425 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008426 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008427 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008428 Py_DECREF(str);
8429 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008430 }
Tim Petersced69f82003-09-16 20:30:58 +00008431
Thomas Wouters477c8d52006-05-27 19:21:47 +00008432 if (direction > 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008433 result = any_find_slice(
8434 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
8435 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008436 );
8437 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008438 result = any_find_slice(
8439 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
8440 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008441 );
8442
Guido van Rossumd57fd912000-03-10 22:53:23 +00008443 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008444 Py_DECREF(sub);
8445
Guido van Rossumd57fd912000-03-10 22:53:23 +00008446 return result;
8447}
8448
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008449Py_ssize_t
8450PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
8451 Py_ssize_t start, Py_ssize_t end,
8452 int direction)
8453{
8454 char *result;
8455 int kind;
8456 if (PyUnicode_READY(str) == -1)
8457 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02008458 if (start < 0 || end < 0) {
8459 PyErr_SetString(PyExc_IndexError, "string index out of range");
8460 return -2;
8461 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008462 if (end > PyUnicode_GET_LENGTH(str))
8463 end = PyUnicode_GET_LENGTH(str);
8464 kind = PyUnicode_KIND(str);
8465 result = findchar(PyUnicode_1BYTE_DATA(str)
8466 + PyUnicode_KIND_SIZE(kind, start),
8467 kind,
8468 end-start, ch, direction);
8469 if (!result)
8470 return -1;
8471 return (result-(char*)PyUnicode_DATA(str)) >> (kind-1);
8472}
8473
Alexander Belopolsky40018472011-02-26 01:02:56 +00008474static int
8475tailmatch(PyUnicodeObject *self,
8476 PyUnicodeObject *substring,
8477 Py_ssize_t start,
8478 Py_ssize_t end,
8479 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008480{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008481 int kind_self;
8482 int kind_sub;
8483 void *data_self;
8484 void *data_sub;
8485 Py_ssize_t offset;
8486 Py_ssize_t i;
8487 Py_ssize_t end_sub;
8488
8489 if (PyUnicode_READY(self) == -1 ||
8490 PyUnicode_READY(substring) == -1)
8491 return 0;
8492
8493 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008494 return 1;
8495
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008496 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
8497 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008498 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00008499 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008500
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008501 kind_self = PyUnicode_KIND(self);
8502 data_self = PyUnicode_DATA(self);
8503 kind_sub = PyUnicode_KIND(substring);
8504 data_sub = PyUnicode_DATA(substring);
8505 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
8506
8507 if (direction > 0)
8508 offset = end;
8509 else
8510 offset = start;
8511
8512 if (PyUnicode_READ(kind_self, data_self, offset) ==
8513 PyUnicode_READ(kind_sub, data_sub, 0) &&
8514 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
8515 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
8516 /* If both are of the same kind, memcmp is sufficient */
8517 if (kind_self == kind_sub) {
8518 return ! memcmp((char *)data_self +
8519 (offset * PyUnicode_CHARACTER_SIZE(substring)),
8520 data_sub,
8521 PyUnicode_GET_LENGTH(substring) *
8522 PyUnicode_CHARACTER_SIZE(substring));
8523 }
8524 /* otherwise we have to compare each character by first accesing it */
8525 else {
8526 /* We do not need to compare 0 and len(substring)-1 because
8527 the if statement above ensured already that they are equal
8528 when we end up here. */
8529 // TODO: honor direction and do a forward or backwards search
8530 for (i = 1; i < end_sub; ++i) {
8531 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
8532 PyUnicode_READ(kind_sub, data_sub, i))
8533 return 0;
8534 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008535 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008536 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008537 }
8538
8539 return 0;
8540}
8541
Alexander Belopolsky40018472011-02-26 01:02:56 +00008542Py_ssize_t
8543PyUnicode_Tailmatch(PyObject *str,
8544 PyObject *substr,
8545 Py_ssize_t start,
8546 Py_ssize_t end,
8547 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008548{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008549 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008550
Guido van Rossumd57fd912000-03-10 22:53:23 +00008551 str = PyUnicode_FromObject(str);
8552 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008553 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008554 substr = PyUnicode_FromObject(substr);
8555 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008556 Py_DECREF(str);
8557 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008558 }
Tim Petersced69f82003-09-16 20:30:58 +00008559
Guido van Rossumd57fd912000-03-10 22:53:23 +00008560 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00008561 (PyUnicodeObject *)substr,
8562 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008563 Py_DECREF(str);
8564 Py_DECREF(substr);
8565 return result;
8566}
8567
Guido van Rossumd57fd912000-03-10 22:53:23 +00008568/* Apply fixfct filter to the Unicode object self and return a
8569 reference to the modified object */
8570
Alexander Belopolsky40018472011-02-26 01:02:56 +00008571static PyObject *
8572fixup(PyUnicodeObject *self,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008573 Py_UCS4 (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008574{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008575 PyObject *u;
8576 Py_UCS4 maxchar_old, maxchar_new = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008577
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008578 if (PyUnicode_READY(self) == -1)
8579 return NULL;
8580 maxchar_old = PyUnicode_MAX_CHAR_VALUE(self);
8581 u = PyUnicode_New(PyUnicode_GET_LENGTH(self),
8582 maxchar_old);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008583 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008584 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008585
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008586 Py_MEMCPY(PyUnicode_1BYTE_DATA(u), PyUnicode_1BYTE_DATA(self),
8587 PyUnicode_GET_LENGTH(u) * PyUnicode_CHARACTER_SIZE(u));
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008588
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008589 /* fix functions return the new maximum character in a string,
8590 if the kind of the resulting unicode object does not change,
8591 everything is fine. Otherwise we need to change the string kind
8592 and re-run the fix function. */
8593 maxchar_new = fixfct((PyUnicodeObject*)u);
8594 if (maxchar_new == 0)
8595 /* do nothing, keep maxchar_new at 0 which means no changes. */;
8596 else if (maxchar_new <= 127)
8597 maxchar_new = 127;
8598 else if (maxchar_new <= 255)
8599 maxchar_new = 255;
8600 else if (maxchar_new <= 65535)
8601 maxchar_new = 65535;
8602 else
8603 maxchar_new = 1114111; /* 0x10ffff */
8604
8605 if (!maxchar_new && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008606 /* fixfct should return TRUE if it modified the buffer. If
8607 FALSE, return a reference to the original buffer instead
8608 (to save space, not time) */
8609 Py_INCREF(self);
8610 Py_DECREF(u);
8611 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008612 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008613 else if (maxchar_new == maxchar_old) {
8614 return u;
8615 }
8616 else {
8617 /* In case the maximum character changed, we need to
8618 convert the string to the new category. */
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008619 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008620 if (v == NULL) {
8621 Py_DECREF(u);
8622 return NULL;
8623 }
8624 if (maxchar_new > maxchar_old) {
8625 /* If the maxchar increased so that the kind changed, not all
8626 characters are representable anymore and we need to fix the
8627 string again. This only happens in very few cases. */
Victor Stinner157f83f2011-09-28 21:41:31 +02008628 if (PyUnicode_CopyCharacters(v, 0,
8629 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008630 PyUnicode_GET_LENGTH(self)) < 0)
8631 {
8632 Py_DECREF(u);
8633 return NULL;
8634 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008635 maxchar_old = fixfct((PyUnicodeObject*)v);
8636 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
8637 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008638 else {
Victor Stinner157f83f2011-09-28 21:41:31 +02008639 if (PyUnicode_CopyCharacters(v, 0,
8640 u, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008641 PyUnicode_GET_LENGTH(self)) < 0)
8642 {
8643 Py_DECREF(u);
8644 return NULL;
8645 }
8646 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008647
8648 Py_DECREF(u);
8649 return v;
8650 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008651}
8652
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008653static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008654fixupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008655{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008656 /* No need to call PyUnicode_READY(self) because this function is only
8657 called as a callback from fixup() which does it already. */
8658 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8659 const int kind = PyUnicode_KIND(self);
8660 void *data = PyUnicode_DATA(self);
8661 int touched = 0;
8662 Py_UCS4 maxchar = 0;
8663 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008664
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008665 for (i = 0; i < len; ++i) {
8666 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8667 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
8668 if (up != ch) {
8669 if (up > maxchar)
8670 maxchar = up;
8671 PyUnicode_WRITE(kind, data, i, up);
8672 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008673 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008674 else if (ch > maxchar)
8675 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008676 }
8677
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008678 if (touched)
8679 return maxchar;
8680 else
8681 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008682}
8683
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008684static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008685fixlower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008686{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008687 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8688 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8689 const int kind = PyUnicode_KIND(self);
8690 void *data = PyUnicode_DATA(self);
8691 int touched = 0;
8692 Py_UCS4 maxchar = 0;
8693 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008694
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008695 for(i = 0; i < len; ++i) {
8696 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8697 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8698 if (lo != ch) {
8699 if (lo > maxchar)
8700 maxchar = lo;
8701 PyUnicode_WRITE(kind, data, i, lo);
8702 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008703 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008704 else if (ch > maxchar)
8705 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008706 }
8707
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008708 if (touched)
8709 return maxchar;
8710 else
8711 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008712}
8713
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008714static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008715fixswapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008716{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008717 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8718 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8719 const int kind = PyUnicode_KIND(self);
8720 void *data = PyUnicode_DATA(self);
8721 int touched = 0;
8722 Py_UCS4 maxchar = 0;
8723 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008724
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008725 for(i = 0; i < len; ++i) {
8726 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8727 Py_UCS4 nu = 0;
8728
8729 if (Py_UNICODE_ISUPPER(ch))
8730 nu = Py_UNICODE_TOLOWER(ch);
8731 else if (Py_UNICODE_ISLOWER(ch))
8732 nu = Py_UNICODE_TOUPPER(ch);
8733
8734 if (nu != 0) {
8735 if (nu > maxchar)
8736 maxchar = nu;
8737 PyUnicode_WRITE(kind, data, i, nu);
8738 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008739 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008740 else if (ch > maxchar)
8741 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008742 }
8743
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008744 if (touched)
8745 return maxchar;
8746 else
8747 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008748}
8749
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008750static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008751fixcapitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008752{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008753 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8754 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8755 const int kind = PyUnicode_KIND(self);
8756 void *data = PyUnicode_DATA(self);
8757 int touched = 0;
8758 Py_UCS4 maxchar = 0;
8759 Py_ssize_t i = 0;
8760 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00008761
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008762 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008763 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008764
8765 ch = PyUnicode_READ(kind, data, i);
8766 if (!Py_UNICODE_ISUPPER(ch)) {
8767 maxchar = Py_UNICODE_TOUPPER(ch);
8768 PyUnicode_WRITE(kind, data, i, maxchar);
8769 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008770 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008771 ++i;
8772 for(; i < len; ++i) {
8773 ch = PyUnicode_READ(kind, data, i);
8774 if (!Py_UNICODE_ISLOWER(ch)) {
8775 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8776 if (lo > maxchar)
8777 maxchar = lo;
8778 PyUnicode_WRITE(kind, data, i, lo);
8779 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008780 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008781 else if (ch > maxchar)
8782 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008783 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008784
8785 if (touched)
8786 return maxchar;
8787 else
8788 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008789}
8790
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008791static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008792fixtitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008793{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008794 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8795 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8796 const int kind = PyUnicode_KIND(self);
8797 void *data = PyUnicode_DATA(self);
8798 Py_UCS4 maxchar = 0;
8799 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008800 int previous_is_cased;
8801
8802 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008803 if (len == 1) {
8804 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8805 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
8806 if (ti != ch) {
8807 PyUnicode_WRITE(kind, data, i, ti);
8808 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00008809 }
8810 else
8811 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008812 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008813 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008814 for(; i < len; ++i) {
8815 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8816 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00008817
Benjamin Peterson29060642009-01-31 22:14:21 +00008818 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008819 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00008820 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008821 nu = Py_UNICODE_TOTITLE(ch);
8822
8823 if (nu > maxchar)
8824 maxchar = nu;
8825 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00008826
Benjamin Peterson29060642009-01-31 22:14:21 +00008827 if (Py_UNICODE_ISLOWER(ch) ||
8828 Py_UNICODE_ISUPPER(ch) ||
8829 Py_UNICODE_ISTITLE(ch))
8830 previous_is_cased = 1;
8831 else
8832 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008833 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008834 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008835}
8836
Tim Peters8ce9f162004-08-27 01:49:32 +00008837PyObject *
8838PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008839{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008840 PyObject *sep = NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008841 Py_ssize_t seplen = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008842 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00008843 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008844 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
8845 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00008846 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008847 Py_ssize_t sz, i, res_offset;
8848 Py_UCS4 maxchar = 0;
8849 Py_UCS4 item_maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008850
Tim Peters05eba1f2004-08-27 21:32:02 +00008851 fseq = PySequence_Fast(seq, "");
8852 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008853 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00008854 }
8855
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008856 /* NOTE: the following code can't call back into Python code,
8857 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00008858 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008859
Tim Peters05eba1f2004-08-27 21:32:02 +00008860 seqlen = PySequence_Fast_GET_SIZE(fseq);
8861 /* If empty sequence, return u"". */
8862 if (seqlen == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008863 res = PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008864 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00008865 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008866 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00008867 /* If singleton sequence with an exact Unicode, return that. */
8868 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008869 item = items[0];
8870 if (PyUnicode_CheckExact(item)) {
8871 Py_INCREF(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008872 res = item;
Benjamin Peterson29060642009-01-31 22:14:21 +00008873 goto Done;
8874 }
Tim Peters8ce9f162004-08-27 01:49:32 +00008875 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008876 else {
8877 /* Set up sep and seplen */
8878 if (separator == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008879 /* fall back to a blank space separator */
8880 sep = PyUnicode_FromOrdinal(' ');
Victor Stinnere9a29352011-10-01 02:14:59 +02008881 if (!sep)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008882 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00008883 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008884 else {
8885 if (!PyUnicode_Check(separator)) {
8886 PyErr_Format(PyExc_TypeError,
8887 "separator: expected str instance,"
8888 " %.80s found",
8889 Py_TYPE(separator)->tp_name);
8890 goto onError;
8891 }
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02008892 if (PyUnicode_READY(separator))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008893 goto onError;
8894 sep = separator;
8895 seplen = PyUnicode_GET_LENGTH(separator);
8896 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
8897 /* inc refcount to keep this code path symetric with the
8898 above case of a blank separator */
8899 Py_INCREF(sep);
Tim Peters05eba1f2004-08-27 21:32:02 +00008900 }
8901 }
8902
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008903 /* There are at least two things to join, or else we have a subclass
8904 * of str in the sequence.
8905 * Do a pre-pass to figure out the total amount of space we'll
8906 * need (sz), and see whether all argument are strings.
8907 */
8908 sz = 0;
8909 for (i = 0; i < seqlen; i++) {
8910 const Py_ssize_t old_sz = sz;
8911 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00008912 if (!PyUnicode_Check(item)) {
8913 PyErr_Format(PyExc_TypeError,
8914 "sequence item %zd: expected str instance,"
8915 " %.80s found",
8916 i, Py_TYPE(item)->tp_name);
8917 goto onError;
8918 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008919 if (PyUnicode_READY(item) == -1)
8920 goto onError;
8921 sz += PyUnicode_GET_LENGTH(item);
8922 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
8923 if (item_maxchar > maxchar)
8924 maxchar = item_maxchar;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008925 if (i != 0)
8926 sz += seplen;
8927 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
8928 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008929 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008930 goto onError;
8931 }
8932 }
Tim Petersced69f82003-09-16 20:30:58 +00008933
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008934 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008935 if (res == NULL)
8936 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00008937
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008938 /* Catenate everything. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008939 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinner9ce5a832011-10-03 23:36:02 +02008940 Py_ssize_t itemlen, copied;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008941 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00008942 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02008943 if (i && seplen != 0) {
8944 copied = PyUnicode_CopyCharacters(res, res_offset,
8945 sep, 0, seplen);
8946 if (copied < 0)
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008947 goto onError;
Victor Stinner9ce5a832011-10-03 23:36:02 +02008948#ifdef Py_DEBUG
8949 res_offset += copied;
8950#else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008951 res_offset += seplen;
Victor Stinner9ce5a832011-10-03 23:36:02 +02008952#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00008953 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02008954 itemlen = PyUnicode_GET_LENGTH(item);
8955 if (itemlen != 0) {
8956 copied = PyUnicode_CopyCharacters(res, res_offset,
8957 item, 0, itemlen);
8958 if (copied < 0)
8959 goto onError;
8960#ifdef Py_DEBUG
8961 res_offset += copied;
8962#else
8963 res_offset += itemlen;
8964#endif
8965 }
Tim Peters05eba1f2004-08-27 21:32:02 +00008966 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008967 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00008968
Benjamin Peterson29060642009-01-31 22:14:21 +00008969 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00008970 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008971 Py_XDECREF(sep);
8972 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008973
Benjamin Peterson29060642009-01-31 22:14:21 +00008974 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00008975 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008976 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00008977 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008978 return NULL;
8979}
8980
/* FILL(kind, data, value, start, length)
 *
 * Store `value` into `length` consecutive characters of the buffer `data`,
 * beginning at character index `start`.  `kind` selects the element width:
 * PyUnicode_1BYTE_KIND uses memset on bytes, PyUnicode_2BYTE_KIND writes
 * Py_UCS2 elements, and every other kind is handled by the default branch
 * as Py_UCS4 elements.  `kind` must not be PyUnicode_WCHAR_KIND (asserted).
 *
 * Every argument is parenthesized at each use so that compound expressions
 * can be passed safely.  Arguments may be evaluated more than once, so they
 * should not have side effects.
 */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        } \
    } while (0)
9003
Alexander Belopolsky40018472011-02-26 01:02:56 +00009004static PyUnicodeObject *
9005pad(PyUnicodeObject *self,
9006 Py_ssize_t left,
9007 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009008 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009009{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009010 PyObject *u;
9011 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009012 int kind;
9013 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009014
9015 if (left < 0)
9016 left = 0;
9017 if (right < 0)
9018 right = 0;
9019
Tim Peters7a29bd52001-09-12 03:03:31 +00009020 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009021 Py_INCREF(self);
9022 return self;
9023 }
9024
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009025 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9026 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009027 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9028 return NULL;
9029 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009030 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9031 if (fill > maxchar)
9032 maxchar = fill;
9033 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009034 if (!u)
9035 return NULL;
9036
9037 kind = PyUnicode_KIND(u);
9038 data = PyUnicode_DATA(u);
9039 if (left)
9040 FILL(kind, data, fill, 0, left);
9041 if (right)
9042 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinner157f83f2011-09-28 21:41:31 +02009043 if (PyUnicode_CopyCharacters(u, left,
9044 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009045 _PyUnicode_LENGTH(self)) < 0)
9046 {
9047 Py_DECREF(u);
9048 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009049 }
9050
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009051 return (PyUnicodeObject*)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009052}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009053#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00009054
Alexander Belopolsky40018472011-02-26 01:02:56 +00009055PyObject *
9056PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009057{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009058 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009059
9060 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009061 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009062 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009063
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009064 switch(PyUnicode_KIND(string)) {
9065 case PyUnicode_1BYTE_KIND:
9066 list = ucs1lib_splitlines(
9067 (PyObject*) string, PyUnicode_1BYTE_DATA(string),
9068 PyUnicode_GET_LENGTH(string), keepends);
9069 break;
9070 case PyUnicode_2BYTE_KIND:
9071 list = ucs2lib_splitlines(
9072 (PyObject*) string, PyUnicode_2BYTE_DATA(string),
9073 PyUnicode_GET_LENGTH(string), keepends);
9074 break;
9075 case PyUnicode_4BYTE_KIND:
9076 list = ucs4lib_splitlines(
9077 (PyObject*) string, PyUnicode_4BYTE_DATA(string),
9078 PyUnicode_GET_LENGTH(string), keepends);
9079 break;
9080 default:
9081 assert(0);
9082 list = 0;
9083 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009084 Py_DECREF(string);
9085 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009086}
9087
Alexander Belopolsky40018472011-02-26 01:02:56 +00009088static PyObject *
9089split(PyUnicodeObject *self,
9090 PyUnicodeObject *substring,
9091 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009092{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009093 int kind1, kind2, kind;
9094 void *buf1, *buf2;
9095 Py_ssize_t len1, len2;
9096 PyObject* out;
9097
Guido van Rossumd57fd912000-03-10 22:53:23 +00009098 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009099 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009100
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009101 if (PyUnicode_READY(self) == -1)
9102 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009103
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009104 if (substring == NULL)
9105 switch(PyUnicode_KIND(self)) {
9106 case PyUnicode_1BYTE_KIND:
9107 return ucs1lib_split_whitespace(
9108 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9109 PyUnicode_GET_LENGTH(self), maxcount
9110 );
9111 case PyUnicode_2BYTE_KIND:
9112 return ucs2lib_split_whitespace(
9113 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
9114 PyUnicode_GET_LENGTH(self), maxcount
9115 );
9116 case PyUnicode_4BYTE_KIND:
9117 return ucs4lib_split_whitespace(
9118 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
9119 PyUnicode_GET_LENGTH(self), maxcount
9120 );
9121 default:
9122 assert(0);
9123 return NULL;
9124 }
9125
9126 if (PyUnicode_READY(substring) == -1)
9127 return NULL;
9128
9129 kind1 = PyUnicode_KIND(self);
9130 kind2 = PyUnicode_KIND(substring);
9131 kind = kind1 > kind2 ? kind1 : kind2;
9132 buf1 = PyUnicode_DATA(self);
9133 buf2 = PyUnicode_DATA(substring);
9134 if (kind1 != kind)
9135 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9136 if (!buf1)
9137 return NULL;
9138 if (kind2 != kind)
9139 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9140 if (!buf2) {
9141 if (kind1 != kind) PyMem_Free(buf1);
9142 return NULL;
9143 }
9144 len1 = PyUnicode_GET_LENGTH(self);
9145 len2 = PyUnicode_GET_LENGTH(substring);
9146
9147 switch(kind) {
9148 case PyUnicode_1BYTE_KIND:
9149 out = ucs1lib_split(
9150 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9151 break;
9152 case PyUnicode_2BYTE_KIND:
9153 out = ucs2lib_split(
9154 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9155 break;
9156 case PyUnicode_4BYTE_KIND:
9157 out = ucs4lib_split(
9158 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9159 break;
9160 default:
9161 out = NULL;
9162 }
9163 if (kind1 != kind)
9164 PyMem_Free(buf1);
9165 if (kind2 != kind)
9166 PyMem_Free(buf2);
9167 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009168}
9169
Alexander Belopolsky40018472011-02-26 01:02:56 +00009170static PyObject *
9171rsplit(PyUnicodeObject *self,
9172 PyUnicodeObject *substring,
9173 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009174{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009175 int kind1, kind2, kind;
9176 void *buf1, *buf2;
9177 Py_ssize_t len1, len2;
9178 PyObject* out;
9179
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009180 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009181 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009182
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009183 if (PyUnicode_READY(self) == -1)
9184 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009185
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009186 if (substring == NULL)
9187 switch(PyUnicode_KIND(self)) {
9188 case PyUnicode_1BYTE_KIND:
9189 return ucs1lib_rsplit_whitespace(
9190 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9191 PyUnicode_GET_LENGTH(self), maxcount
9192 );
9193 case PyUnicode_2BYTE_KIND:
9194 return ucs2lib_rsplit_whitespace(
9195 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
9196 PyUnicode_GET_LENGTH(self), maxcount
9197 );
9198 case PyUnicode_4BYTE_KIND:
9199 return ucs4lib_rsplit_whitespace(
9200 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
9201 PyUnicode_GET_LENGTH(self), maxcount
9202 );
9203 default:
9204 assert(0);
9205 return NULL;
9206 }
9207
9208 if (PyUnicode_READY(substring) == -1)
9209 return NULL;
9210
9211 kind1 = PyUnicode_KIND(self);
9212 kind2 = PyUnicode_KIND(substring);
9213 kind = kind1 > kind2 ? kind1 : kind2;
9214 buf1 = PyUnicode_DATA(self);
9215 buf2 = PyUnicode_DATA(substring);
9216 if (kind1 != kind)
9217 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9218 if (!buf1)
9219 return NULL;
9220 if (kind2 != kind)
9221 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9222 if (!buf2) {
9223 if (kind1 != kind) PyMem_Free(buf1);
9224 return NULL;
9225 }
9226 len1 = PyUnicode_GET_LENGTH(self);
9227 len2 = PyUnicode_GET_LENGTH(substring);
9228
9229 switch(kind) {
9230 case PyUnicode_1BYTE_KIND:
9231 out = ucs1lib_rsplit(
9232 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9233 break;
9234 case PyUnicode_2BYTE_KIND:
9235 out = ucs2lib_rsplit(
9236 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9237 break;
9238 case PyUnicode_4BYTE_KIND:
9239 out = ucs4lib_rsplit(
9240 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9241 break;
9242 default:
9243 out = NULL;
9244 }
9245 if (kind1 != kind)
9246 PyMem_Free(buf1);
9247 if (kind2 != kind)
9248 PyMem_Free(buf2);
9249 return out;
9250}
9251
9252static Py_ssize_t
9253anylib_find(int kind, void *buf1, Py_ssize_t len1,
9254 void *buf2, Py_ssize_t len2, Py_ssize_t offset)
9255{
9256 switch(kind) {
9257 case PyUnicode_1BYTE_KIND:
9258 return ucs1lib_find(buf1, len1, buf2, len2, offset);
9259 case PyUnicode_2BYTE_KIND:
9260 return ucs2lib_find(buf1, len1, buf2, len2, offset);
9261 case PyUnicode_4BYTE_KIND:
9262 return ucs4lib_find(buf1, len1, buf2, len2, offset);
9263 }
9264 assert(0);
9265 return -1;
9266}
9267
9268static Py_ssize_t
9269anylib_count(int kind, void* sbuf, Py_ssize_t slen,
9270 void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
9271{
9272 switch(kind) {
9273 case PyUnicode_1BYTE_KIND:
9274 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
9275 case PyUnicode_2BYTE_KIND:
9276 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
9277 case PyUnicode_4BYTE_KIND:
9278 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
9279 }
9280 assert(0);
9281 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009282}
9283
Alexander Belopolsky40018472011-02-26 01:02:56 +00009284static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009285replace(PyObject *self, PyObject *str1,
9286 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009287{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009288 PyObject *u;
9289 char *sbuf = PyUnicode_DATA(self);
9290 char *buf1 = PyUnicode_DATA(str1);
9291 char *buf2 = PyUnicode_DATA(str2);
9292 int srelease = 0, release1 = 0, release2 = 0;
9293 int skind = PyUnicode_KIND(self);
9294 int kind1 = PyUnicode_KIND(str1);
9295 int kind2 = PyUnicode_KIND(str2);
9296 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
9297 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
9298 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009299
9300 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009301 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009302 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009303 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009304
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009305 if (skind < kind1)
9306 /* substring too wide to be present */
9307 goto nothing;
9308
9309 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00009310 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009311 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009312 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009313 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009314 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009315 /* replace characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009316 Py_UCS4 u1, u2, maxchar;
9317 int mayshrink, rkind;
9318 u1 = PyUnicode_READ_CHAR(str1, 0);
9319 if (!findchar(sbuf, PyUnicode_KIND(self),
9320 slen, u1, 1))
Thomas Wouters477c8d52006-05-27 19:21:47 +00009321 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009322 u2 = PyUnicode_READ_CHAR(str2, 0);
9323 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9324 /* Replacing u1 with u2 may cause a maxchar reduction in the
9325 result string. */
9326 mayshrink = maxchar > 127;
9327 if (u2 > maxchar) {
9328 maxchar = u2;
9329 mayshrink = 0;
9330 }
9331 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009332 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009333 goto error;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009334 if (PyUnicode_CopyCharacters(u, 0,
9335 (PyObject*)self, 0, slen) < 0)
9336 {
9337 Py_DECREF(u);
9338 return NULL;
9339 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009340 rkind = PyUnicode_KIND(u);
9341 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
9342 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009343 if (--maxcount < 0)
9344 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009345 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009346 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009347 if (mayshrink) {
9348 PyObject *tmp = u;
9349 u = PyUnicode_FromKindAndData(rkind, PyUnicode_DATA(tmp),
9350 PyUnicode_GET_LENGTH(tmp));
9351 Py_DECREF(tmp);
9352 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009353 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009354 int rkind = skind;
9355 char *res;
9356 if (kind1 < rkind) {
9357 /* widen substring */
9358 buf1 = _PyUnicode_AsKind(str1, rkind);
9359 if (!buf1) goto error;
9360 release1 = 1;
9361 }
9362 i = anylib_find(rkind, sbuf, slen, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009363 if (i < 0)
9364 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009365 if (rkind > kind2) {
9366 /* widen replacement */
9367 buf2 = _PyUnicode_AsKind(str2, rkind);
9368 if (!buf2) goto error;
9369 release2 = 1;
9370 }
9371 else if (rkind < kind2) {
9372 /* widen self and buf1 */
9373 rkind = kind2;
9374 if (release1) PyMem_Free(buf1);
9375 sbuf = _PyUnicode_AsKind(self, rkind);
9376 if (!sbuf) goto error;
9377 srelease = 1;
9378 buf1 = _PyUnicode_AsKind(str1, rkind);
9379 if (!buf1) goto error;
9380 release1 = 1;
9381 }
9382 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, slen));
9383 if (!res) {
9384 PyErr_NoMemory();
9385 goto error;
9386 }
9387 memcpy(res, sbuf, PyUnicode_KIND_SIZE(rkind, slen));
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009388 /* change everything in-place, starting with this one */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009389 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
9390 buf2,
9391 PyUnicode_KIND_SIZE(rkind, len2));
9392 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009393
9394 while ( --maxcount > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009395 i = anylib_find(rkind, sbuf+PyUnicode_KIND_SIZE(rkind, i),
9396 slen-i,
9397 buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009398 if (i == -1)
9399 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009400 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
9401 buf2,
9402 PyUnicode_KIND_SIZE(rkind, len2));
9403 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009404 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009405
9406 u = PyUnicode_FromKindAndData(rkind, res, slen);
9407 PyMem_Free(res);
9408 if (!u) goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009409 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009410 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009411
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009412 Py_ssize_t n, i, j, ires;
9413 Py_ssize_t product, new_size;
9414 int rkind = skind;
9415 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009416
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009417 if (kind1 < rkind) {
9418 buf1 = _PyUnicode_AsKind(str1, rkind);
9419 if (!buf1) goto error;
9420 release1 = 1;
9421 }
9422 n = anylib_count(rkind, sbuf, slen, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009423 if (n == 0)
9424 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009425 if (kind2 < rkind) {
9426 buf2 = _PyUnicode_AsKind(str2, rkind);
9427 if (!buf2) goto error;
9428 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009429 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009430 else if (kind2 > rkind) {
9431 rkind = kind2;
9432 sbuf = _PyUnicode_AsKind(self, rkind);
9433 if (!sbuf) goto error;
9434 srelease = 1;
9435 if (release1) PyMem_Free(buf1);
9436 buf1 = _PyUnicode_AsKind(str1, rkind);
9437 if (!buf1) goto error;
9438 release1 = 1;
9439 }
9440 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
9441 PyUnicode_GET_LENGTH(str1))); */
9442 product = n * (len2-len1);
9443 if ((product / (len2-len1)) != n) {
9444 PyErr_SetString(PyExc_OverflowError,
9445 "replace string is too long");
9446 goto error;
9447 }
9448 new_size = slen + product;
9449 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
9450 PyErr_SetString(PyExc_OverflowError,
9451 "replace string is too long");
9452 goto error;
9453 }
9454 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, new_size));
9455 if (!res)
9456 goto error;
9457 ires = i = 0;
9458 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009459 while (n-- > 0) {
9460 /* look for next match */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009461 j = anylib_find(rkind,
9462 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9463 slen-i, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009464 if (j == -1)
9465 break;
9466 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009467 /* copy unchanged part [i:j] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009468 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9469 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9470 PyUnicode_KIND_SIZE(rkind, j-i));
9471 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009472 }
9473 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009474 if (len2 > 0) {
9475 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9476 buf2,
9477 PyUnicode_KIND_SIZE(rkind, len2));
9478 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009479 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009480 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009481 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009482 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009483 /* copy tail [i:] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009484 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9485 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9486 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009487 } else {
9488 /* interleave */
9489 while (n > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009490 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9491 buf2,
9492 PyUnicode_KIND_SIZE(rkind, len2));
9493 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009494 if (--n <= 0)
9495 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009496 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9497 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9498 PyUnicode_KIND_SIZE(rkind, 1));
9499 ires++;
9500 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009501 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009502 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9503 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9504 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009505 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009506 u = PyUnicode_FromKindAndData(rkind, res, new_size);
Martin v. Löwis0b1d3482011-10-01 16:35:40 +02009507 PyMem_Free(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009508 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009509 if (srelease)
9510 PyMem_FREE(sbuf);
9511 if (release1)
9512 PyMem_FREE(buf1);
9513 if (release2)
9514 PyMem_FREE(buf2);
9515 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009516
Benjamin Peterson29060642009-01-31 22:14:21 +00009517 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00009518 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009519 if (srelease)
9520 PyMem_FREE(sbuf);
9521 if (release1)
9522 PyMem_FREE(buf1);
9523 if (release2)
9524 PyMem_FREE(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009525 if (PyUnicode_CheckExact(self)) {
9526 Py_INCREF(self);
9527 return (PyObject *) self;
9528 }
Victor Stinner034f6cf2011-09-30 02:26:44 +02009529 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009530 error:
9531 if (srelease && sbuf)
9532 PyMem_FREE(sbuf);
9533 if (release1 && buf1)
9534 PyMem_FREE(buf1);
9535 if (release2 && buf2)
9536 PyMem_FREE(buf2);
9537 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009538}
9539
9540/* --- Unicode Object Methods --------------------------------------------- */
9541
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009542PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009543 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009544\n\
9545Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009546characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009547
9548static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009549unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009550{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009551 return fixup(self, fixtitle);
9552}
9553
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009554PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009555 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009556\n\
9557Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00009558have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009559
9560static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009561unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009562{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009563 return fixup(self, fixcapitalize);
9564}
9565
#if 0
/* Disabled: this whole section is compiled out by the surrounding #if 0. */
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Split self on whitespace, capitalize every word through fixup() with
   fixcapitalize, then join the words again with PyUnicode_Join() using its
   default separator.  Returns NULL if any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word */
    idx = 0;
    while (idx < PyList_GET_SIZE(words)) {
        result = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                       fixcapitalize);
        if (result == NULL)
            goto finished;
        /* Drop the old word before storing its replacement in the slot. */
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, result);
        idx++;
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

  finished:
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
9603
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009604/* Argument converter. Coerces to a single unicode character */
9605
9606static int
9607convert_uc(PyObject *obj, void *addr)
9608{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009609 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009610 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009611
Benjamin Peterson14339b62009-01-31 16:36:08 +00009612 uniobj = PyUnicode_FromObject(obj);
9613 if (uniobj == NULL) {
9614 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009615 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009616 return 0;
9617 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009618 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009619 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009620 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009621 Py_DECREF(uniobj);
9622 return 0;
9623 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009624 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009625 Py_DECREF(uniobj);
9626 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009627}
9628
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009629PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009630 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009631\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009632Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009633done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009634
9635static PyObject *
9636unicode_center(PyUnicodeObject *self, PyObject *args)
9637{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009638 Py_ssize_t marg, left;
9639 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009640 Py_UCS4 fillchar = ' ';
9641
Victor Stinnere9a29352011-10-01 02:14:59 +02009642 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009643 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009644
Victor Stinnere9a29352011-10-01 02:14:59 +02009645 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009646 return NULL;
9647
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009648 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009649 Py_INCREF(self);
9650 return (PyObject*) self;
9651 }
9652
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009653 marg = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009654 left = marg / 2 + (marg & width & 1);
9655
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009656 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009657}
9658
Marc-André Lemburge5034372000-08-08 08:04:29 +00009659#if 0
9660
9661/* This code should go into some future Unicode collation support
9662 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00009663 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00009664
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009665/* speedy UTF-16 code point order comparison */
9666/* gleaned from: */
9667/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
9668
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009669static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009670{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009671 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00009672 0, 0, 0, 0, 0, 0, 0, 0,
9673 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009674 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009675};
9676
Guido van Rossumd57fd912000-03-10 22:53:23 +00009677static int
9678unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9679{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009680 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009681
Guido van Rossumd57fd912000-03-10 22:53:23 +00009682 Py_UNICODE *s1 = str1->str;
9683 Py_UNICODE *s2 = str2->str;
9684
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009685 len1 = str1->_base._base.length;
9686 len2 = str2->_base._base.length;
Tim Petersced69f82003-09-16 20:30:58 +00009687
Guido van Rossumd57fd912000-03-10 22:53:23 +00009688 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00009689 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009690
9691 c1 = *s1++;
9692 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00009693
Benjamin Peterson29060642009-01-31 22:14:21 +00009694 if (c1 > (1<<11) * 26)
9695 c1 += utf16Fixup[c1>>11];
9696 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009697 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009698 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00009699
9700 if (c1 != c2)
9701 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00009702
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009703 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009704 }
9705
9706 return (len1 < len2) ? -1 : (len1 != len2);
9707}
9708
Marc-André Lemburge5034372000-08-08 08:04:29 +00009709#else
9710
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009711/* This function assumes that str1 and str2 are readied by the caller. */
9712
Marc-André Lemburge5034372000-08-08 08:04:29 +00009713static int
9714unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9715{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009716 int kind1, kind2;
9717 void *data1, *data2;
9718 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009719
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009720 kind1 = PyUnicode_KIND(str1);
9721 kind2 = PyUnicode_KIND(str2);
9722 data1 = PyUnicode_DATA(str1);
9723 data2 = PyUnicode_DATA(str2);
9724 len1 = PyUnicode_GET_LENGTH(str1);
9725 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +00009726
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009727 for (i = 0; i < len1 && i < len2; ++i) {
9728 Py_UCS4 c1, c2;
9729 c1 = PyUnicode_READ(kind1, data1, i);
9730 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +00009731
9732 if (c1 != c2)
9733 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009734 }
9735
9736 return (len1 < len2) ? -1 : (len1 != len2);
9737}
9738
9739#endif
9740
Alexander Belopolsky40018472011-02-26 01:02:56 +00009741int
9742PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009743{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009744 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9745 if (PyUnicode_READY(left) == -1 ||
9746 PyUnicode_READY(right) == -1)
9747 return -1;
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009748 return unicode_compare((PyUnicodeObject *)left,
9749 (PyUnicodeObject *)right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009750 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009751 PyErr_Format(PyExc_TypeError,
9752 "Can't compare %.100s and %.100s",
9753 left->ob_type->tp_name,
9754 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009755 return -1;
9756}
9757
Martin v. Löwis5b222132007-06-10 09:51:05 +00009758int
9759PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
9760{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009761 Py_ssize_t i;
9762 int kind;
9763 void *data;
9764 Py_UCS4 chr;
9765
Victor Stinner910337b2011-10-03 03:20:16 +02009766 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009767 if (PyUnicode_READY(uni) == -1)
9768 return -1;
9769 kind = PyUnicode_KIND(uni);
9770 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +00009771 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009772 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
9773 if (chr != str[i])
9774 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00009775 /* This check keeps Python strings that end in '\0' from comparing equal
9776 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009777 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +00009778 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009779 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00009780 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009781 return 0;
9782}
9783
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009784
Benjamin Peterson29060642009-01-31 22:14:21 +00009785#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00009786 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009787
Alexander Belopolsky40018472011-02-26 01:02:56 +00009788PyObject *
9789PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009790{
9791 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009792
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009793 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9794 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009795 if (PyUnicode_READY(left) == -1 ||
9796 PyUnicode_READY(right) == -1)
9797 return NULL;
9798 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
9799 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009800 if (op == Py_EQ) {
9801 Py_INCREF(Py_False);
9802 return Py_False;
9803 }
9804 if (op == Py_NE) {
9805 Py_INCREF(Py_True);
9806 return Py_True;
9807 }
9808 }
9809 if (left == right)
9810 result = 0;
9811 else
9812 result = unicode_compare((PyUnicodeObject *)left,
9813 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009814
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009815 /* Convert the return value to a Boolean */
9816 switch (op) {
9817 case Py_EQ:
9818 v = TEST_COND(result == 0);
9819 break;
9820 case Py_NE:
9821 v = TEST_COND(result != 0);
9822 break;
9823 case Py_LE:
9824 v = TEST_COND(result <= 0);
9825 break;
9826 case Py_GE:
9827 v = TEST_COND(result >= 0);
9828 break;
9829 case Py_LT:
9830 v = TEST_COND(result == -1);
9831 break;
9832 case Py_GT:
9833 v = TEST_COND(result == 1);
9834 break;
9835 default:
9836 PyErr_BadArgument();
9837 return NULL;
9838 }
9839 Py_INCREF(v);
9840 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009841 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00009842
Brian Curtindfc80e32011-08-10 20:28:54 -05009843 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009844}
9845
Alexander Belopolsky40018472011-02-26 01:02:56 +00009846int
9847PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00009848{
Thomas Wouters477c8d52006-05-27 19:21:47 +00009849 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009850 int kind1, kind2, kind;
9851 void *buf1, *buf2;
9852 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009853 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009854
9855 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00009856 sub = PyUnicode_FromObject(element);
9857 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009858 PyErr_Format(PyExc_TypeError,
9859 "'in <string>' requires string as left operand, not %s",
9860 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009861 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009862 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009863 if (PyUnicode_READY(sub) == -1)
9864 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009865
Thomas Wouters477c8d52006-05-27 19:21:47 +00009866 str = PyUnicode_FromObject(container);
Victor Stinnere9a29352011-10-01 02:14:59 +02009867 if (!str || PyUnicode_READY(str) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009868 Py_DECREF(sub);
9869 return -1;
9870 }
9871
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009872 kind1 = PyUnicode_KIND(str);
9873 kind2 = PyUnicode_KIND(sub);
9874 kind = kind1 > kind2 ? kind1 : kind2;
9875 buf1 = PyUnicode_DATA(str);
9876 buf2 = PyUnicode_DATA(sub);
9877 if (kind1 != kind)
9878 buf1 = _PyUnicode_AsKind((PyObject*)str, kind);
9879 if (!buf1) {
9880 Py_DECREF(sub);
9881 return -1;
9882 }
9883 if (kind2 != kind)
9884 buf2 = _PyUnicode_AsKind((PyObject*)sub, kind);
9885 if (!buf2) {
9886 Py_DECREF(sub);
9887 if (kind1 != kind) PyMem_Free(buf1);
9888 return -1;
9889 }
9890 len1 = PyUnicode_GET_LENGTH(str);
9891 len2 = PyUnicode_GET_LENGTH(sub);
9892
9893 switch(kind) {
9894 case PyUnicode_1BYTE_KIND:
9895 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
9896 break;
9897 case PyUnicode_2BYTE_KIND:
9898 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
9899 break;
9900 case PyUnicode_4BYTE_KIND:
9901 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
9902 break;
9903 default:
9904 result = -1;
9905 assert(0);
9906 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009907
9908 Py_DECREF(str);
9909 Py_DECREF(sub);
9910
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009911 if (kind1 != kind)
9912 PyMem_Free(buf1);
9913 if (kind2 != kind)
9914 PyMem_Free(buf2);
9915
Guido van Rossum403d68b2000-03-13 15:55:09 +00009916 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009917}
9918
Guido van Rossumd57fd912000-03-10 22:53:23 +00009919/* Concat to string or Unicode object giving a new Unicode object. */
9920
Alexander Belopolsky40018472011-02-26 01:02:56 +00009921PyObject *
9922PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009923{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009924 PyObject *u = NULL, *v = NULL, *w;
9925 Py_UCS4 maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009926
9927 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009928 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009929 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009930 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009931 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009932 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009933 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009934
9935 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +02009936 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009937 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009938 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009939 }
Victor Stinnera464fc12011-10-02 20:39:30 +02009940 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009941 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009942 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009943 }
9944
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009945 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinnerff9e50f2011-09-28 22:17:19 +02009946 maxchar = Py_MAX(maxchar, PyUnicode_MAX_CHAR_VALUE(v));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009947
Guido van Rossumd57fd912000-03-10 22:53:23 +00009948 /* Concat the two Unicode strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009949 w = PyUnicode_New(
9950 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
9951 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009952 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009953 goto onError;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009954 if (PyUnicode_CopyCharacters(w, 0, u, 0, PyUnicode_GET_LENGTH(u)) < 0)
9955 goto onError;
Victor Stinner157f83f2011-09-28 21:41:31 +02009956 if (PyUnicode_CopyCharacters(w, PyUnicode_GET_LENGTH(u),
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009957 v, 0,
9958 PyUnicode_GET_LENGTH(v)) < 0)
9959 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009960 Py_DECREF(u);
9961 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009962 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009963
Benjamin Peterson29060642009-01-31 22:14:21 +00009964 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009965 Py_XDECREF(u);
9966 Py_XDECREF(v);
9967 return NULL;
9968}
9969
Walter Dörwald1ab83302007-05-18 17:15:44 +00009970void
Victor Stinner23e56682011-10-03 03:54:37 +02009971PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +00009972{
Victor Stinner23e56682011-10-03 03:54:37 +02009973 PyObject *left, *res;
9974
9975 if (p_left == NULL) {
9976 if (!PyErr_Occurred())
9977 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +00009978 return;
9979 }
Victor Stinner23e56682011-10-03 03:54:37 +02009980 left = *p_left;
9981 if (right == NULL || !PyUnicode_Check(left)) {
9982 if (!PyErr_Occurred())
9983 PyErr_BadInternalCall();
9984 goto error;
9985 }
9986
9987 if (PyUnicode_CheckExact(left) && left != unicode_empty
9988 && PyUnicode_CheckExact(right) && right != unicode_empty
9989 && unicode_resizable(left)
9990 && (_PyUnicode_KIND(right) <= _PyUnicode_KIND(left)
9991 || _PyUnicode_WSTR(left) != NULL))
9992 {
Victor Stinnerb8038952011-10-03 23:27:56 +02009993 Py_ssize_t left_len, right_len, new_len;
9994#ifdef Py_DEBUG
9995 Py_ssize_t copied;
9996#endif
Victor Stinner23e56682011-10-03 03:54:37 +02009997
Victor Stinner23e56682011-10-03 03:54:37 +02009998 if (PyUnicode_READY(left))
9999 goto error;
10000 if (PyUnicode_READY(right))
10001 goto error;
10002
10003 /* FIXME: support ascii+latin1, PyASCIIObject => PyCompactUnicodeObject */
10004 if (PyUnicode_MAX_CHAR_VALUE(right) <= PyUnicode_MAX_CHAR_VALUE(left))
10005 {
Victor Stinnerb8038952011-10-03 23:27:56 +020010006 left_len = PyUnicode_GET_LENGTH(left);
10007 right_len = PyUnicode_GET_LENGTH(right);
10008 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner23e56682011-10-03 03:54:37 +020010009 PyErr_SetString(PyExc_OverflowError,
10010 "strings are too large to concat");
10011 goto error;
10012 }
Victor Stinnerb8038952011-10-03 23:27:56 +020010013 new_len = left_len + right_len;
Victor Stinner23e56682011-10-03 03:54:37 +020010014
10015 /* Now we own the last reference to 'left', so we can resize it
10016 * in-place.
10017 */
10018 if (unicode_resize(&left, new_len) != 0) {
10019 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10020 * deallocated so it cannot be put back into
10021 * 'variable'. The MemoryError is raised when there
10022 * is no value in 'variable', which might (very
10023 * remotely) be a cause of incompatibilities.
10024 */
10025 goto error;
10026 }
10027 /* copy 'right' into the newly allocated area of 'left' */
Victor Stinnerb8038952011-10-03 23:27:56 +020010028#ifdef Py_DEBUG
10029 copied = PyUnicode_CopyCharacters(left, left_len,
Victor Stinner23e56682011-10-03 03:54:37 +020010030 right, 0,
Victor Stinnerb8038952011-10-03 23:27:56 +020010031 right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020010032 assert(0 <= copied);
Victor Stinnerb8038952011-10-03 23:27:56 +020010033#else
10034 PyUnicode_CopyCharacters(left, left_len, right, 0, right_len);
10035#endif
Victor Stinner23e56682011-10-03 03:54:37 +020010036 *p_left = left;
10037 return;
10038 }
10039 }
10040
10041 res = PyUnicode_Concat(left, right);
10042 if (res == NULL)
10043 goto error;
10044 Py_DECREF(left);
10045 *p_left = res;
10046 return;
10047
10048error:
10049 Py_DECREF(*p_left);
10050 *p_left = NULL;
Walter Dörwald1ab83302007-05-18 17:15:44 +000010051}
10052
10053void
10054PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10055{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010056 PyUnicode_Append(pleft, right);
10057 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010058}
10059
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010060PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010061 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010062\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010063Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010064string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010065interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010066
10067static PyObject *
10068unicode_count(PyUnicodeObject *self, PyObject *args)
10069{
10070 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010071 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010072 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010073 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010074 int kind1, kind2, kind;
10075 void *buf1, *buf2;
10076 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010077
Jesus Ceaac451502011-04-20 17:09:23 +020010078 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10079 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010080 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010081
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010082 kind1 = PyUnicode_KIND(self);
10083 kind2 = PyUnicode_KIND(substring);
10084 kind = kind1 > kind2 ? kind1 : kind2;
10085 buf1 = PyUnicode_DATA(self);
10086 buf2 = PyUnicode_DATA(substring);
10087 if (kind1 != kind)
10088 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
10089 if (!buf1) {
10090 Py_DECREF(substring);
10091 return NULL;
10092 }
10093 if (kind2 != kind)
10094 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
10095 if (!buf2) {
10096 Py_DECREF(substring);
10097 if (kind1 != kind) PyMem_Free(buf1);
10098 return NULL;
10099 }
10100 len1 = PyUnicode_GET_LENGTH(self);
10101 len2 = PyUnicode_GET_LENGTH(substring);
10102
10103 ADJUST_INDICES(start, end, len1);
10104 switch(kind) {
10105 case PyUnicode_1BYTE_KIND:
10106 iresult = ucs1lib_count(
10107 ((Py_UCS1*)buf1) + start, end - start,
10108 buf2, len2, PY_SSIZE_T_MAX
10109 );
10110 break;
10111 case PyUnicode_2BYTE_KIND:
10112 iresult = ucs2lib_count(
10113 ((Py_UCS2*)buf1) + start, end - start,
10114 buf2, len2, PY_SSIZE_T_MAX
10115 );
10116 break;
10117 case PyUnicode_4BYTE_KIND:
10118 iresult = ucs4lib_count(
10119 ((Py_UCS4*)buf1) + start, end - start,
10120 buf2, len2, PY_SSIZE_T_MAX
10121 );
10122 break;
10123 default:
10124 assert(0); iresult = 0;
10125 }
10126
10127 result = PyLong_FromSsize_t(iresult);
10128
10129 if (kind1 != kind)
10130 PyMem_Free(buf1);
10131 if (kind2 != kind)
10132 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010133
10134 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010135
Guido van Rossumd57fd912000-03-10 22:53:23 +000010136 return result;
10137}
10138
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010139PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010140 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010141\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010142Encode S using the codec registered for encoding. Default encoding\n\
10143is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010144handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010145a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10146'xmlcharrefreplace' as well as any other name registered with\n\
10147codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010148
10149static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +000010150unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010151{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010152 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010153 char *encoding = NULL;
10154 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010155
Benjamin Peterson308d6372009-09-18 21:42:35 +000010156 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10157 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010158 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +000010159 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010160}
10161
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010162PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010163 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010164\n\
10165Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010166If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010167
10168static PyObject*
10169unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
10170{
10171 Py_UNICODE *e;
10172 Py_UNICODE *p;
10173 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010174 Py_UNICODE *qe;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010175 Py_ssize_t i, j, incr, wstr_length;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010176 PyUnicodeObject *u;
10177 int tabsize = 8;
10178
10179 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010180 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010181
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010182 if (PyUnicode_AsUnicodeAndSize((PyObject *)self, &wstr_length) == NULL)
10183 return NULL;
10184
Thomas Wouters7e474022000-07-16 12:04:32 +000010185 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010186 i = 0; /* chars up to and including most recent \n or \r */
10187 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010188 e = _PyUnicode_WSTR(self) + wstr_length; /* end of input */
10189 for (p = _PyUnicode_WSTR(self); p < e; p++)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010190 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010191 if (tabsize > 0) {
10192 incr = tabsize - (j % tabsize); /* cannot overflow */
10193 if (j > PY_SSIZE_T_MAX - incr)
10194 goto overflow1;
10195 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010196 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010197 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010198 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000010199 if (j > PY_SSIZE_T_MAX - 1)
10200 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010201 j++;
10202 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010203 if (i > PY_SSIZE_T_MAX - j)
10204 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010205 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010206 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010207 }
10208 }
10209
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010210 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +000010211 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +000010212
Guido van Rossumd57fd912000-03-10 22:53:23 +000010213 /* Second pass: create output string and fill it */
10214 u = _PyUnicode_New(i + j);
10215 if (!u)
10216 return NULL;
10217
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010218 j = 0; /* same as in first pass */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010219 q = _PyUnicode_WSTR(u); /* next output char */
10220 qe = _PyUnicode_WSTR(u) + PyUnicode_GET_SIZE(u); /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +000010221
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010222 for (p = _PyUnicode_WSTR(self); p < e; p++)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010223 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010224 if (tabsize > 0) {
10225 i = tabsize - (j % tabsize);
10226 j += i;
10227 while (i--) {
10228 if (q >= qe)
10229 goto overflow2;
10230 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010231 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010232 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010233 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010234 else {
10235 if (q >= qe)
10236 goto overflow2;
10237 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010238 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010239 if (*p == '\n' || *p == '\r')
10240 j = 0;
10241 }
10242
Victor Stinner1b4f9ce2011-10-03 13:28:14 +020010243 if (_PyUnicode_READY_REPLACE(&u)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010244 Py_DECREF(u);
10245 return NULL;
10246 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010247 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010248
10249 overflow2:
10250 Py_DECREF(u);
10251 overflow1:
10252 PyErr_SetString(PyExc_OverflowError, "new string is too long");
10253 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010254}
10255
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010256PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010257 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010258\n\
10259Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010260such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010261arguments start and end are interpreted as in slice notation.\n\
10262\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010263Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010264
10265static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010266unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010267{
Jesus Ceaac451502011-04-20 17:09:23 +020010268 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010269 Py_ssize_t start;
10270 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010271 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010272
Jesus Ceaac451502011-04-20 17:09:23 +020010273 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
10274 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010275 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010276
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010277 if (PyUnicode_READY(self) == -1)
10278 return NULL;
10279 if (PyUnicode_READY(substring) == -1)
10280 return NULL;
10281
10282 result = any_find_slice(
10283 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
10284 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010285 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010286
10287 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010288
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010289 if (result == -2)
10290 return NULL;
10291
Christian Heimes217cfd12007-12-02 14:31:20 +000010292 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010293}
10294
10295static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010296unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010297{
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010298 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
10299 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010300 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010301 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010302}
10303
Guido van Rossumc2504932007-09-18 19:42:40 +000010304/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010010305 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000010306static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +000010307unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010308{
Guido van Rossumc2504932007-09-18 19:42:40 +000010309 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010010310 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010311
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010312 if (_PyUnicode_HASH(self) != -1)
10313 return _PyUnicode_HASH(self);
10314 if (PyUnicode_READY(self) == -1)
10315 return -1;
10316 len = PyUnicode_GET_LENGTH(self);
10317
10318 /* The hash function as a macro, gets expanded three times below. */
10319#define HASH(P) \
10320 x = (Py_uhash_t)*P << 7; \
10321 while (--len >= 0) \
10322 x = (1000003*x) ^ (Py_uhash_t)*P++;
10323
10324 switch (PyUnicode_KIND(self)) {
10325 case PyUnicode_1BYTE_KIND: {
10326 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
10327 HASH(c);
10328 break;
10329 }
10330 case PyUnicode_2BYTE_KIND: {
10331 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
10332 HASH(s);
10333 break;
10334 }
10335 default: {
10336 Py_UCS4 *l;
10337 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
10338 "Impossible switch case in unicode_hash");
10339 l = PyUnicode_4BYTE_DATA(self);
10340 HASH(l);
10341 break;
10342 }
10343 }
10344 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
10345
Guido van Rossumc2504932007-09-18 19:42:40 +000010346 if (x == -1)
10347 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010348 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010349 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010350}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010351#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000010352
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010353PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010354 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010355\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010356Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010357
10358static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010359unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010360{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010361 Py_ssize_t result;
Jesus Ceaac451502011-04-20 17:09:23 +020010362 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010363 Py_ssize_t start;
10364 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010365
Jesus Ceaac451502011-04-20 17:09:23 +020010366 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
10367 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010368 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010369
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010370 if (PyUnicode_READY(self) == -1)
10371 return NULL;
10372 if (PyUnicode_READY(substring) == -1)
10373 return NULL;
10374
10375 result = any_find_slice(
10376 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
10377 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010378 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010379
10380 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010381
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010382 if (result == -2)
10383 return NULL;
10384
Guido van Rossumd57fd912000-03-10 22:53:23 +000010385 if (result < 0) {
10386 PyErr_SetString(PyExc_ValueError, "substring not found");
10387 return NULL;
10388 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010389
Christian Heimes217cfd12007-12-02 14:31:20 +000010390 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010391}
10392
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010393PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010394 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010395\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010396Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010397at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010398
10399static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010400unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010401{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010402 Py_ssize_t i, length;
10403 int kind;
10404 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010405 int cased;
10406
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010407 if (PyUnicode_READY(self) == -1)
10408 return NULL;
10409 length = PyUnicode_GET_LENGTH(self);
10410 kind = PyUnicode_KIND(self);
10411 data = PyUnicode_DATA(self);
10412
Guido van Rossumd57fd912000-03-10 22:53:23 +000010413 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010414 if (length == 1)
10415 return PyBool_FromLong(
10416 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010417
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010418 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010419 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010420 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010421
Guido van Rossumd57fd912000-03-10 22:53:23 +000010422 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010423 for (i = 0; i < length; i++) {
10424 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010425
Benjamin Peterson29060642009-01-31 22:14:21 +000010426 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
10427 return PyBool_FromLong(0);
10428 else if (!cased && Py_UNICODE_ISLOWER(ch))
10429 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010430 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010431 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010432}
10433
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010434PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010435 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010436\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010437Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010438at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010439
10440static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010441unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010442{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010443 Py_ssize_t i, length;
10444 int kind;
10445 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010446 int cased;
10447
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010448 if (PyUnicode_READY(self) == -1)
10449 return NULL;
10450 length = PyUnicode_GET_LENGTH(self);
10451 kind = PyUnicode_KIND(self);
10452 data = PyUnicode_DATA(self);
10453
Guido van Rossumd57fd912000-03-10 22:53:23 +000010454 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010455 if (length == 1)
10456 return PyBool_FromLong(
10457 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010458
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010459 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010460 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010461 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010462
Guido van Rossumd57fd912000-03-10 22:53:23 +000010463 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010464 for (i = 0; i < length; i++) {
10465 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010466
Benjamin Peterson29060642009-01-31 22:14:21 +000010467 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
10468 return PyBool_FromLong(0);
10469 else if (!cased && Py_UNICODE_ISUPPER(ch))
10470 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010471 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010472 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010473}
10474
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010475PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010476 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010477\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010478Return True if S is a titlecased string and there is at least one\n\
10479character in S, i.e. upper- and titlecase characters may only\n\
10480follow uncased characters and lowercase characters only cased ones.\n\
10481Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010482
10483static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010484unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010485{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010486 Py_ssize_t i, length;
10487 int kind;
10488 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010489 int cased, previous_is_cased;
10490
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010491 if (PyUnicode_READY(self) == -1)
10492 return NULL;
10493 length = PyUnicode_GET_LENGTH(self);
10494 kind = PyUnicode_KIND(self);
10495 data = PyUnicode_DATA(self);
10496
Guido van Rossumd57fd912000-03-10 22:53:23 +000010497 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010498 if (length == 1) {
10499 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10500 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
10501 (Py_UNICODE_ISUPPER(ch) != 0));
10502 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010503
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010504 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010505 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010506 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010507
Guido van Rossumd57fd912000-03-10 22:53:23 +000010508 cased = 0;
10509 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010510 for (i = 0; i < length; i++) {
10511 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010512
Benjamin Peterson29060642009-01-31 22:14:21 +000010513 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
10514 if (previous_is_cased)
10515 return PyBool_FromLong(0);
10516 previous_is_cased = 1;
10517 cased = 1;
10518 }
10519 else if (Py_UNICODE_ISLOWER(ch)) {
10520 if (!previous_is_cased)
10521 return PyBool_FromLong(0);
10522 previous_is_cased = 1;
10523 cased = 1;
10524 }
10525 else
10526 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010527 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010528 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010529}
10530
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010531PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010532 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010533\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010534Return True if all characters in S are whitespace\n\
10535and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010536
10537static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010538unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010539{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010540 Py_ssize_t i, length;
10541 int kind;
10542 void *data;
10543
10544 if (PyUnicode_READY(self) == -1)
10545 return NULL;
10546 length = PyUnicode_GET_LENGTH(self);
10547 kind = PyUnicode_KIND(self);
10548 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010549
Guido van Rossumd57fd912000-03-10 22:53:23 +000010550 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010551 if (length == 1)
10552 return PyBool_FromLong(
10553 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010554
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010555 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010556 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010557 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010558
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010559 for (i = 0; i < length; i++) {
10560 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010561 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010562 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010563 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010564 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010565}
10566
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010567PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010568 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010569\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010570Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010571and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010572
10573static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010574unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010575{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010576 Py_ssize_t i, length;
10577 int kind;
10578 void *data;
10579
10580 if (PyUnicode_READY(self) == -1)
10581 return NULL;
10582 length = PyUnicode_GET_LENGTH(self);
10583 kind = PyUnicode_KIND(self);
10584 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010585
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010586 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010587 if (length == 1)
10588 return PyBool_FromLong(
10589 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010590
10591 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010592 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010593 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010594
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010595 for (i = 0; i < length; i++) {
10596 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010597 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010598 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010599 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010600}
10601
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010602PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010603 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010604\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010605Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010606and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010607
10608static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010609unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010610{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010611 int kind;
10612 void *data;
10613 Py_ssize_t len, i;
10614
10615 if (PyUnicode_READY(self) == -1)
10616 return NULL;
10617
10618 kind = PyUnicode_KIND(self);
10619 data = PyUnicode_DATA(self);
10620 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010621
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010622 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010623 if (len == 1) {
10624 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10625 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
10626 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010627
10628 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010629 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010630 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010631
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010632 for (i = 0; i < len; i++) {
10633 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010634 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010635 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010636 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010637 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010638}
10639
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010640PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010641 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010642\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010643Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010644False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010645
10646static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010647unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010648{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010649 Py_ssize_t i, length;
10650 int kind;
10651 void *data;
10652
10653 if (PyUnicode_READY(self) == -1)
10654 return NULL;
10655 length = PyUnicode_GET_LENGTH(self);
10656 kind = PyUnicode_KIND(self);
10657 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010658
Guido van Rossumd57fd912000-03-10 22:53:23 +000010659 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010660 if (length == 1)
10661 return PyBool_FromLong(
10662 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010663
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010664 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010665 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010666 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010667
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010668 for (i = 0; i < length; i++) {
10669 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010670 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010671 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010672 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010673}
10674
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010675PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010676 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010677\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010678Return True if all characters in S are digits\n\
10679and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010680
10681static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010682unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010683{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010684 Py_ssize_t i, length;
10685 int kind;
10686 void *data;
10687
10688 if (PyUnicode_READY(self) == -1)
10689 return NULL;
10690 length = PyUnicode_GET_LENGTH(self);
10691 kind = PyUnicode_KIND(self);
10692 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010693
Guido van Rossumd57fd912000-03-10 22:53:23 +000010694 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010695 if (length == 1) {
10696 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10697 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
10698 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010699
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010700 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010701 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010702 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010703
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010704 for (i = 0; i < length; i++) {
10705 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010706 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010707 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010708 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010709}
10710
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010711PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010712 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010713\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010714Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010715False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010716
10717static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010718unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010719{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010720 Py_ssize_t i, length;
10721 int kind;
10722 void *data;
10723
10724 if (PyUnicode_READY(self) == -1)
10725 return NULL;
10726 length = PyUnicode_GET_LENGTH(self);
10727 kind = PyUnicode_KIND(self);
10728 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010729
Guido van Rossumd57fd912000-03-10 22:53:23 +000010730 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010731 if (length == 1)
10732 return PyBool_FromLong(
10733 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010734
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010735 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010736 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010737 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010738
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010739 for (i = 0; i < length; i++) {
10740 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010741 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010742 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010743 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010744}
10745
Martin v. Löwis47383402007-08-15 07:32:56 +000010746int
10747PyUnicode_IsIdentifier(PyObject *self)
10748{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010749 int kind;
10750 void *data;
10751 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010752 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000010753
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010754 if (PyUnicode_READY(self) == -1) {
10755 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000010756 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010757 }
10758
10759 /* Special case for empty strings */
10760 if (PyUnicode_GET_LENGTH(self) == 0)
10761 return 0;
10762 kind = PyUnicode_KIND(self);
10763 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000010764
10765 /* PEP 3131 says that the first character must be in
10766 XID_Start and subsequent characters in XID_Continue,
10767 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000010768 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000010769 letters, digits, underscore). However, given the current
10770 definition of XID_Start and XID_Continue, it is sufficient
10771 to check just for these, except that _ must be allowed
10772 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010773 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050010774 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000010775 return 0;
10776
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040010777 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010778 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010779 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000010780 return 1;
10781}
10782
10783PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010784 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000010785\n\
10786Return True if S is a valid identifier according\n\
10787to the language definition.");
10788
10789static PyObject*
10790unicode_isidentifier(PyObject *self)
10791{
10792 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
10793}
10794
Georg Brandl559e5d72008-06-11 18:37:52 +000010795PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010796 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000010797\n\
10798Return True if all characters in S are considered\n\
10799printable in repr() or S is empty, False otherwise.");
10800
10801static PyObject*
10802unicode_isprintable(PyObject *self)
10803{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010804 Py_ssize_t i, length;
10805 int kind;
10806 void *data;
10807
10808 if (PyUnicode_READY(self) == -1)
10809 return NULL;
10810 length = PyUnicode_GET_LENGTH(self);
10811 kind = PyUnicode_KIND(self);
10812 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000010813
10814 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010815 if (length == 1)
10816 return PyBool_FromLong(
10817 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000010818
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010819 for (i = 0; i < length; i++) {
10820 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000010821 Py_RETURN_FALSE;
10822 }
10823 }
10824 Py_RETURN_TRUE;
10825}
10826
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010827PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000010828 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010829\n\
10830Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000010831iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010832
10833static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010834unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010835{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010836 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010837}
10838
Martin v. Löwis18e16552006-02-15 17:27:45 +000010839static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +000010840unicode_length(PyUnicodeObject *self)
10841{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010842 if (PyUnicode_READY(self) == -1)
10843 return -1;
10844 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010845}
10846
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010847PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010848 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010849\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000010850Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010851done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010852
10853static PyObject *
10854unicode_ljust(PyUnicodeObject *self, PyObject *args)
10855{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010856 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010857 Py_UCS4 fillchar = ' ';
10858
10859 if (PyUnicode_READY(self) == -1)
10860 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010861
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010862 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010863 return NULL;
10864
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010865 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010866 Py_INCREF(self);
10867 return (PyObject*) self;
10868 }
10869
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010870 return (PyObject*) pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010871}
10872
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010873PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010874 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010875\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010876Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010877
10878static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010879unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010880{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010881 return fixup(self, fixlower);
10882}
10883
/* Strip modes; the values double as indexes into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument format strings, one per strip mode above. */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Bare method name for mode i: skips the 3-character "|O:" prefix. */
#define STRIPNAME(i) (stripformat[i] + 3)
10892
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010893/* externally visible for str.strip(unicode) */
10894PyObject *
10895_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
10896{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010897 void *data;
10898 int kind;
10899 Py_ssize_t i, j, len;
10900 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010901
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010902 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
10903 return NULL;
10904
10905 kind = PyUnicode_KIND(self);
10906 data = PyUnicode_DATA(self);
10907 len = PyUnicode_GET_LENGTH(self);
10908 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
10909 PyUnicode_DATA(sepobj),
10910 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010911
Benjamin Peterson14339b62009-01-31 16:36:08 +000010912 i = 0;
10913 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010914 while (i < len &&
10915 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010916 i++;
10917 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010918 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010919
Benjamin Peterson14339b62009-01-31 16:36:08 +000010920 j = len;
10921 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010922 do {
10923 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010924 } while (j >= i &&
10925 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000010926 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010927 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010928
Victor Stinner12bab6d2011-10-01 01:53:49 +020010929 return PyUnicode_Substring((PyObject*)self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010930}
10931
10932PyObject*
10933PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
10934{
10935 unsigned char *data;
10936 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020010937 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010938
Victor Stinnerde636f32011-10-01 03:55:54 +020010939 if (PyUnicode_READY(self) == -1)
10940 return NULL;
10941
10942 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
10943
Victor Stinner12bab6d2011-10-01 01:53:49 +020010944 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010945 {
Victor Stinner12bab6d2011-10-01 01:53:49 +020010946 if (PyUnicode_CheckExact(self)) {
10947 Py_INCREF(self);
10948 return self;
10949 }
10950 else
10951 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010952 }
10953
Victor Stinner12bab6d2011-10-01 01:53:49 +020010954 length = end - start;
10955 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010956 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010957
Victor Stinnerde636f32011-10-01 03:55:54 +020010958 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020010959 PyErr_SetString(PyExc_IndexError, "string index out of range");
10960 return NULL;
10961 }
10962
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010963 kind = PyUnicode_KIND(self);
10964 data = PyUnicode_1BYTE_DATA(self);
Victor Stinner034f6cf2011-09-30 02:26:44 +020010965 return PyUnicode_FromKindAndData(kind,
10966 data + PyUnicode_KIND_SIZE(kind, start),
Victor Stinner12bab6d2011-10-01 01:53:49 +020010967 length);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010968}
Guido van Rossumd57fd912000-03-10 22:53:23 +000010969
10970static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010971do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010972{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010973 int kind;
10974 void *data;
10975 Py_ssize_t len, i, j;
10976
10977 if (PyUnicode_READY(self) == -1)
10978 return NULL;
10979
10980 kind = PyUnicode_KIND(self);
10981 data = PyUnicode_DATA(self);
10982 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010983
Benjamin Peterson14339b62009-01-31 16:36:08 +000010984 i = 0;
10985 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010986 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010987 i++;
10988 }
10989 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010990
Benjamin Peterson14339b62009-01-31 16:36:08 +000010991 j = len;
10992 if (striptype != LEFTSTRIP) {
10993 do {
10994 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010995 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000010996 j++;
10997 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010998
Victor Stinner12bab6d2011-10-01 01:53:49 +020010999 return PyUnicode_Substring((PyObject*)self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011000}
11001
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011002
11003static PyObject *
11004do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
11005{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011006 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011007
Benjamin Peterson14339b62009-01-31 16:36:08 +000011008 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11009 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011010
Benjamin Peterson14339b62009-01-31 16:36:08 +000011011 if (sep != NULL && sep != Py_None) {
11012 if (PyUnicode_Check(sep))
11013 return _PyUnicode_XStrip(self, striptype, sep);
11014 else {
11015 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011016 "%s arg must be None or str",
11017 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011018 return NULL;
11019 }
11020 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011021
Benjamin Peterson14339b62009-01-31 16:36:08 +000011022 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011023}
11024
11025
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011026PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011027 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011028\n\
11029Return a copy of the string S with leading and trailing\n\
11030whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011031If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011032
11033static PyObject *
11034unicode_strip(PyUnicodeObject *self, PyObject *args)
11035{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011036 if (PyTuple_GET_SIZE(args) == 0)
11037 return do_strip(self, BOTHSTRIP); /* Common case */
11038 else
11039 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011040}
11041
11042
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011043PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011044 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011045\n\
11046Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011047If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011048
11049static PyObject *
11050unicode_lstrip(PyUnicodeObject *self, PyObject *args)
11051{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011052 if (PyTuple_GET_SIZE(args) == 0)
11053 return do_strip(self, LEFTSTRIP); /* Common case */
11054 else
11055 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011056}
11057
11058
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011059PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011060 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011061\n\
11062Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011063If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011064
11065static PyObject *
11066unicode_rstrip(PyUnicodeObject *self, PyObject *args)
11067{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011068 if (PyTuple_GET_SIZE(args) == 0)
11069 return do_strip(self, RIGHTSTRIP); /* Common case */
11070 else
11071 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011072}
11073
11074
Guido van Rossumd57fd912000-03-10 22:53:23 +000011075static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +000011076unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011077{
11078 PyUnicodeObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011079 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011080
Georg Brandl222de0f2009-04-12 12:01:50 +000011081 if (len < 1) {
11082 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020011083 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000011084 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011085
Tim Peters7a29bd52001-09-12 03:03:31 +000011086 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011087 /* no repeat, return original string */
11088 Py_INCREF(str);
11089 return (PyObject*) str;
11090 }
Tim Peters8f422462000-09-09 06:13:41 +000011091
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011092 if (PyUnicode_READY(str) == -1)
11093 return NULL;
11094
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011095 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011096 PyErr_SetString(PyExc_OverflowError,
11097 "repeated string is too long");
11098 return NULL;
11099 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011100 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011101
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011102 u = (PyUnicodeObject *)PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011103 if (!u)
11104 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011105 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011106
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011107 if (PyUnicode_GET_LENGTH(str) == 1) {
11108 const int kind = PyUnicode_KIND(str);
11109 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
11110 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011111 if (kind == PyUnicode_1BYTE_KIND)
11112 memset(to, (unsigned char)fill_char, len);
11113 else {
11114 for (n = 0; n < len; ++n)
11115 PyUnicode_WRITE(kind, to, n, fill_char);
11116 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011117 }
11118 else {
11119 /* number of characters copied this far */
11120 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
11121 const Py_ssize_t char_size = PyUnicode_CHARACTER_SIZE(str);
11122 char *to = (char *) PyUnicode_DATA(u);
11123 Py_MEMCPY(to, PyUnicode_DATA(str),
11124 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011125 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011126 n = (done <= nchars-done) ? done : nchars-done;
11127 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011128 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011129 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011130 }
11131
11132 return (PyObject*) u;
11133}
11134
Alexander Belopolsky40018472011-02-26 01:02:56 +000011135PyObject *
11136PyUnicode_Replace(PyObject *obj,
11137 PyObject *subobj,
11138 PyObject *replobj,
11139 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011140{
11141 PyObject *self;
11142 PyObject *str1;
11143 PyObject *str2;
11144 PyObject *result;
11145
11146 self = PyUnicode_FromObject(obj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011147 if (self == NULL || PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011148 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011149 str1 = PyUnicode_FromObject(subobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011150 if (str1 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011151 Py_DECREF(self);
11152 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011153 }
11154 str2 = PyUnicode_FromObject(replobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011155 if (str2 == NULL || PyUnicode_READY(str2)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011156 Py_DECREF(self);
11157 Py_DECREF(str1);
11158 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011159 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011160 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011161 Py_DECREF(self);
11162 Py_DECREF(str1);
11163 Py_DECREF(str2);
11164 return result;
11165}
11166
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011167PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011168 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011169\n\
11170Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011171old replaced by new. If the optional argument count is\n\
11172given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011173
11174static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011175unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011176{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011177 PyObject *str1;
11178 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011179 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011180 PyObject *result;
11181
Martin v. Löwis18e16552006-02-15 17:27:45 +000011182 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011183 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011184 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011185 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011186 str1 = PyUnicode_FromObject(str1);
11187 if (str1 == NULL || PyUnicode_READY(str1) == -1)
11188 return NULL;
11189 str2 = PyUnicode_FromObject(str2);
Victor Stinnere9a29352011-10-01 02:14:59 +020011190 if (str2 == NULL || PyUnicode_READY(str2) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011191 Py_DECREF(str1);
11192 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011193 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011194
11195 result = replace(self, str1, str2, maxcount);
11196
11197 Py_DECREF(str1);
11198 Py_DECREF(str2);
11199 return result;
11200}
11201
Alexander Belopolsky40018472011-02-26 01:02:56 +000011202static PyObject *
11203unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011204{
Walter Dörwald79e913e2007-05-12 11:08:06 +000011205 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011206 Py_ssize_t isize;
11207 Py_ssize_t osize, squote, dquote, i, o;
11208 Py_UCS4 max, quote;
11209 int ikind, okind;
11210 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000011211
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011212 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000011213 return NULL;
11214
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011215 isize = PyUnicode_GET_LENGTH(unicode);
11216 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011217
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011218 /* Compute length of output, quote characters, and
11219 maximum character */
11220 osize = 2; /* quotes */
11221 max = 127;
11222 squote = dquote = 0;
11223 ikind = PyUnicode_KIND(unicode);
11224 for (i = 0; i < isize; i++) {
11225 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11226 switch (ch) {
11227 case '\'': squote++; osize++; break;
11228 case '"': dquote++; osize++; break;
11229 case '\\': case '\t': case '\r': case '\n':
11230 osize += 2; break;
11231 default:
11232 /* Fast-path ASCII */
11233 if (ch < ' ' || ch == 0x7f)
11234 osize += 4; /* \xHH */
11235 else if (ch < 0x7f)
11236 osize++;
11237 else if (Py_UNICODE_ISPRINTABLE(ch)) {
11238 osize++;
11239 max = ch > max ? ch : max;
11240 }
11241 else if (ch < 0x100)
11242 osize += 4; /* \xHH */
11243 else if (ch < 0x10000)
11244 osize += 6; /* \uHHHH */
11245 else
11246 osize += 10; /* \uHHHHHHHH */
11247 }
11248 }
11249
11250 quote = '\'';
11251 if (squote) {
11252 if (dquote)
11253 /* Both squote and dquote present. Use squote,
11254 and escape them */
11255 osize += squote;
11256 else
11257 quote = '"';
11258 }
11259
11260 repr = PyUnicode_New(osize, max);
11261 if (repr == NULL)
11262 return NULL;
11263 okind = PyUnicode_KIND(repr);
11264 odata = PyUnicode_DATA(repr);
11265
11266 PyUnicode_WRITE(okind, odata, 0, quote);
11267 PyUnicode_WRITE(okind, odata, osize-1, quote);
11268
11269 for (i = 0, o = 1; i < isize; i++) {
11270 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011271
11272 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011273 if ((ch == quote) || (ch == '\\')) {
11274 PyUnicode_WRITE(okind, odata, o++, '\\');
11275 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011276 continue;
11277 }
11278
Benjamin Peterson29060642009-01-31 22:14:21 +000011279 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011280 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011281 PyUnicode_WRITE(okind, odata, o++, '\\');
11282 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011283 }
11284 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011285 PyUnicode_WRITE(okind, odata, o++, '\\');
11286 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011287 }
11288 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011289 PyUnicode_WRITE(okind, odata, o++, '\\');
11290 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011291 }
11292
11293 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011294 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011295 PyUnicode_WRITE(okind, odata, o++, '\\');
11296 PyUnicode_WRITE(okind, odata, o++, 'x');
11297 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
11298 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011299 }
11300
Georg Brandl559e5d72008-06-11 18:37:52 +000011301 /* Copy ASCII characters as-is */
11302 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011303 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011304 }
11305
Benjamin Peterson29060642009-01-31 22:14:21 +000011306 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000011307 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011308 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000011309 (categories Z* and C* except ASCII space)
11310 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011311 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011312 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011313 if (ch <= 0xff) {
11314 PyUnicode_WRITE(okind, odata, o++, '\\');
11315 PyUnicode_WRITE(okind, odata, o++, 'x');
11316 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
11317 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011318 }
11319 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011320 else if (ch >= 0x10000) {
11321 PyUnicode_WRITE(okind, odata, o++, '\\');
11322 PyUnicode_WRITE(okind, odata, o++, 'U');
11323 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 28) & 0xF]);
11324 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 24) & 0xF]);
11325 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 20) & 0xF]);
11326 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 16) & 0xF]);
11327 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
11328 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
11329 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
11330 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011331 }
11332 /* Map 16-bit characters to '\uxxxx' */
11333 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011334 PyUnicode_WRITE(okind, odata, o++, '\\');
11335 PyUnicode_WRITE(okind, odata, o++, 'u');
11336 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
11337 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
11338 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
11339 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011340 }
11341 }
11342 /* Copy characters as-is */
11343 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011344 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011345 }
11346 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000011347 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011348 /* Closing quote already added at the beginning */
Walter Dörwald79e913e2007-05-12 11:08:06 +000011349 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011350}
11351
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011352PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011353 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011354\n\
11355Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011356such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011357arguments start and end are interpreted as in slice notation.\n\
11358\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011359Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011360
11361static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011362unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011363{
Jesus Ceaac451502011-04-20 17:09:23 +020011364 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011365 Py_ssize_t start;
11366 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011367 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011368
Jesus Ceaac451502011-04-20 17:09:23 +020011369 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
11370 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011371 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011372
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011373 if (PyUnicode_READY(self) == -1)
11374 return NULL;
11375 if (PyUnicode_READY(substring) == -1)
11376 return NULL;
11377
11378 result = any_find_slice(
11379 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
11380 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011381 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011382
11383 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011384
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011385 if (result == -2)
11386 return NULL;
11387
Christian Heimes217cfd12007-12-02 14:31:20 +000011388 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011389}
11390
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011391PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011392 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011393\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011394Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011395
11396static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011397unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011398{
Jesus Ceaac451502011-04-20 17:09:23 +020011399 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011400 Py_ssize_t start;
11401 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011402 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011403
Jesus Ceaac451502011-04-20 17:09:23 +020011404 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
11405 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011406 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011407
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011408 if (PyUnicode_READY(self) == -1)
11409 return NULL;
11410 if (PyUnicode_READY(substring) == -1)
11411 return NULL;
11412
11413 result = any_find_slice(
11414 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
11415 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011416 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011417
11418 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011419
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011420 if (result == -2)
11421 return NULL;
11422
Guido van Rossumd57fd912000-03-10 22:53:23 +000011423 if (result < 0) {
11424 PyErr_SetString(PyExc_ValueError, "substring not found");
11425 return NULL;
11426 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011427
Christian Heimes217cfd12007-12-02 14:31:20 +000011428 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011429}
11430
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011431PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011432 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011433\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011434Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011435done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011436
11437static PyObject *
11438unicode_rjust(PyUnicodeObject *self, PyObject *args)
11439{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011440 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011441 Py_UCS4 fillchar = ' ';
11442
Victor Stinnere9a29352011-10-01 02:14:59 +020011443 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011444 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011445
Victor Stinnere9a29352011-10-01 02:14:59 +020011446 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011447 return NULL;
11448
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011449 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011450 Py_INCREF(self);
11451 return (PyObject*) self;
11452 }
11453
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011454 return (PyObject*) pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011455}
11456
Alexander Belopolsky40018472011-02-26 01:02:56 +000011457PyObject *
11458PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011459{
11460 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000011461
Guido van Rossumd57fd912000-03-10 22:53:23 +000011462 s = PyUnicode_FromObject(s);
11463 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011464 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011465 if (sep != NULL) {
11466 sep = PyUnicode_FromObject(sep);
11467 if (sep == NULL) {
11468 Py_DECREF(s);
11469 return NULL;
11470 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011471 }
11472
11473 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
11474
11475 Py_DECREF(s);
11476 Py_XDECREF(sep);
11477 return result;
11478}
11479
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011480PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011481 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011482\n\
11483Return a list of the words in S, using sep as the\n\
11484delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000011485splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000011486whitespace string is a separator and empty strings are\n\
11487removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011488
11489static PyObject*
11490unicode_split(PyUnicodeObject *self, PyObject *args)
11491{
11492 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011493 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011494
Martin v. Löwis18e16552006-02-15 17:27:45 +000011495 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011496 return NULL;
11497
11498 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011499 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011500 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000011501 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011502 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011503 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011504}
11505
Thomas Wouters477c8d52006-05-27 19:21:47 +000011506PyObject *
11507PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
11508{
11509 PyObject* str_obj;
11510 PyObject* sep_obj;
11511 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011512 int kind1, kind2, kind;
11513 void *buf1 = NULL, *buf2 = NULL;
11514 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011515
11516 str_obj = PyUnicode_FromObject(str_in);
Victor Stinnere9a29352011-10-01 02:14:59 +020011517 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011518 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011519 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011520 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000011521 Py_DECREF(str_obj);
11522 return NULL;
11523 }
11524
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011525 kind1 = PyUnicode_KIND(str_in);
11526 kind2 = PyUnicode_KIND(sep_obj);
11527 kind = kind1 > kind2 ? kind1 : kind2;
11528 buf1 = PyUnicode_DATA(str_in);
11529 if (kind1 != kind)
11530 buf1 = _PyUnicode_AsKind(str_in, kind);
11531 if (!buf1)
11532 goto onError;
11533 buf2 = PyUnicode_DATA(sep_obj);
11534 if (kind2 != kind)
11535 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11536 if (!buf2)
11537 goto onError;
11538 len1 = PyUnicode_GET_LENGTH(str_obj);
11539 len2 = PyUnicode_GET_LENGTH(sep_obj);
11540
11541 switch(PyUnicode_KIND(str_in)) {
11542 case PyUnicode_1BYTE_KIND:
11543 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11544 break;
11545 case PyUnicode_2BYTE_KIND:
11546 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11547 break;
11548 case PyUnicode_4BYTE_KIND:
11549 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11550 break;
11551 default:
11552 assert(0);
11553 out = 0;
11554 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011555
11556 Py_DECREF(sep_obj);
11557 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011558 if (kind1 != kind)
11559 PyMem_Free(buf1);
11560 if (kind2 != kind)
11561 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011562
11563 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011564 onError:
11565 Py_DECREF(sep_obj);
11566 Py_DECREF(str_obj);
11567 if (kind1 != kind && buf1)
11568 PyMem_Free(buf1);
11569 if (kind2 != kind && buf2)
11570 PyMem_Free(buf2);
11571 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011572}
11573
11574
11575PyObject *
11576PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
11577{
11578 PyObject* str_obj;
11579 PyObject* sep_obj;
11580 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011581 int kind1, kind2, kind;
11582 void *buf1 = NULL, *buf2 = NULL;
11583 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011584
11585 str_obj = PyUnicode_FromObject(str_in);
11586 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000011587 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011588 sep_obj = PyUnicode_FromObject(sep_in);
11589 if (!sep_obj) {
11590 Py_DECREF(str_obj);
11591 return NULL;
11592 }
11593
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011594 kind1 = PyUnicode_KIND(str_in);
11595 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020011596 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011597 buf1 = PyUnicode_DATA(str_in);
11598 if (kind1 != kind)
11599 buf1 = _PyUnicode_AsKind(str_in, kind);
11600 if (!buf1)
11601 goto onError;
11602 buf2 = PyUnicode_DATA(sep_obj);
11603 if (kind2 != kind)
11604 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11605 if (!buf2)
11606 goto onError;
11607 len1 = PyUnicode_GET_LENGTH(str_obj);
11608 len2 = PyUnicode_GET_LENGTH(sep_obj);
11609
11610 switch(PyUnicode_KIND(str_in)) {
11611 case PyUnicode_1BYTE_KIND:
11612 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11613 break;
11614 case PyUnicode_2BYTE_KIND:
11615 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11616 break;
11617 case PyUnicode_4BYTE_KIND:
11618 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11619 break;
11620 default:
11621 assert(0);
11622 out = 0;
11623 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011624
11625 Py_DECREF(sep_obj);
11626 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011627 if (kind1 != kind)
11628 PyMem_Free(buf1);
11629 if (kind2 != kind)
11630 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011631
11632 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011633 onError:
11634 Py_DECREF(sep_obj);
11635 Py_DECREF(str_obj);
11636 if (kind1 != kind && buf1)
11637 PyMem_Free(buf1);
11638 if (kind2 != kind && buf2)
11639 PyMem_Free(buf2);
11640 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011641}
11642
11643PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011644 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011645\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011646Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011647the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011648found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011649
11650static PyObject*
11651unicode_partition(PyUnicodeObject *self, PyObject *separator)
11652{
11653 return PyUnicode_Partition((PyObject *)self, separator);
11654}
11655
11656PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000011657 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011658\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011659Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011660the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011661separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011662
11663static PyObject*
11664unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
11665{
11666 return PyUnicode_RPartition((PyObject *)self, separator);
11667}
11668
Alexander Belopolsky40018472011-02-26 01:02:56 +000011669PyObject *
11670PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011671{
11672 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011673
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011674 s = PyUnicode_FromObject(s);
11675 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011676 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011677 if (sep != NULL) {
11678 sep = PyUnicode_FromObject(sep);
11679 if (sep == NULL) {
11680 Py_DECREF(s);
11681 return NULL;
11682 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011683 }
11684
11685 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
11686
11687 Py_DECREF(s);
11688 Py_XDECREF(sep);
11689 return result;
11690}
11691
11692PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011693 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011694\n\
11695Return a list of the words in S, using sep as the\n\
11696delimiter string, starting at the end of the string and\n\
11697working to the front. If maxsplit is given, at most maxsplit\n\
11698splits are done. If sep is not specified, any whitespace string\n\
11699is a separator.");
11700
11701static PyObject*
11702unicode_rsplit(PyUnicodeObject *self, PyObject *args)
11703{
11704 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011705 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011706
Martin v. Löwis18e16552006-02-15 17:27:45 +000011707 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011708 return NULL;
11709
11710 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011711 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011712 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000011713 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011714 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011715 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011716}
11717
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011718PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011719 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011720\n\
11721Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000011722Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011723is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011724
11725static PyObject*
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011726unicode_splitlines(PyUnicodeObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011727{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011728 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000011729 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011730
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011731 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
11732 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011733 return NULL;
11734
Guido van Rossum86662912000-04-11 15:38:46 +000011735 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011736}
11737
11738static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000011739PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011740{
Walter Dörwald346737f2007-05-31 10:44:43 +000011741 if (PyUnicode_CheckExact(self)) {
11742 Py_INCREF(self);
11743 return self;
11744 } else
11745 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinner034f6cf2011-09-30 02:26:44 +020011746 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011747}
11748
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011749PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011750 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011751\n\
11752Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011753and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011754
11755static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011756unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011757{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011758 return fixup(self, fixswapcase);
11759}
11760
Georg Brandlceee0772007-11-27 23:48:05 +000011761PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011762 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011763\n\
11764Return a translation table usable for str.translate().\n\
11765If there is only one argument, it must be a dictionary mapping Unicode\n\
11766ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011767Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011768If there are two arguments, they must be strings of equal length, and\n\
11769in the resulting dictionary, each character in x will be mapped to the\n\
11770character at the same position in y. If there is a third argument, it\n\
11771must be a string, whose characters will be mapped to None in the result.");
11772
11773static PyObject*
11774unicode_maketrans(PyUnicodeObject *null, PyObject *args)
11775{
11776 PyObject *x, *y = NULL, *z = NULL;
11777 PyObject *new = NULL, *key, *value;
11778 Py_ssize_t i = 0;
11779 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011780
Georg Brandlceee0772007-11-27 23:48:05 +000011781 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
11782 return NULL;
11783 new = PyDict_New();
11784 if (!new)
11785 return NULL;
11786 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011787 int x_kind, y_kind, z_kind;
11788 void *x_data, *y_data, *z_data;
11789
Georg Brandlceee0772007-11-27 23:48:05 +000011790 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000011791 if (!PyUnicode_Check(x)) {
11792 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
11793 "be a string if there is a second argument");
11794 goto err;
11795 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011796 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011797 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
11798 "arguments must have equal length");
11799 goto err;
11800 }
11801 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011802 x_kind = PyUnicode_KIND(x);
11803 y_kind = PyUnicode_KIND(y);
11804 x_data = PyUnicode_DATA(x);
11805 y_data = PyUnicode_DATA(y);
11806 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
11807 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
11808 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011809 if (!key || !value)
11810 goto err;
11811 res = PyDict_SetItem(new, key, value);
11812 Py_DECREF(key);
11813 Py_DECREF(value);
11814 if (res < 0)
11815 goto err;
11816 }
11817 /* create entries for deleting chars in z */
11818 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011819 z_kind = PyUnicode_KIND(z);
11820 z_data = PyUnicode_DATA(z);
Georg Brandlceee0772007-11-27 23:48:05 +000011821 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011822 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011823 if (!key)
11824 goto err;
11825 res = PyDict_SetItem(new, key, Py_None);
11826 Py_DECREF(key);
11827 if (res < 0)
11828 goto err;
11829 }
11830 }
11831 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011832 int kind;
11833 void *data;
11834
Georg Brandlceee0772007-11-27 23:48:05 +000011835 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000011836 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011837 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
11838 "to maketrans it must be a dict");
11839 goto err;
11840 }
11841 /* copy entries into the new dict, converting string keys to int keys */
11842 while (PyDict_Next(x, &i, &key, &value)) {
11843 if (PyUnicode_Check(key)) {
11844 /* convert string keys to integer keys */
11845 PyObject *newkey;
11846 if (PyUnicode_GET_SIZE(key) != 1) {
11847 PyErr_SetString(PyExc_ValueError, "string keys in translate "
11848 "table must be of length 1");
11849 goto err;
11850 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011851 kind = PyUnicode_KIND(key);
11852 data = PyUnicode_DATA(key);
11853 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000011854 if (!newkey)
11855 goto err;
11856 res = PyDict_SetItem(new, newkey, value);
11857 Py_DECREF(newkey);
11858 if (res < 0)
11859 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000011860 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011861 /* just keep integer keys */
11862 if (PyDict_SetItem(new, key, value) < 0)
11863 goto err;
11864 } else {
11865 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
11866 "be strings or integers");
11867 goto err;
11868 }
11869 }
11870 }
11871 return new;
11872 err:
11873 Py_DECREF(new);
11874 return NULL;
11875}
11876
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011877PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011878 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011879\n\
11880Return a copy of the string S, where all characters have been mapped\n\
11881through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011882Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000011883Unmapped characters are left untouched. Characters mapped to None\n\
11884are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011885
11886static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011887unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011888{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011889 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011890}
11891
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011892PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011893 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011894\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011895Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011896
11897static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011898unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011899{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011900 return fixup(self, fixupper);
11901}
11902
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011903PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011904 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011905\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000011906Pad a numeric string S with zeros on the left, to fill a field\n\
11907of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011908
11909static PyObject *
11910unicode_zfill(PyUnicodeObject *self, PyObject *args)
11911{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011912 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011913 PyUnicodeObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011914 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011915 int kind;
11916 void *data;
11917 Py_UCS4 chr;
11918
11919 if (PyUnicode_READY(self) == -1)
11920 return NULL;
11921
Martin v. Löwis18e16552006-02-15 17:27:45 +000011922 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011923 return NULL;
11924
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011925 if (PyUnicode_GET_LENGTH(self) >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +000011926 if (PyUnicode_CheckExact(self)) {
11927 Py_INCREF(self);
11928 return (PyObject*) self;
11929 }
11930 else
Victor Stinner2219e0a2011-10-01 01:16:59 +020011931 return PyUnicode_Copy((PyObject*)self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011932 }
11933
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011934 fill = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011935
11936 u = pad(self, fill, 0, '0');
11937
Walter Dörwald068325e2002-04-15 13:36:47 +000011938 if (u == NULL)
11939 return NULL;
11940
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011941 kind = PyUnicode_KIND(u);
11942 data = PyUnicode_DATA(u);
11943 chr = PyUnicode_READ(kind, data, fill);
11944
11945 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011946 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011947 PyUnicode_WRITE(kind, data, 0, chr);
11948 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000011949 }
11950
11951 return (PyObject*) u;
11952}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011953
#if 0
/* Disabled debugging helper: forwards to
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
11961
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011962PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011963 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011964\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000011965Return True if S starts with the specified prefix, False otherwise.\n\
11966With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011967With optional end, stop comparing S at that position.\n\
11968prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011969
11970static PyObject *
11971unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000011972 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011973{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011974 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011975 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011976 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011977 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011978 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011979
Jesus Ceaac451502011-04-20 17:09:23 +020011980 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011981 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011982 if (PyTuple_Check(subobj)) {
11983 Py_ssize_t i;
11984 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
11985 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000011986 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011987 if (substring == NULL)
11988 return NULL;
11989 result = tailmatch(self, substring, start, end, -1);
11990 Py_DECREF(substring);
11991 if (result) {
11992 Py_RETURN_TRUE;
11993 }
11994 }
11995 /* nothing matched */
11996 Py_RETURN_FALSE;
11997 }
11998 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030011999 if (substring == NULL) {
12000 if (PyErr_ExceptionMatches(PyExc_TypeError))
12001 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12002 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012003 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012004 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012005 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012006 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012007 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012008}
12009
12010
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012011PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012012 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012013\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012014Return True if S ends with the specified suffix, False otherwise.\n\
12015With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012016With optional end, stop comparing S at that position.\n\
12017suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012018
12019static PyObject *
12020unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012021 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012022{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012023 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012024 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012025 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012026 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012027 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012028
Jesus Ceaac451502011-04-20 17:09:23 +020012029 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012030 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012031 if (PyTuple_Check(subobj)) {
12032 Py_ssize_t i;
12033 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12034 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012035 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012036 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012037 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012038 result = tailmatch(self, substring, start, end, +1);
12039 Py_DECREF(substring);
12040 if (result) {
12041 Py_RETURN_TRUE;
12042 }
12043 }
12044 Py_RETURN_FALSE;
12045 }
12046 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012047 if (substring == NULL) {
12048 if (PyErr_ExceptionMatches(PyExc_TypeError))
12049 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12050 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012051 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012052 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012053 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012054 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012055 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012056}
12057
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012058#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000012059
12060PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012061 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012062\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012063Return a formatted version of S, using substitutions from args and kwargs.\n\
12064The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000012065
Eric Smith27bbca62010-11-04 17:06:58 +000012066PyDoc_STRVAR(format_map__doc__,
12067 "S.format_map(mapping) -> str\n\
12068\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012069Return a formatted version of S, using substitutions from mapping.\n\
12070The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000012071
Eric Smith4a7d76d2008-05-30 18:10:19 +000012072static PyObject *
12073unicode__format__(PyObject* self, PyObject* args)
12074{
12075 PyObject *format_spec;
12076
12077 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12078 return NULL;
12079
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012080 return _PyUnicode_FormatAdvanced(self, format_spec, 0,
12081 PyUnicode_GET_LENGTH(format_spec));
Eric Smith4a7d76d2008-05-30 18:10:19 +000012082}
12083
Eric Smith8c663262007-08-25 02:26:07 +000012084PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012085 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012086\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012087Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000012088
12089static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012090unicode__sizeof__(PyUnicodeObject *v)
12091{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012092 Py_ssize_t size;
12093
12094 /* If it's a compact object, account for base structure +
12095 character data. */
12096 if (PyUnicode_IS_COMPACT_ASCII(v))
12097 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12098 else if (PyUnicode_IS_COMPACT(v))
12099 size = sizeof(PyCompactUnicodeObject) +
12100 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_CHARACTER_SIZE(v);
12101 else {
12102 /* If it is a two-block object, account for base object, and
12103 for character block if present. */
12104 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020012105 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012106 size += (PyUnicode_GET_LENGTH(v) + 1) *
12107 PyUnicode_CHARACTER_SIZE(v);
12108 }
12109 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020012110 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020012111 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012112 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020012113 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020012114 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012115
12116 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012117}
12118
12119PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012120 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012121
12122static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020012123unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012124{
Victor Stinner034f6cf2011-09-30 02:26:44 +020012125 PyObject *copy = PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012126 if (!copy)
12127 return NULL;
12128 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012129}
12130
Guido van Rossumd57fd912000-03-10 22:53:23 +000012131static PyMethodDef unicode_methods[] = {
12132
12133 /* Order is according to common usage: often used methods should
12134 appear first, since lookup is done sequentially. */
12135
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000012136 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012137 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
12138 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012139 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012140 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
12141 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
12142 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
12143 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
12144 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
12145 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
12146 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012147 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012148 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
12149 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
12150 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012151 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012152 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
12153 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
12154 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012155 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012156 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012157 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012158 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012159 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
12160 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
12161 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
12162 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
12163 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
12164 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
12165 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
12166 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
12167 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
12168 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
12169 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
12170 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
12171 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
12172 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000012173 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000012174 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012175 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000012176 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000012177 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000012178 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000012179 {"maketrans", (PyCFunction) unicode_maketrans,
12180 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012181 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000012182#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012183 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012184#endif
12185
12186#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012187 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012188 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012189#endif
12190
Benjamin Peterson14339b62009-01-31 16:36:08 +000012191 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012192 {NULL, NULL}
12193};
12194
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012195static PyObject *
12196unicode_mod(PyObject *v, PyObject *w)
12197{
Brian Curtindfc80e32011-08-10 20:28:54 -050012198 if (!PyUnicode_Check(v))
12199 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000012200 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012201}
12202
12203static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012204 0, /*nb_add*/
12205 0, /*nb_subtract*/
12206 0, /*nb_multiply*/
12207 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012208};
12209
Guido van Rossumd57fd912000-03-10 22:53:23 +000012210static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012211 (lenfunc) unicode_length, /* sq_length */
12212 PyUnicode_Concat, /* sq_concat */
12213 (ssizeargfunc) unicode_repeat, /* sq_repeat */
12214 (ssizeargfunc) unicode_getitem, /* sq_item */
12215 0, /* sq_slice */
12216 0, /* sq_ass_item */
12217 0, /* sq_ass_slice */
12218 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012219};
12220
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012221static PyObject*
12222unicode_subscript(PyUnicodeObject* self, PyObject* item)
12223{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012224 if (PyUnicode_READY(self) == -1)
12225 return NULL;
12226
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000012227 if (PyIndex_Check(item)) {
12228 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012229 if (i == -1 && PyErr_Occurred())
12230 return NULL;
12231 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012232 i += PyUnicode_GET_LENGTH(self);
Victor Stinner2fe5ced2011-10-02 00:25:40 +020012233 return unicode_getitem((PyObject*)self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012234 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000012235 Py_ssize_t start, stop, step, slicelength, cur, i;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012236 const Py_UNICODE* source_buf;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012237 Py_UNICODE* result_buf;
12238 PyObject* result;
12239
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012240 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000012241 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012242 return NULL;
12243 }
12244
12245 if (slicelength <= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012246 return PyUnicode_New(0, 0);
12247 } else if (start == 0 && step == 1 &&
12248 slicelength == PyUnicode_GET_LENGTH(self) &&
Thomas Woutersed03b412007-08-28 21:37:11 +000012249 PyUnicode_CheckExact(self)) {
12250 Py_INCREF(self);
12251 return (PyObject *)self;
12252 } else if (step == 1) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012253 return PyUnicode_Substring((PyObject*)self,
12254 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012255 } else {
12256 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +000012257 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
12258 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012259
Benjamin Peterson29060642009-01-31 22:14:21 +000012260 if (result_buf == NULL)
12261 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012262
12263 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
12264 result_buf[i] = source_buf[cur];
12265 }
Tim Petersced69f82003-09-16 20:30:58 +000012266
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012267 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +000012268 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012269 return result;
12270 }
12271 } else {
12272 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
12273 return NULL;
12274 }
12275}
12276
12277static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012278 (lenfunc)unicode_length, /* mp_length */
12279 (binaryfunc)unicode_subscript, /* mp_subscript */
12280 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012281};
12282
Guido van Rossumd57fd912000-03-10 22:53:23 +000012283
Guido van Rossumd57fd912000-03-10 22:53:23 +000012284/* Helpers for PyUnicode_Format() */
12285
12286static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000012287getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012288{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012289 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012290 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012291 (*p_argidx)++;
12292 if (arglen < 0)
12293 return args;
12294 else
12295 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012296 }
12297 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012298 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012299 return NULL;
12300}
12301
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012302/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012303
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012304static PyObject *
12305formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012306{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012307 char *p;
12308 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012309 double x;
Tim Petersced69f82003-09-16 20:30:58 +000012310
Guido van Rossumd57fd912000-03-10 22:53:23 +000012311 x = PyFloat_AsDouble(v);
12312 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012313 return NULL;
12314
Guido van Rossumd57fd912000-03-10 22:53:23 +000012315 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012316 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000012317
Eric Smith0923d1d2009-04-16 20:16:10 +000012318 p = PyOS_double_to_string(x, type, prec,
12319 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012320 if (p == NULL)
12321 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012322 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000012323 PyMem_Free(p);
12324 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012325}
12326
Tim Peters38fd5b62000-09-21 05:43:11 +000012327static PyObject*
12328formatlong(PyObject *val, int flags, int prec, int type)
12329{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012330 char *buf;
12331 int len;
12332 PyObject *str; /* temporary string object. */
12333 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000012334
Benjamin Peterson14339b62009-01-31 16:36:08 +000012335 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
12336 if (!str)
12337 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012338 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012339 Py_DECREF(str);
12340 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000012341}
12342
Guido van Rossumd57fd912000-03-10 22:53:23 +000012343static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012344formatchar(Py_UCS4 *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012345 size_t buflen,
12346 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012347{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000012348 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012349 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012350 if (PyUnicode_GET_LENGTH(v) == 1) {
12351 buf[0] = PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000012352 buf[1] = '\0';
12353 return 1;
12354 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012355 goto onError;
12356 }
12357 else {
12358 /* Integer input truncated to a character */
12359 long x;
12360 x = PyLong_AsLong(v);
12361 if (x == -1 && PyErr_Occurred())
12362 goto onError;
12363
12364 if (x < 0 || x > 0x10ffff) {
12365 PyErr_SetString(PyExc_OverflowError,
12366 "%c arg not in range(0x110000)");
12367 return -1;
12368 }
12369
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012370 buf[0] = (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012371 buf[1] = '\0';
12372 return 1;
12373 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000012374
Benjamin Peterson29060642009-01-31 22:14:21 +000012375 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012376 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012377 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012378 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012379}
12380
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012381/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012382 FORMATBUFLEN is the length of the buffer in which chars are formatted.
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012383*/
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012384#define FORMATBUFLEN (size_t)10
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012385
Alexander Belopolsky40018472011-02-26 01:02:56 +000012386PyObject *
12387PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012388{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012389 void *fmt;
12390 int fmtkind;
12391 PyObject *result;
12392 Py_UCS4 *res, *res0;
12393 Py_UCS4 max;
12394 int kind;
12395 Py_ssize_t fmtcnt, fmtpos, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012396 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012397 PyObject *dict = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012398 PyUnicodeObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +000012399
Guido van Rossumd57fd912000-03-10 22:53:23 +000012400 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012401 PyErr_BadInternalCall();
12402 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012403 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012404 uformat = (PyUnicodeObject*)PyUnicode_FromObject(format);
12405 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012406 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012407 fmt = PyUnicode_DATA(uformat);
12408 fmtkind = PyUnicode_KIND(uformat);
12409 fmtcnt = PyUnicode_GET_LENGTH(uformat);
12410 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012411
12412 reslen = rescnt = fmtcnt + 100;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012413 res = res0 = PyMem_Malloc(reslen * sizeof(Py_UCS4));
12414 if (res0 == NULL) {
12415 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012416 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012417 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012418
12419 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012420 arglen = PyTuple_Size(args);
12421 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012422 }
12423 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012424 arglen = -1;
12425 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012426 }
Christian Heimes90aa7642007-12-19 02:45:37 +000012427 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000012428 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000012429 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012430
12431 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012432 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012433 if (--rescnt < 0) {
12434 rescnt = fmtcnt + 100;
12435 reslen += rescnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012436 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12437 if (res0 == NULL){
12438 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012439 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012440 }
12441 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000012442 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012443 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012444 *res++ = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012445 }
12446 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012447 /* Got a format specifier */
12448 int flags = 0;
12449 Py_ssize_t width = -1;
12450 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012451 Py_UCS4 c = '\0';
12452 Py_UCS4 fill;
Benjamin Peterson29060642009-01-31 22:14:21 +000012453 int isnumok;
12454 PyObject *v = NULL;
12455 PyObject *temp = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012456 void *pbuf;
12457 Py_ssize_t pindex;
Benjamin Peterson29060642009-01-31 22:14:21 +000012458 Py_UNICODE sign;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012459 Py_ssize_t len, len1;
12460 Py_UCS4 formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012461
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012462 fmtpos++;
12463 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
12464 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000012465 Py_ssize_t keylen;
12466 PyObject *key;
12467 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000012468
Benjamin Peterson29060642009-01-31 22:14:21 +000012469 if (dict == NULL) {
12470 PyErr_SetString(PyExc_TypeError,
12471 "format requires a mapping");
12472 goto onError;
12473 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012474 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012475 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012476 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012477 /* Skip over balanced parentheses */
12478 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012479 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000012480 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012481 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000012482 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012483 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000012484 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012485 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000012486 if (fmtcnt < 0 || pcount > 0) {
12487 PyErr_SetString(PyExc_ValueError,
12488 "incomplete format key");
12489 goto onError;
12490 }
Victor Stinner12bab6d2011-10-01 01:53:49 +020012491 key = PyUnicode_Substring((PyObject*)uformat,
12492 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000012493 if (key == NULL)
12494 goto onError;
12495 if (args_owned) {
12496 Py_DECREF(args);
12497 args_owned = 0;
12498 }
12499 args = PyObject_GetItem(dict, key);
12500 Py_DECREF(key);
12501 if (args == NULL) {
12502 goto onError;
12503 }
12504 args_owned = 1;
12505 arglen = -1;
12506 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012507 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012508 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012509 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012510 case '-': flags |= F_LJUST; continue;
12511 case '+': flags |= F_SIGN; continue;
12512 case ' ': flags |= F_BLANK; continue;
12513 case '#': flags |= F_ALT; continue;
12514 case '0': flags |= F_ZERO; continue;
12515 }
12516 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012517 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012518 if (c == '*') {
12519 v = getnextarg(args, arglen, &argidx);
12520 if (v == NULL)
12521 goto onError;
12522 if (!PyLong_Check(v)) {
12523 PyErr_SetString(PyExc_TypeError,
12524 "* wants int");
12525 goto onError;
12526 }
12527 width = PyLong_AsLong(v);
12528 if (width == -1 && PyErr_Occurred())
12529 goto onError;
12530 if (width < 0) {
12531 flags |= F_LJUST;
12532 width = -width;
12533 }
12534 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012535 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012536 }
12537 else if (c >= '0' && c <= '9') {
12538 width = c - '0';
12539 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012540 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012541 if (c < '0' || c > '9')
12542 break;
12543 if ((width*10) / 10 != width) {
12544 PyErr_SetString(PyExc_ValueError,
12545 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000012546 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000012547 }
12548 width = width*10 + (c - '0');
12549 }
12550 }
12551 if (c == '.') {
12552 prec = 0;
12553 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012554 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012555 if (c == '*') {
12556 v = getnextarg(args, arglen, &argidx);
12557 if (v == NULL)
12558 goto onError;
12559 if (!PyLong_Check(v)) {
12560 PyErr_SetString(PyExc_TypeError,
12561 "* wants int");
12562 goto onError;
12563 }
12564 prec = PyLong_AsLong(v);
12565 if (prec == -1 && PyErr_Occurred())
12566 goto onError;
12567 if (prec < 0)
12568 prec = 0;
12569 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012570 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012571 }
12572 else if (c >= '0' && c <= '9') {
12573 prec = c - '0';
12574 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012575 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012576 if (c < '0' || c > '9')
12577 break;
12578 if ((prec*10) / 10 != prec) {
12579 PyErr_SetString(PyExc_ValueError,
12580 "prec too big");
12581 goto onError;
12582 }
12583 prec = prec*10 + (c - '0');
12584 }
12585 }
12586 } /* prec */
12587 if (fmtcnt >= 0) {
12588 if (c == 'h' || c == 'l' || c == 'L') {
12589 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012590 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012591 }
12592 }
12593 if (fmtcnt < 0) {
12594 PyErr_SetString(PyExc_ValueError,
12595 "incomplete format");
12596 goto onError;
12597 }
12598 if (c != '%') {
12599 v = getnextarg(args, arglen, &argidx);
12600 if (v == NULL)
12601 goto onError;
12602 }
12603 sign = 0;
12604 fill = ' ';
12605 switch (c) {
12606
12607 case '%':
12608 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012609 kind = PyUnicode_4BYTE_KIND;
Benjamin Peterson29060642009-01-31 22:14:21 +000012610 /* presume that buffer length is at least 1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012611 PyUnicode_WRITE(kind, pbuf, 0, '%');
Benjamin Peterson29060642009-01-31 22:14:21 +000012612 len = 1;
12613 break;
12614
12615 case 's':
12616 case 'r':
12617 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000012618 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012619 temp = v;
12620 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012621 }
12622 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012623 if (c == 's')
12624 temp = PyObject_Str(v);
12625 else if (c == 'r')
12626 temp = PyObject_Repr(v);
12627 else
12628 temp = PyObject_ASCII(v);
12629 if (temp == NULL)
12630 goto onError;
12631 if (PyUnicode_Check(temp))
12632 /* nothing to do */;
12633 else {
12634 Py_DECREF(temp);
12635 PyErr_SetString(PyExc_TypeError,
12636 "%s argument has non-string str()");
12637 goto onError;
12638 }
12639 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012640 if (PyUnicode_READY(temp) == -1) {
12641 Py_CLEAR(temp);
12642 goto onError;
12643 }
12644 pbuf = PyUnicode_DATA(temp);
12645 kind = PyUnicode_KIND(temp);
12646 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012647 if (prec >= 0 && len > prec)
12648 len = prec;
12649 break;
12650
12651 case 'i':
12652 case 'd':
12653 case 'u':
12654 case 'o':
12655 case 'x':
12656 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000012657 isnumok = 0;
12658 if (PyNumber_Check(v)) {
12659 PyObject *iobj=NULL;
12660
12661 if (PyLong_Check(v)) {
12662 iobj = v;
12663 Py_INCREF(iobj);
12664 }
12665 else {
12666 iobj = PyNumber_Long(v);
12667 }
12668 if (iobj!=NULL) {
12669 if (PyLong_Check(iobj)) {
12670 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070012671 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000012672 Py_DECREF(iobj);
12673 if (!temp)
12674 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012675 if (PyUnicode_READY(temp) == -1) {
12676 Py_CLEAR(temp);
12677 goto onError;
12678 }
12679 pbuf = PyUnicode_DATA(temp);
12680 kind = PyUnicode_KIND(temp);
12681 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012682 sign = 1;
12683 }
12684 else {
12685 Py_DECREF(iobj);
12686 }
12687 }
12688 }
12689 if (!isnumok) {
12690 PyErr_Format(PyExc_TypeError,
12691 "%%%c format: a number is required, "
12692 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
12693 goto onError;
12694 }
12695 if (flags & F_ZERO)
12696 fill = '0';
12697 break;
12698
12699 case 'e':
12700 case 'E':
12701 case 'f':
12702 case 'F':
12703 case 'g':
12704 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012705 temp = formatfloat(v, flags, prec, c);
12706 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000012707 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012708 if (PyUnicode_READY(temp) == -1) {
12709 Py_CLEAR(temp);
12710 goto onError;
12711 }
12712 pbuf = PyUnicode_DATA(temp);
12713 kind = PyUnicode_KIND(temp);
12714 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012715 sign = 1;
12716 if (flags & F_ZERO)
12717 fill = '0';
12718 break;
12719
12720 case 'c':
12721 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012722 kind = PyUnicode_4BYTE_KIND;
Victor Stinnerb9dcffb2011-09-29 00:39:24 +020012723 len = formatchar(pbuf, Py_ARRAY_LENGTH(formatbuf), v);
Benjamin Peterson29060642009-01-31 22:14:21 +000012724 if (len < 0)
12725 goto onError;
12726 break;
12727
12728 default:
12729 PyErr_Format(PyExc_ValueError,
12730 "unsupported format character '%c' (0x%x) "
12731 "at index %zd",
12732 (31<=c && c<=126) ? (char)c : '?',
12733 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012734 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000012735 goto onError;
12736 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012737 /* pbuf is initialized here. */
12738 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000012739 if (sign) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012740 if (PyUnicode_READ(kind, pbuf, pindex) == '-' ||
12741 PyUnicode_READ(kind, pbuf, pindex) == '+') {
12742 sign = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012743 len--;
12744 }
12745 else if (flags & F_SIGN)
12746 sign = '+';
12747 else if (flags & F_BLANK)
12748 sign = ' ';
12749 else
12750 sign = 0;
12751 }
12752 if (width < len)
12753 width = len;
12754 if (rescnt - (sign != 0) < width) {
12755 reslen -= rescnt;
12756 rescnt = width + fmtcnt + 100;
12757 reslen += rescnt;
12758 if (reslen < 0) {
12759 Py_XDECREF(temp);
12760 PyErr_NoMemory();
12761 goto onError;
12762 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012763 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12764 if (res0 == 0) {
12765 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012766 Py_XDECREF(temp);
12767 goto onError;
12768 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012769 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000012770 }
12771 if (sign) {
12772 if (fill != ' ')
12773 *res++ = sign;
12774 rescnt--;
12775 if (width > len)
12776 width--;
12777 }
12778 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012779 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12780 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000012781 if (fill != ' ') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012782 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12783 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012784 }
12785 rescnt -= 2;
12786 width -= 2;
12787 if (width < 0)
12788 width = 0;
12789 len -= 2;
12790 }
12791 if (width > len && !(flags & F_LJUST)) {
12792 do {
12793 --rescnt;
12794 *res++ = fill;
12795 } while (--width > len);
12796 }
12797 if (fill == ' ') {
12798 if (sign)
12799 *res++ = sign;
12800 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012801 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12802 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
12803 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12804 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012805 }
12806 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012807 /* Copy all characters, preserving len */
12808 len1 = len;
12809 while (len1--) {
12810 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12811 rescnt--;
12812 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012813 while (--width >= len) {
12814 --rescnt;
12815 *res++ = ' ';
12816 }
12817 if (dict && (argidx < arglen) && c != '%') {
12818 PyErr_SetString(PyExc_TypeError,
12819 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +000012820 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012821 goto onError;
12822 }
12823 Py_XDECREF(temp);
12824 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012825 } /* until end */
12826 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012827 PyErr_SetString(PyExc_TypeError,
12828 "not all arguments converted during string formatting");
12829 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012830 }
12831
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012832
12833 for (max=0, res = res0; res < res0+reslen-rescnt; res++)
12834 if (*res > max)
12835 max = *res;
12836 result = PyUnicode_New(reslen - rescnt, max);
12837 if (!result)
Benjamin Peterson29060642009-01-31 22:14:21 +000012838 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012839 kind = PyUnicode_KIND(result);
12840 for (res = res0; res < res0+reslen-rescnt; res++)
12841 PyUnicode_WRITE(kind, PyUnicode_DATA(result), res-res0, *res);
12842 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012843 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012844 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012845 }
12846 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012847 return (PyObject *)result;
12848
Benjamin Peterson29060642009-01-31 22:14:21 +000012849 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012850 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012851 Py_DECREF(uformat);
12852 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012853 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012854 }
12855 return NULL;
12856}
12857
Jeremy Hylton938ace62002-07-17 16:30:39 +000012858static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000012859unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
12860
Tim Peters6d6c1a32001-08-02 04:15:00 +000012861static PyObject *
12862unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12863{
Benjamin Peterson29060642009-01-31 22:14:21 +000012864 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012865 static char *kwlist[] = {"object", "encoding", "errors", 0};
12866 char *encoding = NULL;
12867 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000012868
Benjamin Peterson14339b62009-01-31 16:36:08 +000012869 if (type != &PyUnicode_Type)
12870 return unicode_subtype_new(type, args, kwds);
12871 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000012872 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012873 return NULL;
12874 if (x == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012875 return (PyObject *)PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012876 if (encoding == NULL && errors == NULL)
12877 return PyObject_Str(x);
12878 else
Benjamin Peterson29060642009-01-31 22:14:21 +000012879 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000012880}
12881
Guido van Rossume023fe02001-08-30 03:12:59 +000012882static PyObject *
12883unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12884{
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012885 PyUnicodeObject *unicode, *self;
12886 Py_ssize_t length, char_size;
12887 int share_wstr, share_utf8;
12888 unsigned int kind;
12889 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000012890
Benjamin Peterson14339b62009-01-31 16:36:08 +000012891 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012892
12893 unicode = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
12894 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012895 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020012896 assert(_PyUnicode_CHECK(unicode));
Victor Stinner1b4f9ce2011-10-03 13:28:14 +020012897 if (_PyUnicode_READY_REPLACE(&unicode))
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012898 return NULL;
12899
12900 self = (PyUnicodeObject *) type->tp_alloc(type, 0);
12901 if (self == NULL) {
12902 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012903 return NULL;
12904 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012905 kind = PyUnicode_KIND(unicode);
12906 length = PyUnicode_GET_LENGTH(unicode);
12907
12908 _PyUnicode_LENGTH(self) = length;
12909 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
12910 _PyUnicode_STATE(self).interned = 0;
12911 _PyUnicode_STATE(self).kind = kind;
12912 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020012913 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012914 _PyUnicode_STATE(self).ready = 1;
12915 _PyUnicode_WSTR(self) = NULL;
12916 _PyUnicode_UTF8_LENGTH(self) = 0;
12917 _PyUnicode_UTF8(self) = NULL;
12918 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020012919 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012920
12921 share_utf8 = 0;
12922 share_wstr = 0;
12923 if (kind == PyUnicode_1BYTE_KIND) {
12924 char_size = 1;
12925 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
12926 share_utf8 = 1;
12927 }
12928 else if (kind == PyUnicode_2BYTE_KIND) {
12929 char_size = 2;
12930 if (sizeof(wchar_t) == 2)
12931 share_wstr = 1;
12932 }
12933 else {
12934 assert(kind == PyUnicode_4BYTE_KIND);
12935 char_size = 4;
12936 if (sizeof(wchar_t) == 4)
12937 share_wstr = 1;
12938 }
12939
12940 /* Ensure we won't overflow the length. */
12941 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
12942 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012943 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012944 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012945 data = PyObject_MALLOC((length + 1) * char_size);
12946 if (data == NULL) {
12947 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012948 goto onError;
12949 }
12950
Victor Stinnerc3c74152011-10-02 20:39:55 +020012951 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012952 if (share_utf8) {
12953 _PyUnicode_UTF8_LENGTH(self) = length;
12954 _PyUnicode_UTF8(self) = data;
12955 }
12956 if (share_wstr) {
12957 _PyUnicode_WSTR_LENGTH(self) = length;
12958 _PyUnicode_WSTR(self) = (wchar_t *)data;
12959 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012960
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012961 Py_MEMCPY(data, PyUnicode_DATA(unicode),
12962 PyUnicode_KIND_SIZE(kind, length + 1));
12963 Py_DECREF(unicode);
12964 return (PyObject *)self;
12965
12966onError:
12967 Py_DECREF(unicode);
12968 Py_DECREF(self);
12969 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000012970}
12971
/* Docstring of the str type; referenced from the tp_doc slot of
   PyUnicode_Type. */
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012972PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000012973 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000012974\n\
Collin Winterd474ce82007-08-07 19:42:11 +000012975Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000012976encoding defaults to the current default string encoding.\n\
12977errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000012978
/* Forward declaration: unicode_iter() is defined further down but is
   needed for the tp_iter slot of PyUnicode_Type. */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012979static PyObject *unicode_iter(PyObject *seq);
12980
12980
/* Type object for str.  It is subclassable (Py_TPFLAGS_BASETYPE), is
   flagged with Py_TPFLAGS_UNICODE_SUBCLASS, is created through
   unicode_new() and iterated through unicode_iter().  tp_alloc is left 0
   here; PyType_Ready() is called on this object in _PyUnicode_Init(). */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012981PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000012982 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012983 "str", /* tp_name */
12984 sizeof(PyUnicodeObject), /* tp_basicsize */
12985 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012986 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012987 (destructor)unicode_dealloc, /* tp_dealloc */
12988 0, /* tp_print */
12989 0, /* tp_getattr */
12990 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000012991 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012992 unicode_repr, /* tp_repr */
12993 &unicode_as_number, /* tp_as_number */
12994 &unicode_as_sequence, /* tp_as_sequence */
12995 &unicode_as_mapping, /* tp_as_mapping */
12996 (hashfunc) unicode_hash, /* tp_hash*/
12997 0, /* tp_call*/
12998 (reprfunc) unicode_str, /* tp_str */
12999 PyObject_GenericGetAttr, /* tp_getattro */
13000 0, /* tp_setattro */
13001 0, /* tp_as_buffer */
13002 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000013003 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013004 unicode_doc, /* tp_doc */
13005 0, /* tp_traverse */
13006 0, /* tp_clear */
13007 PyUnicode_RichCompare, /* tp_richcompare */
13008 0, /* tp_weaklistoffset */
13009 unicode_iter, /* tp_iter */
13010 0, /* tp_iternext */
13011 unicode_methods, /* tp_methods */
13012 0, /* tp_members */
13013 0, /* tp_getset */
13014 &PyBaseObject_Type, /* tp_base */
13015 0, /* tp_dict */
13016 0, /* tp_descr_get */
13017 0, /* tp_descr_set */
13018 0, /* tp_dictoffset */
13019 0, /* tp_init */
13020 0, /* tp_alloc */
13021 unicode_new, /* tp_new */
13022 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013023};
13024
13025/* Initialize the Unicode implementation */
13026
Thomas Wouters78890102000-07-22 19:25:51 +000013027void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013028{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013029 int i;
13030
Thomas Wouters477c8d52006-05-27 19:21:47 +000013031 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013032 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000013033 0x000A, /* LINE FEED */
13034 0x000D, /* CARRIAGE RETURN */
13035 0x001C, /* FILE SEPARATOR */
13036 0x001D, /* GROUP SEPARATOR */
13037 0x001E, /* RECORD SEPARATOR */
13038 0x0085, /* NEXT LINE */
13039 0x2028, /* LINE SEPARATOR */
13040 0x2029, /* PARAGRAPH SEPARATOR */
13041 };
13042
Fred Drakee4315f52000-05-09 19:53:39 +000013043 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020013044 unicode_empty = PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013045 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013046 Py_FatalError("Can't create empty string");
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013047
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013048 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000013049 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000013050 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013051 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000013052
13053 /* initialize the linebreak bloom filter */
13054 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013055 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020013056 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013057
13058 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013059}
13060
13061/* Finalize the Unicode implementation */
13062
/* Release cached free-list entries.  This implementation keeps no free
   list, so nothing is released and the reported count is always 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int freed = 0;

    return freed;
}
13068
Guido van Rossumd57fd912000-03-10 22:53:23 +000013069void
Thomas Wouters78890102000-07-22 19:25:51 +000013070_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013071{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013072 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013073
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000013074 Py_XDECREF(unicode_empty);
13075 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000013076
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013077 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013078 if (unicode_latin1[i]) {
13079 Py_DECREF(unicode_latin1[i]);
13080 unicode_latin1[i] = NULL;
13081 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013082 }
Christian Heimesa156e092008-02-16 07:38:31 +000013083 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000013084}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000013085
Walter Dörwald16807132007-05-25 13:52:07 +000013086void
13087PyUnicode_InternInPlace(PyObject **p)
13088{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013089 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
13090 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020013091#ifdef Py_DEBUG
13092 assert(s != NULL);
13093 assert(_PyUnicode_CHECK(s));
13094#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000013095 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020013096 return;
13097#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000013098 /* If it's a subclass, we don't really know what putting
13099 it in the interned dict might do. */
13100 if (!PyUnicode_CheckExact(s))
13101 return;
13102 if (PyUnicode_CHECK_INTERNED(s))
13103 return;
Victor Stinner1b4f9ce2011-10-03 13:28:14 +020013104 if (_PyUnicode_READY_REPLACE(p)) {
Victor Stinner4fae54c2011-10-03 02:01:52 +020013105 assert(0 && "PyUnicode_READY fail in PyUnicode_InternInPlace");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013106 return;
13107 }
Victor Stinner1b4f9ce2011-10-03 13:28:14 +020013108 s = (PyUnicodeObject *)(*p);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013109 if (interned == NULL) {
13110 interned = PyDict_New();
13111 if (interned == NULL) {
13112 PyErr_Clear(); /* Don't leave an exception */
13113 return;
13114 }
13115 }
13116 /* It might be that the GetItem call fails even
13117 though the key is present in the dictionary,
13118 namely when this happens during a stack overflow. */
13119 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +000013120 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013121 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000013122
Benjamin Peterson29060642009-01-31 22:14:21 +000013123 if (t) {
13124 Py_INCREF(t);
13125 Py_DECREF(*p);
13126 *p = t;
13127 return;
13128 }
Walter Dörwald16807132007-05-25 13:52:07 +000013129
Benjamin Peterson14339b62009-01-31 16:36:08 +000013130 PyThreadState_GET()->recursion_critical = 1;
13131 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
13132 PyErr_Clear();
13133 PyThreadState_GET()->recursion_critical = 0;
13134 return;
13135 }
13136 PyThreadState_GET()->recursion_critical = 0;
13137 /* The two references in interned are not counted by refcnt.
13138 The deallocator will take care of this */
13139 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013140 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000013141}
13142
13143void
13144PyUnicode_InternImmortal(PyObject **p)
13145{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013146 PyUnicodeObject *u = (PyUnicodeObject *)*p;
13147
Benjamin Peterson14339b62009-01-31 16:36:08 +000013148 PyUnicode_InternInPlace(p);
13149 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013150 _PyUnicode_STATE(u).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013151 Py_INCREF(*p);
13152 }
Walter Dörwald16807132007-05-25 13:52:07 +000013153}
13154
13155PyObject *
13156PyUnicode_InternFromString(const char *cp)
13157{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013158 PyObject *s = PyUnicode_FromString(cp);
13159 if (s == NULL)
13160 return NULL;
13161 PyUnicode_InternInPlace(&s);
13162 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000013163}
13164
Alexander Belopolsky40018472011-02-26 01:02:56 +000013165void
13166_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000013167{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013168 PyObject *keys;
13169 PyUnicodeObject *s;
13170 Py_ssize_t i, n;
13171 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000013172
Benjamin Peterson14339b62009-01-31 16:36:08 +000013173 if (interned == NULL || !PyDict_Check(interned))
13174 return;
13175 keys = PyDict_Keys(interned);
13176 if (keys == NULL || !PyList_Check(keys)) {
13177 PyErr_Clear();
13178 return;
13179 }
Walter Dörwald16807132007-05-25 13:52:07 +000013180
Benjamin Peterson14339b62009-01-31 16:36:08 +000013181 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
13182 detector, interned unicode strings are not forcibly deallocated;
13183 rather, we give them their stolen references back, and then clear
13184 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000013185
Benjamin Peterson14339b62009-01-31 16:36:08 +000013186 n = PyList_GET_SIZE(keys);
13187 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000013188 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013189 for (i = 0; i < n; i++) {
13190 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013191 if (PyUnicode_READY(s) == -1)
13192 fprintf(stderr, "could not ready string\n");
13193 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013194 case SSTATE_NOT_INTERNED:
13195 /* XXX Shouldn't happen */
13196 break;
13197 case SSTATE_INTERNED_IMMORTAL:
13198 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013199 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013200 break;
13201 case SSTATE_INTERNED_MORTAL:
13202 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013203 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013204 break;
13205 default:
13206 Py_FatalError("Inconsistent interned string state.");
13207 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013208 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013209 }
13210 fprintf(stderr, "total size of all interned strings: "
13211 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
13212 "mortal/immortal\n", mortal_size, immortal_size);
13213 Py_DECREF(keys);
13214 PyDict_Clear(interned);
13215 Py_DECREF(interned);
13216 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000013217}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013218
13219
13220/********************* Unicode Iterator **************************/
13221
/* State of a str iterator: it_seq is the string being iterated (a
   reference owned by the iterator) and it_index is the index of the next
   character to hand out. */
13222typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013223 PyObject_HEAD
13224 Py_ssize_t it_index;
13225 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013226} unicodeiterobject;
13227
13228static void
13229unicodeiter_dealloc(unicodeiterobject *it)
13230{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013231 _PyObject_GC_UNTRACK(it);
13232 Py_XDECREF(it->it_seq);
13233 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013234}
13235
13236static int
13237unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
13238{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013239 Py_VISIT(it->it_seq);
13240 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013241}
13242
13243static PyObject *
13244unicodeiter_next(unicodeiterobject *it)
13245{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013246 PyUnicodeObject *seq;
13247 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013248
Benjamin Peterson14339b62009-01-31 16:36:08 +000013249 assert(it != NULL);
13250 seq = it->it_seq;
13251 if (seq == NULL)
13252 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013253 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013254
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013255 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
13256 int kind = PyUnicode_KIND(seq);
13257 void *data = PyUnicode_DATA(seq);
13258 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
13259 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013260 if (item != NULL)
13261 ++it->it_index;
13262 return item;
13263 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013264
Benjamin Peterson14339b62009-01-31 16:36:08 +000013265 Py_DECREF(seq);
13266 it->it_seq = NULL;
13267 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013268}
13269
13270static PyObject *
13271unicodeiter_len(unicodeiterobject *it)
13272{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013273 Py_ssize_t len = 0;
13274 if (it->it_seq)
13275 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
13276 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013277}
13278
/* Docstring shared by the __length_hint__ method below. */
13279PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
13280
/* Method table of the str iterator: only __length_hint__, which takes no
   arguments and is implemented by unicodeiter_len(). */
13281static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013282 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000013283 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000013284 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013285};
13286
/* Type object of the iterator returned by unicode_iter().  Instances are
   GC-tracked (Py_TPFLAGS_HAVE_GC) and are their own iterator
   (PyObject_SelfIter).  Slots after the last one listed are left
   zero-initialized. */
13287PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013288 PyVarObject_HEAD_INIT(&PyType_Type, 0)
13289 "str_iterator", /* tp_name */
13290 sizeof(unicodeiterobject), /* tp_basicsize */
13291 0, /* tp_itemsize */
13292 /* methods */
13293 (destructor)unicodeiter_dealloc, /* tp_dealloc */
13294 0, /* tp_print */
13295 0, /* tp_getattr */
13296 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000013297 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013298 0, /* tp_repr */
13299 0, /* tp_as_number */
13300 0, /* tp_as_sequence */
13301 0, /* tp_as_mapping */
13302 0, /* tp_hash */
13303 0, /* tp_call */
13304 0, /* tp_str */
13305 PyObject_GenericGetAttr, /* tp_getattro */
13306 0, /* tp_setattro */
13307 0, /* tp_as_buffer */
13308 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
13309 0, /* tp_doc */
13310 (traverseproc)unicodeiter_traverse, /* tp_traverse */
13311 0, /* tp_clear */
13312 0, /* tp_richcompare */
13313 0, /* tp_weaklistoffset */
13314 PyObject_SelfIter, /* tp_iter */
13315 (iternextfunc)unicodeiter_next, /* tp_iternext */
13316 unicodeiter_methods, /* tp_methods */
13317 0, /* tp_members */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013318};
13319
13320static PyObject *
13321unicode_iter(PyObject *seq)
13322{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013323 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013324
Benjamin Peterson14339b62009-01-31 16:36:08 +000013325 if (!PyUnicode_Check(seq)) {
13326 PyErr_BadInternalCall();
13327 return NULL;
13328 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013329 if (PyUnicode_READY(seq) == -1)
13330 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013331 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
13332 if (it == NULL)
13333 return NULL;
13334 it->it_index = 0;
13335 Py_INCREF(seq);
13336 it->it_seq = (PyUnicodeObject *)seq;
13337 _PyObject_GC_TRACK(it);
13338 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013339}
13340
/* Include uniops.h twice as a template: first with the Py_UNICODE_ name
   prefix and Py_UNICODE as element type, then with the Py_UCS4_ prefix
   and Py_UCS4.  NOTE(review): the contents of uniops.h are not visible
   here -- presumably it defines one set of helpers per inclusion. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013341#define UNIOP(x) Py_UNICODE_##x
13342#define UNIOP_t Py_UNICODE
13343#include "uniops.h"
13344#undef UNIOP
13345#undef UNIOP_t
13346#define UNIOP(x) Py_UCS4_##x
13347#define UNIOP_t Py_UCS4
13348#include "uniops.h"
13349#undef UNIOP
13350#undef UNIOP_t
Victor Stinner331ea922010-08-10 16:37:20 +000013351
Victor Stinner71133ff2010-09-01 23:43:53 +000013352Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000013353PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000013354{
13355 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
13356 Py_UNICODE *copy;
13357 Py_ssize_t size;
13358
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013359 if (!PyUnicode_Check(unicode)) {
13360 PyErr_BadArgument();
13361 return NULL;
13362 }
Victor Stinner71133ff2010-09-01 23:43:53 +000013363 /* Ensure we won't overflow the size. */
13364 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
13365 PyErr_NoMemory();
13366 return NULL;
13367 }
13368 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
13369 size *= sizeof(Py_UNICODE);
13370 copy = PyMem_Malloc(size);
13371 if (copy == NULL) {
13372 PyErr_NoMemory();
13373 return NULL;
13374 }
13375 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
13376 return copy;
13377}
Martin v. Löwis5b222132007-06-10 09:51:05 +000013378
Georg Brandl66c221e2010-10-14 07:04:07 +000013379/* A _string module, to export formatter_parser and formatter_field_name_split
13380 to the string.Formatter class implemented in Python. */
13381
/* Functions exported by the _string module.  Both entries take exactly
   one argument (METH_O). */
13382static PyMethodDef _string_methods[] = {
13383 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
13384 METH_O, PyDoc_STR("split the argument as a field name")},
13385 {"formatter_parser", (PyCFunction) formatter_parser,
13386 METH_O, PyDoc_STR("parse the argument as a format string")},
/* sentinel */
13387 {NULL, NULL}
13388};
13389
/* Module definition for _string; passed to PyModule_Create() by
   PyInit__string().  The field names in the comments below follow the
   PyModuleDef layout documented in the C API reference. */
13390static struct PyModuleDef _string_module = {
13391 PyModuleDef_HEAD_INIT,
/* m_name */
13392 "_string",
/* m_doc */
13393 PyDoc_STR("string helper module"),
/* m_size */
13394 0,
/* m_methods */
13395 _string_methods,
/* remaining fields unused */
13396 NULL,
13397 NULL,
13398 NULL,
13399 NULL
13400};
13401
13402PyMODINIT_FUNC
13403PyInit__string(void)
13404{
13405 return PyModule_Create(&_string_module);
13406}
13407
13408
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013409#ifdef __cplusplus
13410}
13411#endif