blob: 2ca271f9df9dd9d51770edb2e85121a07d17b028 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000044
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000045#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000046#include <windows.h>
47#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000048
Victor Stinnerce5faf62011-10-05 00:42:43 +020049#ifdef Py_DEBUG
50# define DONT_MAKE_RESULT_READY
51#endif
52
Guido van Rossumd57fd912000-03-10 22:53:23 +000053/* Limit for the Unicode object free list */
54
Christian Heimes2202f872008-02-06 14:31:34 +000055#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000056
57/* Limit for the Unicode object free list stay alive optimization.
58
59 The implementation will keep allocated Unicode memory intact for
60 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000061 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000062
Christian Heimes2202f872008-02-06 14:31:34 +000063 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000064 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000065 malloc()-overhead) bytes of unused garbage.
66
67 Setting the limit to 0 effectively turns the feature off.
68
Guido van Rossumfd4b9572000-04-10 13:51:10 +000069 Note: This is an experimental feature ! If you get core dumps when
70 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000071
72*/
73
Guido van Rossumfd4b9572000-04-10 13:51:10 +000074#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000075
76/* Endianness switches; defaults to little endian */
77
78#ifdef WORDS_BIGENDIAN
79# define BYTEORDER_IS_BIG_ENDIAN
80#else
81# define BYTEORDER_IS_LITTLE_ENDIAN
82#endif
83
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000084/* --- Globals ------------------------------------------------------------
85
86 The globals are initialized by the _PyUnicode_Init() API and should
87 not be used before calling that API.
88
89*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
Victor Stinner910337b2011-10-03 03:20:16 +020096#ifdef Py_DEBUG
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020097# define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
Victor Stinner910337b2011-10-03 03:20:16 +020098#else
99# define _PyUnicode_CHECK(op) PyUnicode_Check(op)
100#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +0200101
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200102#define _PyUnicode_UTF8(op) \
103 (((PyCompactUnicodeObject*)(op))->utf8)
104#define PyUnicode_UTF8(op) \
Victor Stinner910337b2011-10-03 03:20:16 +0200105 (assert(_PyUnicode_CHECK(op)), \
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200106 assert(PyUnicode_IS_READY(op)), \
107 PyUnicode_IS_COMPACT_ASCII(op) ? \
108 ((char*)((PyASCIIObject*)(op) + 1)) : \
109 _PyUnicode_UTF8(op))
Victor Stinnerbc8b81b2011-09-29 19:31:34 +0200110#define _PyUnicode_UTF8_LENGTH(op) \
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200111 (((PyCompactUnicodeObject*)(op))->utf8_length)
112#define PyUnicode_UTF8_LENGTH(op) \
Victor Stinner910337b2011-10-03 03:20:16 +0200113 (assert(_PyUnicode_CHECK(op)), \
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200114 assert(PyUnicode_IS_READY(op)), \
115 PyUnicode_IS_COMPACT_ASCII(op) ? \
116 ((PyASCIIObject*)(op))->length : \
117 _PyUnicode_UTF8_LENGTH(op))
Victor Stinnera5f91632011-10-04 01:07:11 +0200118#define _PyUnicode_WSTR(op) \
119 (((PyASCIIObject*)(op))->wstr)
120#define _PyUnicode_WSTR_LENGTH(op) \
121 (((PyCompactUnicodeObject*)(op))->wstr_length)
122#define _PyUnicode_LENGTH(op) \
123 (((PyASCIIObject *)(op))->length)
124#define _PyUnicode_STATE(op) \
125 (((PyASCIIObject *)(op))->state)
126#define _PyUnicode_HASH(op) \
127 (((PyASCIIObject *)(op))->hash)
Victor Stinner910337b2011-10-03 03:20:16 +0200128#define _PyUnicode_KIND(op) \
129 (assert(_PyUnicode_CHECK(op)), \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200130 ((PyASCIIObject *)(op))->state.kind)
Victor Stinner910337b2011-10-03 03:20:16 +0200131#define _PyUnicode_GET_LENGTH(op) \
132 (assert(_PyUnicode_CHECK(op)), \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200133 ((PyASCIIObject *)(op))->length)
Victor Stinnera5f91632011-10-04 01:07:11 +0200134#define _PyUnicode_DATA_ANY(op) \
135 (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200136
Victor Stinner910337b2011-10-03 03:20:16 +0200137#undef PyUnicode_READY
138#define PyUnicode_READY(op) \
139 (assert(_PyUnicode_CHECK(op)), \
140 (PyUnicode_IS_READY(op) ? \
Victor Stinnera5f91632011-10-04 01:07:11 +0200141 0 : \
142 _PyUnicode_Ready((PyObject *)(op))))
Victor Stinner910337b2011-10-03 03:20:16 +0200143
/* Make sure the Unicode object referenced by *p_obj is in the "ready"
   state.  Evaluates to 0 when the object is already ready; otherwise
   evaluates to the result of _PyUnicode_ReadyReplace(), which receives
   the pointer-to-pointer itself.

   p_obj is expanded several times, so it must not have side effects. */
#define _PyUnicode_READY_REPLACE(p_obj)                         \
    (assert(_PyUnicode_CHECK(*(p_obj))),                        \
     (PyUnicode_IS_READY(*(p_obj)) ?                            \
      0 : _PyUnicode_ReadyReplace((PyObject **)(p_obj))))
148
Victor Stinnerc379ead2011-10-03 12:52:27 +0200149#define _PyUnicode_SHARE_UTF8(op) \
150 (assert(_PyUnicode_CHECK(op)), \
151 assert(!PyUnicode_IS_COMPACT_ASCII(op)), \
152 (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* true if the wstr pointer of the Unicode object is the same pointer as
   its character data (PyUnicode_DATA), i.e. wstr is not a separate
   buffer */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
156
Victor Stinner829c0ad2011-10-03 01:08:02 +0200157/* true if the Unicode object has an allocated UTF-8 memory block
158 (not shared with other data) */
Victor Stinner910337b2011-10-03 03:20:16 +0200159#define _PyUnicode_HAS_UTF8_MEMORY(op) \
160 (assert(_PyUnicode_CHECK(op)), \
161 (!PyUnicode_IS_COMPACT_ASCII(op) \
162 && _PyUnicode_UTF8(op) \
Victor Stinner829c0ad2011-10-03 01:08:02 +0200163 && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))
164
Victor Stinner03490912011-10-03 23:45:12 +0200165/* true if the Unicode object has an allocated wstr memory block
166 (not shared with other data) */
167#define _PyUnicode_HAS_WSTR_MEMORY(op) \
168 (assert(_PyUnicode_CHECK(op)), \
169 (_PyUnicode_WSTR(op) && \
170 (!PyUnicode_IS_READY(op) || \
171 _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
172
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   Each of begin, end and to is evaluated exactly once.  The first loop
   handles four characters per iteration; the second loop converts the
   remaining 0-3 characters.  The caller must provide room for
   (end - begin) characters at to. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        /* parenthesize `to` so that pointer arithmetic in the   \
           argument is done before the cast, not after it */    \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + (_n & ~ (Py_ssize_t) 3);            \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200196
Victor Stinnerb15d4d82011-09-28 23:59:20 +0200197/* The Unicode string has been modified: reset the hash */
198#define _PyUnicode_DIRTY(op) do { _PyUnicode_HASH(op) = -1; } while (0)
199
Walter Dörwald16807132007-05-25 13:52:07 +0000200/* This dictionary holds all interned unicode strings. Note that references
201 to strings in this dictionary are *not* counted in the string's ob_refcnt.
202 When the interned string reaches a refcnt of 0 the string deallocation
203 function will delete the reference from this dictionary.
204
   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000207*/
208static PyObject *interned;
209
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000210/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200211static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000212
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200213/* List of static strings. */
214static _Py_Identifier *static_strings;
215
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000216/* Single character Unicode strings in the Latin-1 range are being
217 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200218static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000219
/* Fast detection of the most frequent whitespace characters.

   Lookup table indexed by code point (0x00-0x7F); an entry is 1 when the
   character counts as whitespace and 0 otherwise.  Entries set to 1:

     0x0009 CHARACTER TABULATION    0x001C FILE SEPARATOR
     0x000A LINE FEED               0x001D GROUP SEPARATOR
     0x000B LINE TABULATION         0x001E RECORD SEPARATOR
     0x000C FORM FEED               0x001F UNIT SEPARATOR
     0x000D CARRIAGE RETURN         0x0020 SPACE
*/
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F */ 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F */ 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27 */ 1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x2F */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x37 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x38 - 0x3F */ 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x47 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x48 - 0x4F */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x57 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x58 - 0x5F */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 - 0x67 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x68 - 0x6F */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x77 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x78 - 0x7F */ 0, 0, 0, 0, 0, 0, 0, 0
};
250
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200251/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200252static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200253static PyObject* get_latin1_char(unsigned char ch);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200254static void copy_characters(
255 PyObject *to, Py_ssize_t to_start,
256 PyObject *from, Py_ssize_t from_start,
257 Py_ssize_t how_many);
Victor Stinnerc729b8e2011-10-06 02:36:59 +0200258#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200259static int unicode_is_singleton(PyObject *unicode);
Victor Stinnerc729b8e2011-10-06 02:36:59 +0200260#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200261
Alexander Belopolsky40018472011-02-26 01:02:56 +0000262static PyObject *
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200263unicode_fromascii(const unsigned char *s, Py_ssize_t size);
264static PyObject *
265_PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size);
266static PyObject *
267_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
268static PyObject *
269_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
270
271static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000272unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000273 PyObject **errorHandler,const char *encoding, const char *reason,
274 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
275 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
276
Alexander Belopolsky40018472011-02-26 01:02:56 +0000277static void
278raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300279 const char *encoding,
280 const Py_UNICODE *unicode, Py_ssize_t size,
281 Py_ssize_t startpos, Py_ssize_t endpos,
282 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000283
/* Fast detection of ASCII line break characters.

   Lookup table indexed by code point (0x00-0x7F); an entry is 1 when the
   character is treated as a line break and 0 otherwise.  Entries set to 1:

     0x000A LINE FEED               0x001C FILE SEPARATOR
     0x000B LINE TABULATION         0x001D GROUP SEPARATOR
     0x000C FORM FEED               0x001E RECORD SEPARATOR
     0x000D CARRIAGE RETURN

   The table is only ever read, so it is declared const. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F */ 0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F */ 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x27 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x2F */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x37 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x38 - 0x3F */ 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x47 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x48 - 0x4F */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x57 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x58 - 0x5F */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 - 0x67 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x68 - 0x6F */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x77 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x78 - 0x7F */ 0, 0, 0, 0, 0, 0, 0, 0
};
311
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300312/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
313 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000314Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000315PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000316{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000317#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000318 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000319#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000320 /* This is actually an illegal character, so it should
321 not be passed to unichr. */
322 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000323#endif
324}
325
#ifdef Py_DEBUG
/* Debug-build sanity check of the internal state of a Unicode object.

   Every invariant is checked with assert(); when all of them hold the
   function returns 1 (it never returns another value), which lets callers
   write assert(_PyUnicode_CheckConsistency(op, 0)).

   The state flags (kind, ascii, compact, ready) must form one of the
   supported layouts, and the utf8 / wstr pointers and their lengths must
   agree with that layout.  If check_content is non-zero, the characters
   are also scanned to verify that the narrowest possible kind is used,
   and the hash must still be -1 unless the object is a singleton.

   FIXME: use PyObject* type for op */
int
_PyUnicode_CheckConsistency(void *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii != 1 || ascii->state.compact != 1) {
        /* anything but a compact ASCII string: the object is accessed
           through the PyCompactUnicodeObject fields below */
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact != 1) {
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind != PyUnicode_WCHAR_KIND) {
                /* non-compact string that is ready */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (!ascii->state.ascii) {
                    assert (compact->utf8 != data);
                }
                else {
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
            }
            else {
                /* non-compact string that is not ready: only wstr is set */
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
            }
        }
        else {
            /* compact, non-ASCII: characters follow the struct */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            /* kind whose character width equals sizeof(wchar_t) */
#if SIZEOF_WCHAR_T == 2
            const unsigned int native_kind = PyUnicode_2BYTE_KIND;
#else
            const unsigned int native_kind = PyUnicode_4BYTE_KIND;
#endif
            if (kind == native_kind) {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else {
                assert(ascii->wstr != data);
            }
        }

        if (compact->utf8 == NULL) {
            assert(compact->utf8_length == 0);
        }
        if (ascii->wstr == NULL) {
            assert(compact->wstr_length == 0);
        }
    }
    else {
        /* compact ASCII string */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }

    if (check_content) {
        /* check that the best kind is used */
        if (kind != PyUnicode_WCHAR_KIND) {
            Py_ssize_t i = 0;
            Py_UCS4 maxchar = 0;
            void *data = PyUnicode_DATA(ascii);

            while (i < ascii->length) {
                Py_UCS4 ch = PyUnicode_READ(kind, data, i);
                if (maxchar < ch) {
                    maxchar = ch;
                }
                i++;
            }
            if (kind == PyUnicode_1BYTE_KIND) {
                if (ascii->state.ascii == 0) {
                    assert(maxchar >= 128);
                }
                else {
                    assert(maxchar < 128);
                }
            }
            else if (kind == PyUnicode_2BYTE_KIND) {
                assert(maxchar >= 0x100);
            }
            else {
                assert(maxchar >= 0x10000);
            }
        }
        if (!unicode_is_singleton((PyObject*)ascii)) {
            assert(ascii->hash == -1);
        }
    }
    return 1;
}
#endif
431
Thomas Wouters477c8d52006-05-27 19:21:47 +0000432/* --- Bloom Filters ----------------------------------------------------- */
433
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the low
   bits of each unicode character (masked with BLOOM_WIDTH - 1, i.e.
   5 to 7 bits depending on the width of long) as the bit index. */
437
438/* the linebreak mask is set up by Unicode_Init below */
439
Antoine Pitrouf068f942010-01-13 14:19:12 +0000440#if LONG_BIT >= 128
441#define BLOOM_WIDTH 128
442#elif LONG_BIT >= 64
443#define BLOOM_WIDTH 64
444#elif LONG_BIT >= 32
445#define BLOOM_WIDTH 32
446#else
447#error "LONG_BIT is smaller than 32"
448#endif
449
Thomas Wouters477c8d52006-05-27 19:21:47 +0000450#define BLOOM_MASK unsigned long
451
452static BLOOM_MASK bloom_linebreak;
453
/* BLOOM_ADD sets, and BLOOM tests, the bit of mask selected by the low
   bits of ch (ch masked with BLOOM_WIDTH - 1).  BLOOM evaluates to a
   non-zero value when the bit is set.  mask must be a modifiable lvalue
   for BLOOM_ADD; for BLOOM it may be any expression. */
#define BLOOM_ADD(mask, ch) ((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))
#define BLOOM(mask, ch)     ((mask) &  (1UL << ((ch) & (BLOOM_WIDTH - 1))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000456
Benjamin Peterson29060642009-01-31 22:14:21 +0000457#define BLOOM_LINEBREAK(ch) \
458 ((ch) < 128U ? ascii_linebreak[(ch)] : \
459 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000460
Alexander Belopolsky40018472011-02-26 01:02:56 +0000461Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200462make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000463{
464 /* calculate simple bloom-style bitmask for a given unicode string */
465
Antoine Pitrouf068f942010-01-13 14:19:12 +0000466 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000467 Py_ssize_t i;
468
469 mask = 0;
470 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200471 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000472
473 return mask;
474}
475
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200476#define BLOOM_MEMBER(mask, chr, str) \
477 (BLOOM(mask, chr) \
478 && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000479
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200480/* Compilation of templated routines */
481
482#include "stringlib/asciilib.h"
483#include "stringlib/fastsearch.h"
484#include "stringlib/partition.h"
485#include "stringlib/split.h"
486#include "stringlib/count.h"
487#include "stringlib/find.h"
488#include "stringlib/find_max_char.h"
489#include "stringlib/localeutil.h"
490#include "stringlib/undef.h"
491
492#include "stringlib/ucs1lib.h"
493#include "stringlib/fastsearch.h"
494#include "stringlib/partition.h"
495#include "stringlib/split.h"
496#include "stringlib/count.h"
497#include "stringlib/find.h"
498#include "stringlib/find_max_char.h"
499#include "stringlib/localeutil.h"
500#include "stringlib/undef.h"
501
502#include "stringlib/ucs2lib.h"
503#include "stringlib/fastsearch.h"
504#include "stringlib/partition.h"
505#include "stringlib/split.h"
506#include "stringlib/count.h"
507#include "stringlib/find.h"
508#include "stringlib/find_max_char.h"
509#include "stringlib/localeutil.h"
510#include "stringlib/undef.h"
511
512#include "stringlib/ucs4lib.h"
513#include "stringlib/fastsearch.h"
514#include "stringlib/partition.h"
515#include "stringlib/split.h"
516#include "stringlib/count.h"
517#include "stringlib/find.h"
518#include "stringlib/find_max_char.h"
519#include "stringlib/localeutil.h"
520#include "stringlib/undef.h"
521
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200522#include "stringlib/unicodedefs.h"
523#include "stringlib/fastsearch.h"
524#include "stringlib/count.h"
525#include "stringlib/find.h"
526
Guido van Rossumd57fd912000-03-10 22:53:23 +0000527/* --- Unicode Object ----------------------------------------------------- */
528
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200529static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200530fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200531
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200532Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
533 Py_ssize_t size, Py_UCS4 ch,
534 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200535{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200536 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
537
538 switch (kind) {
539 case PyUnicode_1BYTE_KIND:
540 {
541 Py_UCS1 ch1 = (Py_UCS1) ch;
542 if (ch1 == ch)
543 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
544 else
545 return -1;
546 }
547 case PyUnicode_2BYTE_KIND:
548 {
549 Py_UCS2 ch2 = (Py_UCS2) ch;
550 if (ch2 == ch)
551 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
552 else
553 return -1;
554 }
555 case PyUnicode_4BYTE_KIND:
556 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
557 default:
558 assert(0);
559 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200560 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200561}
562
Victor Stinnerfe226c02011-10-03 03:52:20 +0200563static PyObject*
564resize_compact(PyObject *unicode, Py_ssize_t length)
565{
566 Py_ssize_t char_size;
567 Py_ssize_t struct_size;
568 Py_ssize_t new_size;
569 int share_wstr;
570
571 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200572 char_size = PyUnicode_KIND(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200573 if (PyUnicode_IS_COMPACT_ASCII(unicode))
574 struct_size = sizeof(PyASCIIObject);
575 else
576 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200577 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200578
579 _Py_DEC_REFTOTAL;
580 _Py_ForgetReference(unicode);
581
582 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
583 PyErr_NoMemory();
584 return NULL;
585 }
586 new_size = (struct_size + (length + 1) * char_size);
587
588 unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
589 if (unicode == NULL) {
590 PyObject_Del(unicode);
591 PyErr_NoMemory();
592 return NULL;
593 }
594 _Py_NewReference(unicode);
595 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200596 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200597 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200598 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
599 _PyUnicode_WSTR_LENGTH(unicode) = length;
600 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200601 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
602 length, 0);
603 return unicode;
604}
605
Alexander Belopolsky40018472011-02-26 01:02:56 +0000606static int
Victor Stinner95663112011-10-04 01:03:50 +0200607resize_inplace(PyUnicodeObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000608{
Victor Stinner95663112011-10-04 01:03:50 +0200609 wchar_t *wstr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200610 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200611 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000612
Victor Stinner95663112011-10-04 01:03:50 +0200613 _PyUnicode_DIRTY(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200614
615 if (PyUnicode_IS_READY(unicode)) {
616 Py_ssize_t char_size;
617 Py_ssize_t new_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200618 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200619 void *data;
620
621 data = _PyUnicode_DATA_ANY(unicode);
622 assert(data != NULL);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200623 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200624 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
625 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinner95663112011-10-04 01:03:50 +0200626 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
627 {
628 PyObject_DEL(_PyUnicode_UTF8(unicode));
629 _PyUnicode_UTF8(unicode) = NULL;
630 _PyUnicode_UTF8_LENGTH(unicode) = 0;
631 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200632
633 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
634 PyErr_NoMemory();
635 return -1;
636 }
637 new_size = (length + 1) * char_size;
638
639 data = (PyObject *)PyObject_REALLOC(data, new_size);
640 if (data == NULL) {
641 PyErr_NoMemory();
642 return -1;
643 }
644 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200645 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200646 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200647 _PyUnicode_WSTR_LENGTH(unicode) = length;
648 }
649 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200650 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200651 _PyUnicode_UTF8_LENGTH(unicode) = length;
652 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200653 _PyUnicode_LENGTH(unicode) = length;
654 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinner95663112011-10-04 01:03:50 +0200655 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200656 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200657 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200658 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200659 }
Victor Stinner95663112011-10-04 01:03:50 +0200660 assert(_PyUnicode_WSTR(unicode) != NULL);
661
662 /* check for integer overflow */
663 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
664 PyErr_NoMemory();
665 return -1;
666 }
667 wstr = _PyUnicode_WSTR(unicode);
668 wstr = PyObject_REALLOC(wstr, sizeof(wchar_t) * (length + 1));
669 if (!wstr) {
670 PyErr_NoMemory();
671 return -1;
672 }
673 _PyUnicode_WSTR(unicode) = wstr;
674 _PyUnicode_WSTR(unicode)[length] = 0;
675 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200676 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000677 return 0;
678}
679
Victor Stinnerfe226c02011-10-03 03:52:20 +0200680static PyObject*
681resize_copy(PyObject *unicode, Py_ssize_t length)
682{
683 Py_ssize_t copy_length;
684 if (PyUnicode_IS_COMPACT(unicode)) {
685 PyObject *copy;
686 assert(PyUnicode_IS_READY(unicode));
687
688 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
689 if (copy == NULL)
690 return NULL;
691
692 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200693 copy_characters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200694 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200695 }
696 else {
Victor Stinner2fd82272011-10-03 04:06:05 +0200697 PyUnicodeObject *w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200698 assert(_PyUnicode_WSTR(unicode) != NULL);
699 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinner2fd82272011-10-03 04:06:05 +0200700 w = _PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200701 if (w == NULL)
702 return NULL;
703 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
704 copy_length = Py_MIN(copy_length, length);
705 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
706 copy_length);
707 return (PyObject*)w;
708 }
709}
710
/* We allocate one more character (not byte) than requested to make sure
   the string is always U+0000 terminated; some code (e.g. new_identifier)
   relies on that.

   XXX This allocator could further be enhanced by assuring that the
       free list never reduces its size below 1.

*/
719
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200720#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +0200721static int unicode_old_new_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200722#endif
723
Alexander Belopolsky40018472011-02-26 01:02:56 +0000724static PyUnicodeObject *
725_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000726{
727 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200728 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000729
Thomas Wouters477c8d52006-05-27 19:21:47 +0000730 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000731 if (length == 0 && unicode_empty != NULL) {
732 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200733 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000734 }
735
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000736 /* Ensure we won't overflow the size. */
737 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
738 return (PyUnicodeObject *)PyErr_NoMemory();
739 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200740 if (length < 0) {
741 PyErr_SetString(PyExc_SystemError,
742 "Negative size passed to _PyUnicode_New");
743 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000744 }
745
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200746#ifdef Py_DEBUG
747 ++unicode_old_new_calls;
748#endif
749
750 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
751 if (unicode == NULL)
752 return NULL;
753 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
754 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
755 if (!_PyUnicode_WSTR(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000756 PyErr_NoMemory();
757 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000758 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200759
Jeremy Hyltond8082792003-09-16 19:41:39 +0000760 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000761 * the caller fails before initializing str -- unicode_resize()
762 * reads str[0], and the Keep-Alive optimization can keep memory
763 * allocated for str alive across a call to unicode_dealloc(unicode).
764 * We don't want unicode_resize to read uninitialized memory in
765 * that case.
766 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200767 _PyUnicode_WSTR(unicode)[0] = 0;
768 _PyUnicode_WSTR(unicode)[length] = 0;
769 _PyUnicode_WSTR_LENGTH(unicode) = length;
770 _PyUnicode_HASH(unicode) = -1;
771 _PyUnicode_STATE(unicode).interned = 0;
772 _PyUnicode_STATE(unicode).kind = 0;
773 _PyUnicode_STATE(unicode).compact = 0;
774 _PyUnicode_STATE(unicode).ready = 0;
775 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200776 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200777 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200778 _PyUnicode_UTF8(unicode) = NULL;
779 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000780 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000781
Benjamin Peterson29060642009-01-31 22:14:21 +0000782 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000783 /* XXX UNREF/NEWREF interface should be more symmetrical */
784 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000785 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000786 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000787 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000788}
789
Victor Stinnerf42dc442011-10-02 23:33:16 +0200790static const char*
791unicode_kind_name(PyObject *unicode)
792{
Victor Stinner42dfd712011-10-03 14:41:45 +0200793 /* don't check consistency: unicode_kind_name() is called from
794 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200795 if (!PyUnicode_IS_COMPACT(unicode))
796 {
797 if (!PyUnicode_IS_READY(unicode))
798 return "wstr";
799 switch(PyUnicode_KIND(unicode))
800 {
801 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200802 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200803 return "legacy ascii";
804 else
805 return "legacy latin1";
806 case PyUnicode_2BYTE_KIND:
807 return "legacy UCS2";
808 case PyUnicode_4BYTE_KIND:
809 return "legacy UCS4";
810 default:
811 return "<legacy invalid kind>";
812 }
813 }
814 assert(PyUnicode_IS_READY(unicode));
815 switch(PyUnicode_KIND(unicode))
816 {
817 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200818 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200819 return "ascii";
820 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200821 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200822 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200823 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200824 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200825 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200826 default:
827 return "<invalid compact kind>";
828 }
829}
830
#ifdef Py_DEBUG
/* Number of objects allocated through PyUnicode_New() (debug builds). */
static int unicode_new_new_calls = 0;

/* Functions wrapping macros for use in debugger */

/* Function form of the PyUnicode_UTF8() macro. */
char *_PyUnicode_utf8(void *unicode){
    return PyUnicode_UTF8(unicode);
}

/* Function form of the _PyUnicode_COMPACT_DATA() macro. */
void *_PyUnicode_compact_data(void *unicode) {
    return _PyUnicode_COMPACT_DATA(unicode);
}

/* Print the addresses and flags used to locate the character data of
   `unicode` on stdout, then return PyUnicode_DATA(unicode). */
void *_PyUnicode_data(void *unicode){
    printf("obj %p\n", unicode);
    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
    printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
    printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
    return PyUnicode_DATA(unicode);
}

/* Print a one-line summary of the internal layout of `op` on stdout:
   representation name, length, wstr pointer, and (except for compact ASCII
   strings, which have no such fields) the wstr length and UTF-8 cache,
   followed by the canonical data pointer.  "shared" is printed in front of
   a buffer that is the same memory as the canonical data. */
void
_PyUnicode_Dump(PyObject *op)
{
    PyASCIIObject *ascii = (PyASCIIObject *)op;
    PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
    PyUnicodeObject *unicode = (PyUnicodeObject *)op;
    void *data;

    printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);

    if (ascii->state.compact) {
        /* Compact ASCII strings use the smaller PyASCIIObject header, so
           their characters start right after it; every other compact
           string stores them after the PyCompactUnicodeObject header
           (this mirrors the layout set up by PyUnicode_New()). */
        if (ascii->state.ascii)
            data = (ascii + 1);
        else
            data = (compact + 1);
    }
    else
        data = unicode->data.any;

    if (ascii->wstr == data)
        printf("shared ");
    /* %p requires a void pointer argument. */
    printf("wstr=%p", (void *)ascii->wstr);

    if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
        printf(" (%zu), ", compact->wstr_length);
        if (!ascii->state.compact && compact->utf8 == unicode->data.any)
            printf("shared ");
        printf("utf8=%p (%zu)", (void *)compact->utf8, compact->utf8_length);
    }
    printf(", data=%p\n", data);
}
#endif
876
877PyObject *
878PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
879{
880 PyObject *obj;
881 PyCompactUnicodeObject *unicode;
882 void *data;
883 int kind_state;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200884 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200885 Py_ssize_t char_size;
886 Py_ssize_t struct_size;
887
888 /* Optimization for empty strings */
889 if (size == 0 && unicode_empty != NULL) {
890 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200891 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200892 }
893
894#ifdef Py_DEBUG
895 ++unicode_new_new_calls;
896#endif
897
Victor Stinner9e9d6892011-10-04 01:02:02 +0200898 is_ascii = 0;
899 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200900 struct_size = sizeof(PyCompactUnicodeObject);
901 if (maxchar < 128) {
902 kind_state = PyUnicode_1BYTE_KIND;
903 char_size = 1;
904 is_ascii = 1;
905 struct_size = sizeof(PyASCIIObject);
906 }
907 else if (maxchar < 256) {
908 kind_state = PyUnicode_1BYTE_KIND;
909 char_size = 1;
910 }
911 else if (maxchar < 65536) {
912 kind_state = PyUnicode_2BYTE_KIND;
913 char_size = 2;
914 if (sizeof(wchar_t) == 2)
915 is_sharing = 1;
916 }
917 else {
918 kind_state = PyUnicode_4BYTE_KIND;
919 char_size = 4;
920 if (sizeof(wchar_t) == 4)
921 is_sharing = 1;
922 }
923
924 /* Ensure we won't overflow the size. */
925 if (size < 0) {
926 PyErr_SetString(PyExc_SystemError,
927 "Negative size passed to PyUnicode_New");
928 return NULL;
929 }
930 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
931 return PyErr_NoMemory();
932
933 /* Duplicated allocation code from _PyObject_New() instead of a call to
934 * PyObject_New() so we are able to allocate space for the object and
935 * it's data buffer.
936 */
937 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
938 if (obj == NULL)
939 return PyErr_NoMemory();
940 obj = PyObject_INIT(obj, &PyUnicode_Type);
941 if (obj == NULL)
942 return NULL;
943
944 unicode = (PyCompactUnicodeObject *)obj;
945 if (is_ascii)
946 data = ((PyASCIIObject*)obj) + 1;
947 else
948 data = unicode + 1;
949 _PyUnicode_LENGTH(unicode) = size;
950 _PyUnicode_HASH(unicode) = -1;
951 _PyUnicode_STATE(unicode).interned = 0;
952 _PyUnicode_STATE(unicode).kind = kind_state;
953 _PyUnicode_STATE(unicode).compact = 1;
954 _PyUnicode_STATE(unicode).ready = 1;
955 _PyUnicode_STATE(unicode).ascii = is_ascii;
956 if (is_ascii) {
957 ((char*)data)[size] = 0;
958 _PyUnicode_WSTR(unicode) = NULL;
959 }
960 else if (kind_state == PyUnicode_1BYTE_KIND) {
961 ((char*)data)[size] = 0;
962 _PyUnicode_WSTR(unicode) = NULL;
963 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200964 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200965 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200966 }
967 else {
968 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200969 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200970 if (kind_state == PyUnicode_2BYTE_KIND)
971 ((Py_UCS2*)data)[size] = 0;
972 else /* kind_state == PyUnicode_4BYTE_KIND */
973 ((Py_UCS4*)data)[size] = 0;
974 if (is_sharing) {
975 _PyUnicode_WSTR_LENGTH(unicode) = size;
976 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
977 }
978 else {
979 _PyUnicode_WSTR_LENGTH(unicode) = 0;
980 _PyUnicode_WSTR(unicode) = NULL;
981 }
982 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200983 assert(_PyUnicode_CheckConsistency(unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200984 return obj;
985}
986
#if SIZEOF_WCHAR_T == 2
/* Expand the 16-bit wchar_t units in [begin, end) into the 4-byte
   character buffer of `unicode`.  A high surrogate (0xD800-0xDBFF)
   immediately followed by a low surrogate (0xDC00-0xDFFF) is combined into
   a single code point; every other unit, including a lone surrogate, is
   copied unchanged.  The remaining conversions are implemented as macros
   for efficiency.

   `unicode` must already have the 4-byte kind and a length equal to the
   number of code points produced (both are asserted).  This function
   assumes that unicode can hold one more code point than wstr characters
   for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyUnicodeObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        int is_pair;

        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));

        is_pair = (src[0] >= 0xD800 && src[0] <= 0xDBFF
                   && (src + 1) < end
                   && src[1] >= 0xDC00 && src[1] <= 0xDFFF);
        if (!is_pair) {
            *dst++ = *src++;
            continue;
        }

        /* 10 payload bits from each half, offset by 0x10000. */
        *dst++ = (((src[0] & 0x3FF)<<10) | (src[1] & 0x3FF)) + 0x10000;
        src += 2;
    }

    /* The output must have been filled exactly. */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1025
Victor Stinnercd9950f2011-10-02 00:34:53 +02001026static int
1027_PyUnicode_Dirty(PyObject *unicode)
1028{
Victor Stinner910337b2011-10-03 03:20:16 +02001029 assert(_PyUnicode_CHECK(unicode));
Victor Stinnercd9950f2011-10-02 00:34:53 +02001030 if (Py_REFCNT(unicode) != 1) {
Victor Stinner01698042011-10-04 00:04:26 +02001031 PyErr_SetString(PyExc_SystemError,
Victor Stinnercd9950f2011-10-02 00:34:53 +02001032 "Cannot modify a string having more than 1 reference");
1033 return -1;
1034 }
1035 _PyUnicode_DIRTY(unicode);
1036 return 0;
1037}
1038
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001039static int
1040_copy_characters(PyObject *to, Py_ssize_t to_start,
1041 PyObject *from, Py_ssize_t from_start,
1042 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001043{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001044 unsigned int from_kind, to_kind;
1045 void *from_data, *to_data;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001046 int fast;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001047
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001048 assert(PyUnicode_Check(from));
1049 assert(PyUnicode_Check(to));
1050 assert(PyUnicode_IS_READY(from));
1051 assert(PyUnicode_IS_READY(to));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001052
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001053 assert(PyUnicode_GET_LENGTH(from) >= how_many);
1054 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1055 assert(0 <= how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001056
Victor Stinnerf5ca1a22011-09-28 23:54:59 +02001057 if (how_many == 0)
1058 return 0;
1059
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001060 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001061 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001062 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001063 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001064
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001065#ifdef Py_DEBUG
1066 if (!check_maxchar
1067 && (from_kind > to_kind
1068 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001069 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001070 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1071 Py_UCS4 ch;
1072 Py_ssize_t i;
1073 for (i=0; i < how_many; i++) {
1074 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1075 assert(ch <= to_maxchar);
1076 }
1077 }
1078#endif
1079 fast = (from_kind == to_kind);
1080 if (check_maxchar
1081 && (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
1082 {
1083 /* deny latin1 => ascii */
1084 fast = 0;
1085 }
1086
1087 if (fast) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001088 Py_MEMCPY((char*)to_data + to_kind * to_start,
1089 (char*)from_data + from_kind * from_start,
1090 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001091 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001092 else if (from_kind == PyUnicode_1BYTE_KIND
1093 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001094 {
1095 _PyUnicode_CONVERT_BYTES(
1096 Py_UCS1, Py_UCS2,
1097 PyUnicode_1BYTE_DATA(from) + from_start,
1098 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1099 PyUnicode_2BYTE_DATA(to) + to_start
1100 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001101 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001102 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001103 && to_kind == PyUnicode_4BYTE_KIND)
1104 {
1105 _PyUnicode_CONVERT_BYTES(
1106 Py_UCS1, Py_UCS4,
1107 PyUnicode_1BYTE_DATA(from) + from_start,
1108 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1109 PyUnicode_4BYTE_DATA(to) + to_start
1110 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001111 }
1112 else if (from_kind == PyUnicode_2BYTE_KIND
1113 && to_kind == PyUnicode_4BYTE_KIND)
1114 {
1115 _PyUnicode_CONVERT_BYTES(
1116 Py_UCS2, Py_UCS4,
1117 PyUnicode_2BYTE_DATA(from) + from_start,
1118 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1119 PyUnicode_4BYTE_DATA(to) + to_start
1120 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001121 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001122 else {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001123 /* check if max_char(from substring) <= max_char(to) */
1124 if (from_kind > to_kind
1125 /* latin1 => ascii */
Victor Stinnerb9275c12011-10-05 14:01:42 +02001126 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001127 {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001128 /* slow path to check for character overflow */
1129 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001130 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001131 Py_ssize_t i;
1132
Victor Stinner56c161a2011-10-06 02:47:11 +02001133#ifdef Py_DEBUG
Victor Stinnera0702ab2011-09-29 14:14:38 +02001134 for (i=0; i < how_many; i++) {
1135 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinner56c161a2011-10-06 02:47:11 +02001136 assert(ch <= to_maxchar);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001137 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1138 }
Victor Stinner56c161a2011-10-06 02:47:11 +02001139#else
1140 if (!check_maxchar) {
1141 for (i=0; i < how_many; i++) {
1142 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1143 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1144 }
1145 }
1146 else {
1147 for (i=0; i < how_many; i++) {
1148 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1149 if (ch > to_maxchar)
1150 return 1;
1151 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1152 }
1153 }
1154#endif
Victor Stinnera0702ab2011-09-29 14:14:38 +02001155 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001156 else {
Victor Stinner56c161a2011-10-06 02:47:11 +02001157 assert(0 && "inconsistent state");
1158 return 1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001159 }
1160 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001161 return 0;
1162}
1163
1164static void
1165copy_characters(PyObject *to, Py_ssize_t to_start,
1166 PyObject *from, Py_ssize_t from_start,
1167 Py_ssize_t how_many)
1168{
1169 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1170}
1171
1172Py_ssize_t
1173PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1174 PyObject *from, Py_ssize_t from_start,
1175 Py_ssize_t how_many)
1176{
1177 int err;
1178
1179 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1180 PyErr_BadInternalCall();
1181 return -1;
1182 }
1183
1184 if (PyUnicode_READY(from))
1185 return -1;
1186 if (PyUnicode_READY(to))
1187 return -1;
1188
1189 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1190 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1191 PyErr_Format(PyExc_SystemError,
1192 "Cannot write %zi characters at %zi "
1193 "in a string of %zi characters",
1194 how_many, to_start, PyUnicode_GET_LENGTH(to));
1195 return -1;
1196 }
1197
1198 if (how_many == 0)
1199 return 0;
1200
1201 if (_PyUnicode_Dirty(to))
1202 return -1;
1203
1204 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1205 if (err) {
1206 PyErr_Format(PyExc_SystemError,
1207 "Cannot copy %s characters "
1208 "into a string of %s characters",
1209 unicode_kind_name(from),
1210 unicode_kind_name(to));
1211 return -1;
1212 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001213 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001214}
1215
Victor Stinner17222162011-09-28 22:15:37 +02001216/* Find the maximum code point and count the number of surrogate pairs so a
1217 correct string length can be computed before converting a string to UCS4.
1218 This function counts single surrogates as a character and not as a pair.
1219
1220 Return 0 on success, or -1 on error. */
1221static int
1222find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1223 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001224{
1225 const wchar_t *iter;
1226
Victor Stinnerc53be962011-10-02 21:33:54 +02001227 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001228 *num_surrogates = 0;
1229 *maxchar = 0;
1230
1231 for (iter = begin; iter < end; ) {
Victor Stinnerae864852011-10-05 14:02:44 +02001232 if (*iter > *maxchar) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001233 *maxchar = *iter;
Victor Stinnerae864852011-10-05 14:02:44 +02001234#if SIZEOF_WCHAR_T != 2
1235 if (*maxchar >= 0x10000)
1236 return 0;
1237#endif
1238 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001239#if SIZEOF_WCHAR_T == 2
1240 if (*iter >= 0xD800 && *iter <= 0xDBFF
1241 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
1242 {
1243 Py_UCS4 surrogate_val;
1244 surrogate_val = (((iter[0] & 0x3FF)<<10)
1245 | (iter[1] & 0x3FF)) + 0x10000;
1246 ++(*num_surrogates);
1247 if (surrogate_val > *maxchar)
1248 *maxchar = surrogate_val;
1249 iter += 2;
1250 }
1251 else
1252 iter++;
1253#else
1254 iter++;
1255#endif
1256 }
1257 return 0;
1258}
1259
1260#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +02001261static int unicode_ready_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001262#endif
1263
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001264static int
1265unicode_ready(PyObject **p_obj, int replace)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001266{
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001267 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001268 wchar_t *end;
1269 Py_UCS4 maxchar = 0;
1270 Py_ssize_t num_surrogates;
1271#if SIZEOF_WCHAR_T == 2
1272 Py_ssize_t length_wo_surrogates;
1273#endif
1274
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001275 assert(p_obj != NULL);
1276 unicode = (PyUnicodeObject *)*p_obj;
1277
Georg Brandl7597add2011-10-05 16:36:47 +02001278 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001279 strings were created using _PyObject_New() and where no canonical
1280 representation (the str field) has been set yet aka strings
1281 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001282 assert(_PyUnicode_CHECK(unicode));
1283 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001284 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001285 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001286 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001287 /* Actually, it should neither be interned nor be anything else: */
1288 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001289
1290#ifdef Py_DEBUG
1291 ++unicode_ready_calls;
1292#endif
1293
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001294#ifdef Py_DEBUG
1295 assert(!replace || Py_REFCNT(unicode) == 1);
1296#else
1297 if (replace && Py_REFCNT(unicode) != 1)
1298 replace = 0;
1299#endif
1300 if (replace) {
1301 Py_ssize_t len = _PyUnicode_WSTR_LENGTH(unicode);
1302 wchar_t *wstr = _PyUnicode_WSTR(unicode);
1303 /* Optimization for empty strings */
1304 if (len == 0) {
1305 Py_INCREF(unicode_empty);
1306 Py_DECREF(*p_obj);
1307 *p_obj = unicode_empty;
1308 return 0;
1309 }
1310 if (len == 1 && wstr[0] < 256) {
1311 PyObject *latin1_char = get_latin1_char((unsigned char)wstr[0]);
1312 if (latin1_char == NULL)
1313 return -1;
1314 Py_DECREF(*p_obj);
1315 *p_obj = latin1_char;
1316 return 0;
1317 }
1318 }
1319
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001320 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001321 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001322 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001323 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001324
1325 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001326 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1327 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001328 PyErr_NoMemory();
1329 return -1;
1330 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001331 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001332 _PyUnicode_WSTR(unicode), end,
1333 PyUnicode_1BYTE_DATA(unicode));
1334 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1335 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1336 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1337 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001338 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001339 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001340 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001341 }
1342 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001343 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001344 _PyUnicode_UTF8(unicode) = NULL;
1345 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001346 }
1347 PyObject_FREE(_PyUnicode_WSTR(unicode));
1348 _PyUnicode_WSTR(unicode) = NULL;
1349 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1350 }
1351 /* In this case we might have to convert down from 4-byte native
1352 wchar_t to 2-byte unicode. */
1353 else if (maxchar < 65536) {
1354 assert(num_surrogates == 0 &&
1355 "FindMaxCharAndNumSurrogatePairs() messed up");
1356
Victor Stinner506f5922011-09-28 22:34:18 +02001357#if SIZEOF_WCHAR_T == 2
1358 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001359 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001360 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1361 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1362 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001363 _PyUnicode_UTF8(unicode) = NULL;
1364 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001365#else
1366 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001367 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001368 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001369 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001370 PyErr_NoMemory();
1371 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001372 }
Victor Stinner506f5922011-09-28 22:34:18 +02001373 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1374 _PyUnicode_WSTR(unicode), end,
1375 PyUnicode_2BYTE_DATA(unicode));
1376 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1377 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1378 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001379 _PyUnicode_UTF8(unicode) = NULL;
1380 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001381 PyObject_FREE(_PyUnicode_WSTR(unicode));
1382 _PyUnicode_WSTR(unicode) = NULL;
1383 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1384#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001385 }
1386 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1387 else {
1388#if SIZEOF_WCHAR_T == 2
1389 /* in case the native representation is 2-bytes, we need to allocate a
1390 new normalized 4-byte version. */
1391 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001392 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1393 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001394 PyErr_NoMemory();
1395 return -1;
1396 }
1397 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1398 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001399 _PyUnicode_UTF8(unicode) = NULL;
1400 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001401 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1402 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001403 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001404 PyObject_FREE(_PyUnicode_WSTR(unicode));
1405 _PyUnicode_WSTR(unicode) = NULL;
1406 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1407#else
1408 assert(num_surrogates == 0);
1409
Victor Stinnerc3c74152011-10-02 20:39:55 +02001410 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001411 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001412 _PyUnicode_UTF8(unicode) = NULL;
1413 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001414 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1415#endif
1416 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1417 }
1418 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001419 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001420 return 0;
1421}
1422
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001423int
1424_PyUnicode_ReadyReplace(PyObject **op)
1425{
1426 return unicode_ready(op, 1);
1427}
1428
1429int
1430_PyUnicode_Ready(PyObject *op)
1431{
1432 return unicode_ready(&op, 0);
1433}
1434
Alexander Belopolsky40018472011-02-26 01:02:56 +00001435static void
1436unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001437{
Walter Dörwald16807132007-05-25 13:52:07 +00001438 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001439 case SSTATE_NOT_INTERNED:
1440 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001441
Benjamin Peterson29060642009-01-31 22:14:21 +00001442 case SSTATE_INTERNED_MORTAL:
1443 /* revive dead object temporarily for DelItem */
1444 Py_REFCNT(unicode) = 3;
1445 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
1446 Py_FatalError(
1447 "deletion of interned string failed");
1448 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001449
Benjamin Peterson29060642009-01-31 22:14:21 +00001450 case SSTATE_INTERNED_IMMORTAL:
1451 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001452
Benjamin Peterson29060642009-01-31 22:14:21 +00001453 default:
1454 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001455 }
1456
Victor Stinner03490912011-10-03 23:45:12 +02001457 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001458 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001459 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001460 PyObject_DEL(_PyUnicode_UTF8(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001461
1462 if (PyUnicode_IS_COMPACT(unicode)) {
1463 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001464 }
1465 else {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001466 if (_PyUnicode_DATA_ANY(unicode))
1467 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Benjamin Peterson29060642009-01-31 22:14:21 +00001468 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001469 }
1470}
1471
#ifdef Py_DEBUG
/* Debug-only helper: return 1 if unicode is one of the shared objects kept
   by this file (unicode_empty, or the entry of unicode_latin1[] matching its
   single character), 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    /* Only a one-character string that is not of the wchar kind can be a
       cached Latin-1 character. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    if (ch < 256 && unicode_latin1[ch] == unicode)
        return 1;
    return 0;
}
#endif
1488
Alexander Belopolsky40018472011-02-26 01:02:56 +00001489static int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001490unicode_resizable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001491{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001492 if (Py_REFCNT(unicode) != 1)
1493 return 0;
1494 if (PyUnicode_CHECK_INTERNED(unicode))
1495 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001496#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001497 /* singleton refcount is greater than 1 */
1498 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001499#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001500 return 1;
1501}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001502
Victor Stinnerfe226c02011-10-03 03:52:20 +02001503static int
1504unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1505{
1506 PyObject *unicode;
1507 Py_ssize_t old_length;
1508
1509 assert(p_unicode != NULL);
1510 unicode = *p_unicode;
1511
1512 assert(unicode != NULL);
1513 assert(PyUnicode_Check(unicode));
1514 assert(0 <= length);
1515
Victor Stinner910337b2011-10-03 03:20:16 +02001516 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001517 old_length = PyUnicode_WSTR_LENGTH(unicode);
1518 else
1519 old_length = PyUnicode_GET_LENGTH(unicode);
1520 if (old_length == length)
1521 return 0;
1522
Victor Stinnerfe226c02011-10-03 03:52:20 +02001523 if (!unicode_resizable(unicode)) {
1524 PyObject *copy = resize_copy(unicode, length);
1525 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001526 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001527 Py_DECREF(*p_unicode);
1528 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001529 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001530 }
1531
Victor Stinnerfe226c02011-10-03 03:52:20 +02001532 if (PyUnicode_IS_COMPACT(unicode)) {
1533 *p_unicode = resize_compact(unicode, length);
1534 if (*p_unicode == NULL)
1535 return -1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001536 assert(_PyUnicode_CheckConsistency(*p_unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001537 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001538 }
1539 return resize_inplace((PyUnicodeObject*)unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001540}
1541
Alexander Belopolsky40018472011-02-26 01:02:56 +00001542int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001543PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001544{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001545 PyObject *unicode;
1546 if (p_unicode == NULL) {
1547 PyErr_BadInternalCall();
1548 return -1;
1549 }
1550 unicode = *p_unicode;
1551 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0
1552 || _PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND)
1553 {
1554 PyErr_BadInternalCall();
1555 return -1;
1556 }
1557 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001558}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001559
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001560static PyObject*
1561get_latin1_char(unsigned char ch)
1562{
Victor Stinnera464fc12011-10-02 20:39:30 +02001563 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001564 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001565 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001566 if (!unicode)
1567 return NULL;
1568 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001569 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001570 unicode_latin1[ch] = unicode;
1571 }
1572 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001573 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001574}
1575
Alexander Belopolsky40018472011-02-26 01:02:56 +00001576PyObject *
1577PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001578{
1579 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001580 Py_UCS4 maxchar = 0;
1581 Py_ssize_t num_surrogates;
1582
1583 if (u == NULL)
1584 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001585
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001586 /* If the Unicode data is known at construction time, we can apply
1587 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001588
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001589 /* Optimization for empty strings */
1590 if (size == 0 && unicode_empty != NULL) {
1591 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001592 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001593 }
Tim Petersced69f82003-09-16 20:30:58 +00001594
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001595 /* Single character Unicode objects in the Latin-1 range are
1596 shared when using this constructor */
1597 if (size == 1 && *u < 256)
1598 return get_latin1_char((unsigned char)*u);
1599
1600 /* If not empty and not single character, copy the Unicode data
1601 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001602 if (find_maxchar_surrogates(u, u + size,
1603 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001604 return NULL;
1605
1606 unicode = (PyUnicodeObject *) PyUnicode_New(size - num_surrogates,
1607 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001608 if (!unicode)
1609 return NULL;
1610
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001611 switch (PyUnicode_KIND(unicode)) {
1612 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001613 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001614 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1615 break;
1616 case PyUnicode_2BYTE_KIND:
1617#if Py_UNICODE_SIZE == 2
1618 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1619#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001620 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001621 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1622#endif
1623 break;
1624 case PyUnicode_4BYTE_KIND:
1625#if SIZEOF_WCHAR_T == 2
1626 /* This is the only case which has to process surrogates, thus
1627 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001628 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001629#else
1630 assert(num_surrogates == 0);
1631 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1632#endif
1633 break;
1634 default:
1635 assert(0 && "Impossible state");
1636 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001637
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001638 assert(_PyUnicode_CheckConsistency(unicode, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001639 return (PyObject *)unicode;
1640}
1641
Alexander Belopolsky40018472011-02-26 01:02:56 +00001642PyObject *
1643PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001644{
1645 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +00001646
Benjamin Peterson14339b62009-01-31 16:36:08 +00001647 if (size < 0) {
1648 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001649 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001650 return NULL;
1651 }
Christian Heimes33fe8092008-04-13 13:53:33 +00001652
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001653 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +00001654 some optimizations which share commonly used objects.
1655 Also, this means the input must be UTF-8, so fall back to the
1656 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001657 if (u != NULL) {
1658
Benjamin Peterson29060642009-01-31 22:14:21 +00001659 /* Optimization for empty strings */
1660 if (size == 0 && unicode_empty != NULL) {
1661 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001662 return unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001663 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001664
1665 /* Single characters are shared when using this constructor.
1666 Restrict to ASCII, since the input must be UTF-8. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001667 if (size == 1 && Py_CHARMASK(*u) < 128)
1668 return get_latin1_char(Py_CHARMASK(*u));
Martin v. Löwis9c121062007-08-05 20:26:11 +00001669
1670 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001671 }
1672
Walter Dörwald55507312007-05-18 13:12:10 +00001673 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001674 if (!unicode)
1675 return NULL;
1676
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001677 return (PyObject *)unicode;
1678}
1679
Alexander Belopolsky40018472011-02-26 01:02:56 +00001680PyObject *
1681PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001682{
1683 size_t size = strlen(u);
1684 if (size > PY_SSIZE_T_MAX) {
1685 PyErr_SetString(PyExc_OverflowError, "input too long");
1686 return NULL;
1687 }
1688
1689 return PyUnicode_FromStringAndSize(u, size);
1690}
1691
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001692PyObject *
1693_PyUnicode_FromId(_Py_Identifier *id)
1694{
1695 if (!id->object) {
1696 id->object = PyUnicode_FromString(id->string);
1697 if (!id->object)
1698 return NULL;
1699 PyUnicode_InternInPlace(&id->object);
1700 assert(!id->next);
1701 id->next = static_strings;
1702 static_strings = id;
1703 }
1704 Py_INCREF(id->object);
1705 return id->object;
1706}
1707
1708void
1709_PyUnicode_ClearStaticStrings()
1710{
1711 _Py_Identifier *i;
1712 for (i = static_strings; i; i = i->next) {
1713 Py_DECREF(i->object);
1714 i->object = NULL;
1715 i->next = NULL;
1716 }
1717}
1718
Victor Stinnere57b1c02011-09-28 22:20:48 +02001719static PyObject*
Victor Stinner0617b6e2011-10-05 23:26:01 +02001720unicode_fromascii(const unsigned char* s, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001721{
Victor Stinner0617b6e2011-10-05 23:26:01 +02001722 PyObject *res;
1723#ifdef Py_DEBUG
1724 const unsigned char *p;
1725 const unsigned char *end = s + size;
1726 for (p=s; p < end; p++) {
1727 assert(*p < 128);
1728 }
1729#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001730 if (size == 1)
1731 return get_latin1_char(s[0]);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001732 res = PyUnicode_New(size, 127);
Victor Stinner702c7342011-10-05 13:50:52 +02001733 if (!res)
1734 return NULL;
Victor Stinner0617b6e2011-10-05 23:26:01 +02001735 memcpy(PyUnicode_1BYTE_DATA(res), s, size);
Victor Stinner702c7342011-10-05 13:50:52 +02001736 return res;
1737}
1738
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001739static Py_UCS4
1740kind_maxchar_limit(unsigned int kind)
1741{
1742 switch(kind) {
1743 case PyUnicode_1BYTE_KIND:
1744 return 0x80;
1745 case PyUnicode_2BYTE_KIND:
1746 return 0x100;
1747 case PyUnicode_4BYTE_KIND:
1748 return 0x10000;
1749 default:
1750 assert(0 && "invalid kind");
1751 return 0x10ffff;
1752 }
1753}
1754
Victor Stinner702c7342011-10-05 13:50:52 +02001755static PyObject*
Victor Stinnere57b1c02011-09-28 22:20:48 +02001756_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001757{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001758 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001759 unsigned char max_char = 127;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001760
1761 assert(size >= 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001762 if (size == 1)
1763 return get_latin1_char(u[0]);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001764 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001765 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001766 if (!res)
1767 return NULL;
1768 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001769 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001770 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001771}
1772
Victor Stinnere57b1c02011-09-28 22:20:48 +02001773static PyObject*
1774_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001775{
1776 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001777 Py_UCS2 max_char = 0;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001778
1779 assert(size >= 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001780 if (size == 1 && u[0] < 256)
Victor Stinner4e101002011-10-11 23:27:52 +02001781 return get_latin1_char((unsigned char)u[0]);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001782 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001783 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001784 if (!res)
1785 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001786 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001787 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001788 else {
1789 _PyUnicode_CONVERT_BYTES(
1790 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1791 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001792 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001793 return res;
1794}
1795
Victor Stinnere57b1c02011-09-28 22:20:48 +02001796static PyObject*
1797_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001798{
1799 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001800 Py_UCS4 max_char = 0;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001801
1802 assert(size >= 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001803 if (size == 1 && u[0] < 256)
1804 return get_latin1_char(u[0]);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001805 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001806 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001807 if (!res)
1808 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02001809 if (max_char < 256)
1810 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
1811 PyUnicode_1BYTE_DATA(res));
1812 else if (max_char < 0x10000)
1813 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
1814 PyUnicode_2BYTE_DATA(res));
1815 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001816 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001817 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001818 return res;
1819}
1820
1821PyObject*
1822PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1823{
1824 switch(kind) {
1825 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001826 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001827 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001828 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001829 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001830 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001831 default:
1832 assert(0 && "invalid kind");
1833 PyErr_SetString(PyExc_SystemError, "invalid kind");
1834 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001835 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001836}
1837
Victor Stinner25a4b292011-10-06 12:31:55 +02001838/* Ensure that a string uses the most efficient storage, if it is not the
1839 case: create a new string with of the right kind. Write NULL into *p_unicode
1840 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02001841static void
Victor Stinner25a4b292011-10-06 12:31:55 +02001842unicode_adjust_maxchar(PyObject **p_unicode)
1843{
1844 PyObject *unicode, *copy;
1845 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001846 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02001847 unsigned int kind;
1848
1849 assert(p_unicode != NULL);
1850 unicode = *p_unicode;
1851 assert(PyUnicode_IS_READY(unicode));
1852 if (PyUnicode_IS_ASCII(unicode))
1853 return;
1854
1855 len = PyUnicode_GET_LENGTH(unicode);
1856 kind = PyUnicode_KIND(unicode);
1857 if (kind == PyUnicode_1BYTE_KIND) {
1858 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001859 max_char = ucs1lib_find_max_char(u, u + len);
1860 if (max_char >= 128)
1861 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001862 }
1863 else if (kind == PyUnicode_2BYTE_KIND) {
1864 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001865 max_char = ucs2lib_find_max_char(u, u + len);
1866 if (max_char >= 256)
1867 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001868 }
1869 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001870 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02001871 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001872 max_char = ucs4lib_find_max_char(u, u + len);
1873 if (max_char >= 0x10000)
1874 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001875 }
Victor Stinner25a4b292011-10-06 12:31:55 +02001876 copy = PyUnicode_New(len, max_char);
1877 copy_characters(copy, 0, unicode, 0, len);
1878 Py_DECREF(unicode);
1879 *p_unicode = copy;
1880}
1881
Victor Stinner034f6cf2011-09-30 02:26:44 +02001882PyObject*
1883PyUnicode_Copy(PyObject *unicode)
1884{
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001885 Py_ssize_t size;
1886 PyObject *copy;
1887 void *data;
1888
Victor Stinner034f6cf2011-09-30 02:26:44 +02001889 if (!PyUnicode_Check(unicode)) {
1890 PyErr_BadInternalCall();
1891 return NULL;
1892 }
1893 if (PyUnicode_READY(unicode))
1894 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001895
1896 size = PyUnicode_GET_LENGTH(unicode);
1897 copy = PyUnicode_New(size, PyUnicode_MAX_CHAR_VALUE(unicode));
1898 if (!copy)
1899 return NULL;
1900 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1901
1902 data = PyUnicode_DATA(unicode);
1903 switch (PyUnicode_KIND(unicode))
1904 {
1905 case PyUnicode_1BYTE_KIND:
1906 memcpy(PyUnicode_1BYTE_DATA(copy), data, size);
1907 break;
1908 case PyUnicode_2BYTE_KIND:
1909 memcpy(PyUnicode_2BYTE_DATA(copy), data, sizeof(Py_UCS2) * size);
1910 break;
1911 case PyUnicode_4BYTE_KIND:
1912 memcpy(PyUnicode_4BYTE_DATA(copy), data, sizeof(Py_UCS4) * size);
1913 break;
1914 default:
1915 assert(0);
1916 break;
1917 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001918 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001919 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02001920}
1921
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001922
Victor Stinnerbc603d12011-10-02 01:00:40 +02001923/* Widen Unicode objects to larger buffers. Don't write terminating null
1924 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001925
1926void*
1927_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1928{
Victor Stinnerbc603d12011-10-02 01:00:40 +02001929 Py_ssize_t len;
1930 void *result;
1931 unsigned int skind;
1932
1933 if (PyUnicode_READY(s))
1934 return NULL;
1935
1936 len = PyUnicode_GET_LENGTH(s);
1937 skind = PyUnicode_KIND(s);
1938 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02001939 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001940 return NULL;
1941 }
1942 switch(kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02001943 case PyUnicode_2BYTE_KIND:
1944 result = PyMem_Malloc(len * sizeof(Py_UCS2));
1945 if (!result)
1946 return PyErr_NoMemory();
1947 assert(skind == PyUnicode_1BYTE_KIND);
1948 _PyUnicode_CONVERT_BYTES(
1949 Py_UCS1, Py_UCS2,
1950 PyUnicode_1BYTE_DATA(s),
1951 PyUnicode_1BYTE_DATA(s) + len,
1952 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001953 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001954 case PyUnicode_4BYTE_KIND:
1955 result = PyMem_Malloc(len * sizeof(Py_UCS4));
1956 if (!result)
1957 return PyErr_NoMemory();
1958 if (skind == PyUnicode_2BYTE_KIND) {
1959 _PyUnicode_CONVERT_BYTES(
1960 Py_UCS2, Py_UCS4,
1961 PyUnicode_2BYTE_DATA(s),
1962 PyUnicode_2BYTE_DATA(s) + len,
1963 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001964 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02001965 else {
1966 assert(skind == PyUnicode_1BYTE_KIND);
1967 _PyUnicode_CONVERT_BYTES(
1968 Py_UCS1, Py_UCS4,
1969 PyUnicode_1BYTE_DATA(s),
1970 PyUnicode_1BYTE_DATA(s) + len,
1971 result);
1972 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001973 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001974 default:
1975 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001976 }
Victor Stinner01698042011-10-04 00:04:26 +02001977 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001978 return NULL;
1979}
1980
1981static Py_UCS4*
1982as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1983 int copy_null)
1984{
1985 int kind;
1986 void *data;
1987 Py_ssize_t len, targetlen;
1988 if (PyUnicode_READY(string) == -1)
1989 return NULL;
1990 kind = PyUnicode_KIND(string);
1991 data = PyUnicode_DATA(string);
1992 len = PyUnicode_GET_LENGTH(string);
1993 targetlen = len;
1994 if (copy_null)
1995 targetlen++;
1996 if (!target) {
1997 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
1998 PyErr_NoMemory();
1999 return NULL;
2000 }
2001 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2002 if (!target) {
2003 PyErr_NoMemory();
2004 return NULL;
2005 }
2006 }
2007 else {
2008 if (targetsize < targetlen) {
2009 PyErr_Format(PyExc_SystemError,
2010 "string is longer than the buffer");
2011 if (copy_null && 0 < targetsize)
2012 target[0] = 0;
2013 return NULL;
2014 }
2015 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002016 if (kind == PyUnicode_1BYTE_KIND) {
2017 Py_UCS1 *start = (Py_UCS1 *) data;
2018 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002019 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002020 else if (kind == PyUnicode_2BYTE_KIND) {
2021 Py_UCS2 *start = (Py_UCS2 *) data;
2022 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2023 }
2024 else {
2025 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002026 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002027 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002028 if (copy_null)
2029 target[len] = 0;
2030 return target;
2031}
2032
2033Py_UCS4*
2034PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2035 int copy_null)
2036{
2037 if (target == NULL || targetsize < 1) {
2038 PyErr_BadInternalCall();
2039 return NULL;
2040 }
2041 return as_ucs4(string, target, targetsize, copy_null);
2042}
2043
2044Py_UCS4*
2045PyUnicode_AsUCS4Copy(PyObject *string)
2046{
2047 return as_ucs4(string, NULL, 0, 1);
2048}
2049
#ifdef HAVE_WCHAR_H

/* Create a unicode object from size wide characters at w.  A size of -1
   means "measure w with wcslen()".  With w == NULL only size == 0 is
   accepted (the result of PyUnicode_New(0, 0) is returned); any other size
   is a bad internal call.  The conversion itself is delegated to
   PyUnicode_FromUnicode(). */
PyObject *
PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
{
    if (w != NULL) {
        if (size == -1)
            size = wcslen(w);
        return PyUnicode_FromUnicode(w, size);
    }

    if (size == 0)
        return PyUnicode_New(0, 0);

    PyErr_BadInternalCall();
    return NULL;
}

#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002070
Walter Dörwald346737f2007-05-31 10:44:43 +00002071static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002072makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
2073 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002074{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002075 *fmt++ = '%';
2076 if (width) {
2077 if (zeropad)
2078 *fmt++ = '0';
2079 fmt += sprintf(fmt, "%d", width);
2080 }
2081 if (precision)
2082 fmt += sprintf(fmt, ".%d", precision);
2083 if (longflag)
2084 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002085 else if (longlongflag) {
2086 /* longlongflag should only ever be nonzero on machines with
2087 HAVE_LONG_LONG defined */
2088#ifdef HAVE_LONG_LONG
2089 char *f = PY_FORMAT_LONG_LONG;
2090 while (*f)
2091 *fmt++ = *f++;
2092#else
2093 /* we shouldn't ever get here */
2094 assert(0);
2095 *fmt++ = 'l';
2096#endif
2097 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002098 else if (size_tflag) {
2099 char *f = PY_FORMAT_SIZE_T;
2100 while (*f)
2101 *fmt++ = *f++;
2102 }
2103 *fmt++ = c;
2104 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002105}
2106
Victor Stinner96865452011-03-01 23:44:09 +00002107/* helper for PyUnicode_FromFormatV() */
2108
2109static const char*
2110parse_format_flags(const char *f,
2111 int *p_width, int *p_precision,
2112 int *p_longflag, int *p_longlongflag, int *p_size_tflag)
2113{
2114 int width, precision, longflag, longlongflag, size_tflag;
2115
2116 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
2117 f++;
2118 width = 0;
2119 while (Py_ISDIGIT((unsigned)*f))
2120 width = (width*10) + *f++ - '0';
2121 precision = 0;
2122 if (*f == '.') {
2123 f++;
2124 while (Py_ISDIGIT((unsigned)*f))
2125 precision = (precision*10) + *f++ - '0';
2126 if (*f == '%') {
2127 /* "%.3%s" => f points to "3" */
2128 f--;
2129 }
2130 }
2131 if (*f == '\0') {
2132 /* bogus format "%.1" => go backward, f points to "1" */
2133 f--;
2134 }
2135 if (p_width != NULL)
2136 *p_width = width;
2137 if (p_precision != NULL)
2138 *p_precision = precision;
2139
2140 /* Handle %ld, %lu, %lld and %llu. */
2141 longflag = 0;
2142 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002143 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002144
2145 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002146 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002147 longflag = 1;
2148 ++f;
2149 }
2150#ifdef HAVE_LONG_LONG
2151 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002152 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002153 longlongflag = 1;
2154 f += 2;
2155 }
2156#endif
2157 }
2158 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002159 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002160 size_tflag = 1;
2161 ++f;
2162 }
2163 if (p_longflag != NULL)
2164 *p_longflag = longflag;
2165 if (p_longlongflag != NULL)
2166 *p_longlongflag = longlongflag;
2167 if (p_size_tflag != NULL)
2168 *p_size_tflag = size_tflag;
2169 return f;
2170}
2171
/* Upper bound on the number of characters produced by a %ld conversion:
   21 characters are enough for a 64-bit integer written in decimal
   together with an optional sign. */
#define MAX_LONG_CHARS 21
/* Upper bound on the number of characters produced by a %lld conversion.
   At most ceil(log10(256)*SIZEOF_LONG_LONG) digits are needed, plus 1 for
   the sign; 53/22 serves as an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2179
Walter Dörwaldd2034312007-05-18 16:29:38 +00002180PyObject *
2181PyUnicode_FromFormatV(const char *format, va_list vargs)
2182{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002183 va_list count;
2184 Py_ssize_t callcount = 0;
2185 PyObject **callresults = NULL;
2186 PyObject **callresult = NULL;
2187 Py_ssize_t n = 0;
2188 int width = 0;
2189 int precision = 0;
2190 int zeropad;
2191 const char* f;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002192 PyObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002193 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002194 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002195 Py_UCS4 maxchar = 127; /* result is ASCII by default */
2196 Py_UCS4 argmaxchar;
2197 Py_ssize_t numbersize = 0;
2198 char *numberresults = NULL;
2199 char *numberresult = NULL;
2200 Py_ssize_t i;
2201 int kind;
2202 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002203
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002204 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002205 /* step 1: count the number of %S/%R/%A/%s format specifications
2206 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
2207 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002208 * result in an array)
Georg Brandl7597add2011-10-05 16:36:47 +02002209 * also estimate a upper bound for all the number formats in the string,
2210 * numbers will be formatted in step 3 and be kept in a '\0'-separated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002211 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00002212 for (f = format; *f; f++) {
2213 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002214 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002215 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
2216 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
2217 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
2218 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002219
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002220 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002221#ifdef HAVE_LONG_LONG
2222 if (longlongflag) {
2223 if (width < MAX_LONG_LONG_CHARS)
2224 width = MAX_LONG_LONG_CHARS;
2225 }
2226 else
2227#endif
2228 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
2229 including sign. Decimal takes the most space. This
2230 isn't enough for octal. If a width is specified we
2231 need more (which we allocate later). */
2232 if (width < MAX_LONG_CHARS)
2233 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002234
2235 /* account for the size + '\0' to separate numbers
2236 inside of the numberresults buffer */
2237 numbersize += (width + 1);
2238 }
2239 }
2240 else if ((unsigned char)*f > 127) {
2241 PyErr_Format(PyExc_ValueError,
2242 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2243 "string, got a non-ASCII byte: 0x%02x",
2244 (unsigned char)*f);
2245 return NULL;
2246 }
2247 }
2248 /* step 2: allocate memory for the results of
2249 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
2250 if (callcount) {
2251 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
2252 if (!callresults) {
2253 PyErr_NoMemory();
2254 return NULL;
2255 }
2256 callresult = callresults;
2257 }
2258 /* step 2.5: allocate memory for the results of formating numbers */
2259 if (numbersize) {
2260 numberresults = PyObject_Malloc(numbersize);
2261 if (!numberresults) {
2262 PyErr_NoMemory();
2263 goto fail;
2264 }
2265 numberresult = numberresults;
2266 }
2267
2268 /* step 3: format numbers and figure out how large a buffer we need */
2269 for (f = format; *f; f++) {
2270 if (*f == '%') {
2271 const char* p;
2272 int longflag;
2273 int longlongflag;
2274 int size_tflag;
2275 int numprinted;
2276
2277 p = f;
2278 zeropad = (f[1] == '0');
2279 f = parse_format_flags(f, &width, &precision,
2280 &longflag, &longlongflag, &size_tflag);
2281 switch (*f) {
2282 case 'c':
2283 {
2284 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002285 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002286 n++;
2287 break;
2288 }
2289 case '%':
2290 n++;
2291 break;
2292 case 'i':
2293 case 'd':
2294 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2295 width, precision, *f);
2296 if (longflag)
2297 numprinted = sprintf(numberresult, fmt,
2298 va_arg(count, long));
2299#ifdef HAVE_LONG_LONG
2300 else if (longlongflag)
2301 numprinted = sprintf(numberresult, fmt,
2302 va_arg(count, PY_LONG_LONG));
2303#endif
2304 else if (size_tflag)
2305 numprinted = sprintf(numberresult, fmt,
2306 va_arg(count, Py_ssize_t));
2307 else
2308 numprinted = sprintf(numberresult, fmt,
2309 va_arg(count, int));
2310 n += numprinted;
2311 /* advance by +1 to skip over the '\0' */
2312 numberresult += (numprinted + 1);
2313 assert(*(numberresult - 1) == '\0');
2314 assert(*(numberresult - 2) != '\0');
2315 assert(numprinted >= 0);
2316 assert(numberresult <= numberresults + numbersize);
2317 break;
2318 case 'u':
2319 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2320 width, precision, 'u');
2321 if (longflag)
2322 numprinted = sprintf(numberresult, fmt,
2323 va_arg(count, unsigned long));
2324#ifdef HAVE_LONG_LONG
2325 else if (longlongflag)
2326 numprinted = sprintf(numberresult, fmt,
2327 va_arg(count, unsigned PY_LONG_LONG));
2328#endif
2329 else if (size_tflag)
2330 numprinted = sprintf(numberresult, fmt,
2331 va_arg(count, size_t));
2332 else
2333 numprinted = sprintf(numberresult, fmt,
2334 va_arg(count, unsigned int));
2335 n += numprinted;
2336 numberresult += (numprinted + 1);
2337 assert(*(numberresult - 1) == '\0');
2338 assert(*(numberresult - 2) != '\0');
2339 assert(numprinted >= 0);
2340 assert(numberresult <= numberresults + numbersize);
2341 break;
2342 case 'x':
2343 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2344 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2345 n += numprinted;
2346 numberresult += (numprinted + 1);
2347 assert(*(numberresult - 1) == '\0');
2348 assert(*(numberresult - 2) != '\0');
2349 assert(numprinted >= 0);
2350 assert(numberresult <= numberresults + numbersize);
2351 break;
2352 case 'p':
2353 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2354 /* %p is ill-defined: ensure leading 0x. */
2355 if (numberresult[1] == 'X')
2356 numberresult[1] = 'x';
2357 else if (numberresult[1] != 'x') {
2358 memmove(numberresult + 2, numberresult,
2359 strlen(numberresult) + 1);
2360 numberresult[0] = '0';
2361 numberresult[1] = 'x';
2362 numprinted += 2;
2363 }
2364 n += numprinted;
2365 numberresult += (numprinted + 1);
2366 assert(*(numberresult - 1) == '\0');
2367 assert(*(numberresult - 2) != '\0');
2368 assert(numprinted >= 0);
2369 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002370 break;
2371 case 's':
2372 {
2373 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002374 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002375 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
2376 if (!str)
2377 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002378 /* since PyUnicode_DecodeUTF8 returns already flexible
2379 unicode objects, there is no need to call ready on them */
2380 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002381 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002382 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002383 /* Remember the str and switch to the next slot */
2384 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002385 break;
2386 }
2387 case 'U':
2388 {
2389 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002390 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002391 if (PyUnicode_READY(obj) == -1)
2392 goto fail;
2393 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002394 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002395 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002396 break;
2397 }
2398 case 'V':
2399 {
2400 PyObject *obj = va_arg(count, PyObject *);
2401 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002402 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002403 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002404 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002405 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002406 if (PyUnicode_READY(obj) == -1)
2407 goto fail;
2408 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002409 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002410 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002411 *callresult++ = NULL;
2412 }
2413 else {
2414 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
2415 if (!str_obj)
2416 goto fail;
Victor Stinnere1335c72011-10-04 20:53:03 +02002417 if (PyUnicode_READY(str_obj)) {
2418 Py_DECREF(str_obj);
2419 goto fail;
2420 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002421 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002422 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002423 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002424 *callresult++ = str_obj;
2425 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002426 break;
2427 }
2428 case 'S':
2429 {
2430 PyObject *obj = va_arg(count, PyObject *);
2431 PyObject *str;
2432 assert(obj);
2433 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002434 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002435 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002436 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002437 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002438 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002439 /* Remember the str and switch to the next slot */
2440 *callresult++ = str;
2441 break;
2442 }
2443 case 'R':
2444 {
2445 PyObject *obj = va_arg(count, PyObject *);
2446 PyObject *repr;
2447 assert(obj);
2448 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002449 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002450 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002451 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002452 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002453 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002454 /* Remember the repr and switch to the next slot */
2455 *callresult++ = repr;
2456 break;
2457 }
2458 case 'A':
2459 {
2460 PyObject *obj = va_arg(count, PyObject *);
2461 PyObject *ascii;
2462 assert(obj);
2463 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002464 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002465 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002466 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002467 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002468 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002469 /* Remember the repr and switch to the next slot */
2470 *callresult++ = ascii;
2471 break;
2472 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002473 default:
2474 /* if we stumble upon an unknown
2475 formatting code, copy the rest of
2476 the format string to the output
2477 string. (we cannot just skip the
2478 code, since there's no way to know
2479 what's in the argument list) */
2480 n += strlen(p);
2481 goto expand;
2482 }
2483 } else
2484 n++;
2485 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002486 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002487 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002488 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002489 we don't have to resize the string.
2490 There can be no errors beyond this point. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002491 string = PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002492 if (!string)
2493 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002494 kind = PyUnicode_KIND(string);
2495 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002496 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002497 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002498
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002499 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002500 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002501 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002502
2503 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002504 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2505 /* checking for == because the last argument could be a empty
2506 string, which causes i to point to end, the assert at the end of
2507 the loop */
2508 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002509
Benjamin Peterson14339b62009-01-31 16:36:08 +00002510 switch (*f) {
2511 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002512 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002513 const int ordinal = va_arg(vargs, int);
2514 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002515 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002516 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002517 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002518 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002519 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002520 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002521 case 'p':
2522 /* unused, since we already have the result */
2523 if (*f == 'p')
2524 (void) va_arg(vargs, void *);
2525 else
2526 (void) va_arg(vargs, int);
2527 /* extract the result from numberresults and append. */
2528 for (; *numberresult; ++i, ++numberresult)
2529 PyUnicode_WRITE(kind, data, i, *numberresult);
2530 /* skip over the separating '\0' */
2531 assert(*numberresult == '\0');
2532 numberresult++;
2533 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002534 break;
2535 case 's':
2536 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002537 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002538 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002539 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002540 size = PyUnicode_GET_LENGTH(*callresult);
2541 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002542 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002543 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002544 /* We're done with the unicode()/repr() => forget it */
2545 Py_DECREF(*callresult);
2546 /* switch to next unicode()/repr() result */
2547 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002548 break;
2549 }
2550 case 'U':
2551 {
2552 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002553 Py_ssize_t size;
2554 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2555 size = PyUnicode_GET_LENGTH(obj);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002556 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002557 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002558 break;
2559 }
2560 case 'V':
2561 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002562 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002563 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002564 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002565 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002566 size = PyUnicode_GET_LENGTH(obj);
2567 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002568 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002569 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002570 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002571 size = PyUnicode_GET_LENGTH(*callresult);
2572 assert(PyUnicode_KIND(*callresult) <=
2573 PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002574 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002575 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002576 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002577 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002578 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002579 break;
2580 }
2581 case 'S':
2582 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002583 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002584 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002585 Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002586 /* unused, since we already have the result */
2587 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002588 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002589 copy_characters(string, i, *callresult, 0, size);
2590 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002591 /* We're done with the unicode()/repr() => forget it */
2592 Py_DECREF(*callresult);
2593 /* switch to next unicode()/repr() result */
2594 ++callresult;
2595 break;
2596 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002597 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002598 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002599 break;
2600 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002601 for (; *p; ++p, ++i)
2602 PyUnicode_WRITE(kind, data, i, *p);
2603 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002604 goto end;
2605 }
Victor Stinner1205f272010-09-11 00:54:47 +00002606 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002607 else {
2608 assert(i < PyUnicode_GET_LENGTH(string));
2609 PyUnicode_WRITE(kind, data, i++, *f);
2610 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002611 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002612 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002613
Benjamin Peterson29060642009-01-31 22:14:21 +00002614 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002615 if (callresults)
2616 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002617 if (numberresults)
2618 PyObject_Free(numberresults);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002619 assert(_PyUnicode_CheckConsistency(string, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002620 return (PyObject *)string;
Benjamin Peterson29060642009-01-31 22:14:21 +00002621 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002622 if (callresults) {
2623 PyObject **callresult2 = callresults;
2624 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002625 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002626 ++callresult2;
2627 }
2628 PyObject_Free(callresults);
2629 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002630 if (numberresults)
2631 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002632 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002633}
2634
Walter Dörwaldd2034312007-05-18 16:29:38 +00002635PyObject *
2636PyUnicode_FromFormat(const char *format, ...)
2637{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002638 PyObject* ret;
2639 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002640
2641#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002642 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002643#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002644 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002645#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002646 ret = PyUnicode_FromFormatV(format, vargs);
2647 va_end(vargs);
2648 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002649}
2650
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002651#ifdef HAVE_WCHAR_H
2652
Victor Stinner5593d8a2010-10-02 11:11:27 +00002653/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2654 convert a Unicode object to a wide character string.
2655
Victor Stinnerd88d9832011-09-06 02:00:05 +02002656 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002657 character) required to convert the unicode object. Ignore size argument.
2658
Victor Stinnerd88d9832011-09-06 02:00:05 +02002659 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002660 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002661 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002662static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00002663unicode_aswidechar(PyUnicodeObject *unicode,
2664 wchar_t *w,
2665 Py_ssize_t size)
2666{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002667 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002668 const wchar_t *wstr;
2669
2670 wstr = PyUnicode_AsUnicodeAndSize((PyObject *)unicode, &res);
2671 if (wstr == NULL)
2672 return -1;
2673
Victor Stinner5593d8a2010-10-02 11:11:27 +00002674 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002675 if (size > res)
2676 size = res + 1;
2677 else
2678 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002679 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002680 return res;
2681 }
2682 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002683 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002684}
2685
2686Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002687PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002688 wchar_t *w,
2689 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002690{
2691 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002692 PyErr_BadInternalCall();
2693 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002694 }
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002695 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002696}
2697
Victor Stinner137c34c2010-09-29 10:25:54 +00002698wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002699PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002700 Py_ssize_t *size)
2701{
2702 wchar_t* buffer;
2703 Py_ssize_t buflen;
2704
2705 if (unicode == NULL) {
2706 PyErr_BadInternalCall();
2707 return NULL;
2708 }
2709
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002710 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002711 if (buflen == -1)
2712 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002713 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002714 PyErr_NoMemory();
2715 return NULL;
2716 }
2717
Victor Stinner137c34c2010-09-29 10:25:54 +00002718 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2719 if (buffer == NULL) {
2720 PyErr_NoMemory();
2721 return NULL;
2722 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002723 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002724 if (buflen == -1)
2725 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002726 if (size != NULL)
2727 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002728 return buffer;
2729}
2730
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002731#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002732
Alexander Belopolsky40018472011-02-26 01:02:56 +00002733PyObject *
2734PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002735{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002736 PyObject *v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002737 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002738 PyErr_SetString(PyExc_ValueError,
2739 "chr() arg not in range(0x110000)");
2740 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002741 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002742
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002743 if (ordinal < 256)
2744 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002745
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002746 v = PyUnicode_New(1, ordinal);
2747 if (v == NULL)
2748 return NULL;
2749 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002750 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002751 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002752}
2753
Alexander Belopolsky40018472011-02-26 01:02:56 +00002754PyObject *
2755PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002756{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002757 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002758 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002759 if (PyUnicode_CheckExact(obj)) {
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002760 if (PyUnicode_READY(obj))
2761 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002762 Py_INCREF(obj);
2763 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002764 }
2765 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002766 /* For a Unicode subtype that's not a Unicode object,
2767 return a true Unicode object with the same data. */
Victor Stinner2219e0a2011-10-01 01:16:59 +02002768 return PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002769 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002770 PyErr_Format(PyExc_TypeError,
2771 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002772 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002773 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002774}
2775
Alexander Belopolsky40018472011-02-26 01:02:56 +00002776PyObject *
2777PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002778 const char *encoding,
2779 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002780{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002781 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002782 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002783
Guido van Rossumd57fd912000-03-10 22:53:23 +00002784 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002785 PyErr_BadInternalCall();
2786 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002787 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002788
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002789 /* Decoding bytes objects is the most common case and should be fast */
2790 if (PyBytes_Check(obj)) {
2791 if (PyBytes_GET_SIZE(obj) == 0) {
2792 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002793 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002794 }
2795 else {
2796 v = PyUnicode_Decode(
2797 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2798 encoding, errors);
2799 }
2800 return v;
2801 }
2802
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002803 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002804 PyErr_SetString(PyExc_TypeError,
2805 "decoding str is not supported");
2806 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002807 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002808
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002809 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2810 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2811 PyErr_Format(PyExc_TypeError,
2812 "coercing to str: need bytes, bytearray "
2813 "or buffer-like object, %.80s found",
2814 Py_TYPE(obj)->tp_name);
2815 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002816 }
Tim Petersced69f82003-09-16 20:30:58 +00002817
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002818 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002819 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002820 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002821 }
Tim Petersced69f82003-09-16 20:30:58 +00002822 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002823 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002824
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002825 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002826 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002827}
2828
/* Copy `encoding` into `lower`, mapping upper case characters (as reported
   by Py_ISUPPER) through Py_TOLOWER and replacing '_' with '-', so that
   spellings such as UTF_8 can be recognised.

   lower_len is the capacity of `lower` in bytes, terminating '\0' included.
   Returns 1 on success (lower is then null-terminated), 0 if the encoding
   name is longer than lower_len-1 characters. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src;
    char *dst = lower;
    /* last usable slot: reserved for the terminating '\0' */
    char *limit = &lower[lower_len - 1];

    for (src = encoding; *src; src++) {
        char c = *src;

        if (dst == limit)
            return 0;
        if (Py_ISUPPER(c))
            c = Py_TOLOWER(c);
        else if (c == '_')
            c = '-';
        *dst++ = c;
    }
    *dst = '\0';
    return 1;
}
2861
Alexander Belopolsky40018472011-02-26 01:02:56 +00002862PyObject *
2863PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002864 Py_ssize_t size,
2865 const char *encoding,
2866 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002867{
2868 PyObject *buffer = NULL, *unicode;
2869 Py_buffer info;
2870 char lower[11]; /* Enough for any encoding shortcut */
2871
2872 if (encoding == NULL)
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002873 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00002874
2875 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002876 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002877 if ((strcmp(lower, "utf-8") == 0) ||
2878 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002879 return PyUnicode_DecodeUTF8(s, size, errors);
2880 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002881 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002882 (strcmp(lower, "iso-8859-1") == 0))
2883 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002884#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002885 else if (strcmp(lower, "mbcs") == 0)
2886 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002887#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002888 else if (strcmp(lower, "ascii") == 0)
2889 return PyUnicode_DecodeASCII(s, size, errors);
2890 else if (strcmp(lower, "utf-16") == 0)
2891 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2892 else if (strcmp(lower, "utf-32") == 0)
2893 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2894 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002895
2896 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002897 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002898 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002899 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002900 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002901 if (buffer == NULL)
2902 goto onError;
2903 unicode = PyCodec_Decode(buffer, encoding, errors);
2904 if (unicode == NULL)
2905 goto onError;
2906 if (!PyUnicode_Check(unicode)) {
2907 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002908 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002909 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002910 Py_DECREF(unicode);
2911 goto onError;
2912 }
2913 Py_DECREF(buffer);
Victor Stinner17efeed2011-10-04 20:05:46 +02002914#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02002915 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002916 Py_DECREF(unicode);
2917 return NULL;
2918 }
Victor Stinner17efeed2011-10-04 20:05:46 +02002919#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002920 assert(_PyUnicode_CheckConsistency(unicode, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002921 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002922
Benjamin Peterson29060642009-01-31 22:14:21 +00002923 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002924 Py_XDECREF(buffer);
2925 return NULL;
2926}
2927
Alexander Belopolsky40018472011-02-26 01:02:56 +00002928PyObject *
2929PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002930 const char *encoding,
2931 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002932{
2933 PyObject *v;
2934
2935 if (!PyUnicode_Check(unicode)) {
2936 PyErr_BadArgument();
2937 goto onError;
2938 }
2939
2940 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002941 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002942
2943 /* Decode via the codec registry */
2944 v = PyCodec_Decode(unicode, encoding, errors);
2945 if (v == NULL)
2946 goto onError;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002947 assert(_PyUnicode_CheckConsistency(v, 1));
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002948 return v;
2949
Benjamin Peterson29060642009-01-31 22:14:21 +00002950 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002951 return NULL;
2952}
2953
Alexander Belopolsky40018472011-02-26 01:02:56 +00002954PyObject *
2955PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002956 const char *encoding,
2957 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002958{
2959 PyObject *v;
2960
2961 if (!PyUnicode_Check(unicode)) {
2962 PyErr_BadArgument();
2963 goto onError;
2964 }
2965
2966 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002967 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002968
2969 /* Decode via the codec registry */
2970 v = PyCodec_Decode(unicode, encoding, errors);
2971 if (v == NULL)
2972 goto onError;
2973 if (!PyUnicode_Check(v)) {
2974 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002975 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002976 Py_TYPE(v)->tp_name);
2977 Py_DECREF(v);
2978 goto onError;
2979 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002980 assert(_PyUnicode_CheckConsistency(v, 1));
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002981 return v;
2982
Benjamin Peterson29060642009-01-31 22:14:21 +00002983 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002984 return NULL;
2985}
2986
Alexander Belopolsky40018472011-02-26 01:02:56 +00002987PyObject *
2988PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002989 Py_ssize_t size,
2990 const char *encoding,
2991 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002992{
2993 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002994
Guido van Rossumd57fd912000-03-10 22:53:23 +00002995 unicode = PyUnicode_FromUnicode(s, size);
2996 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002997 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002998 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
2999 Py_DECREF(unicode);
3000 return v;
3001}
3002
Alexander Belopolsky40018472011-02-26 01:02:56 +00003003PyObject *
3004PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003005 const char *encoding,
3006 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003007{
3008 PyObject *v;
3009
3010 if (!PyUnicode_Check(unicode)) {
3011 PyErr_BadArgument();
3012 goto onError;
3013 }
3014
3015 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003016 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003017
3018 /* Encode via the codec registry */
3019 v = PyCodec_Encode(unicode, encoding, errors);
3020 if (v == NULL)
3021 goto onError;
3022 return v;
3023
Benjamin Peterson29060642009-01-31 22:14:21 +00003024 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003025 return NULL;
3026}
3027
Victor Stinnerad158722010-10-27 00:25:46 +00003028PyObject *
3029PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003030{
Victor Stinner99b95382011-07-04 14:23:54 +02003031#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003032 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3033 PyUnicode_GET_SIZE(unicode),
3034 NULL);
3035#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003036 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003037#else
Victor Stinner793b5312011-04-27 00:24:21 +02003038 PyInterpreterState *interp = PyThreadState_GET()->interp;
3039 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3040 cannot use it to encode and decode filenames before it is loaded. Load
3041 the Python codec requires to encode at least its own filename. Use the C
3042 version of the locale codec until the codec registry is initialized and
3043 the Python codec is loaded.
3044
3045 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3046 cannot only rely on it: check also interp->fscodec_initialized for
3047 subinterpreters. */
3048 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003049 return PyUnicode_AsEncodedString(unicode,
3050 Py_FileSystemDefaultEncoding,
3051 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003052 }
3053 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003054 /* locale encoding with surrogateescape */
3055 wchar_t *wchar;
3056 char *bytes;
3057 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00003058 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003059
3060 wchar = PyUnicode_AsWideCharString(unicode, NULL);
3061 if (wchar == NULL)
3062 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00003063 bytes = _Py_wchar2char(wchar, &error_pos);
3064 if (bytes == NULL) {
3065 if (error_pos != (size_t)-1) {
3066 char *errmsg = strerror(errno);
3067 PyObject *exc = NULL;
3068 if (errmsg == NULL)
3069 errmsg = "Py_wchar2char() failed";
3070 raise_encode_exception(&exc,
3071 "filesystemencoding",
3072 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
3073 error_pos, error_pos+1,
3074 errmsg);
3075 Py_XDECREF(exc);
3076 }
3077 else
3078 PyErr_NoMemory();
3079 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003080 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00003081 }
3082 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003083
3084 bytes_obj = PyBytes_FromString(bytes);
3085 PyMem_Free(bytes);
3086 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00003087 }
Victor Stinnerad158722010-10-27 00:25:46 +00003088#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003089}
3090
Alexander Belopolsky40018472011-02-26 01:02:56 +00003091PyObject *
3092PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003093 const char *encoding,
3094 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003095{
3096 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003097 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003098
Guido van Rossumd57fd912000-03-10 22:53:23 +00003099 if (!PyUnicode_Check(unicode)) {
3100 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003101 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003102 }
Fred Drakee4315f52000-05-09 19:53:39 +00003103
Victor Stinner2f283c22011-03-02 01:21:46 +00003104 if (encoding == NULL) {
3105 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003106 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003107 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003108 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner2f283c22011-03-02 01:21:46 +00003109 }
Fred Drakee4315f52000-05-09 19:53:39 +00003110
3111 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00003112 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003113 if ((strcmp(lower, "utf-8") == 0) ||
3114 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003115 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003116 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003117 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003118 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003119 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003120 }
Victor Stinner37296e82010-06-10 13:36:23 +00003121 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003122 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003123 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003124 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003125#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00003126 else if (strcmp(lower, "mbcs") == 0)
3127 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3128 PyUnicode_GET_SIZE(unicode),
3129 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003130#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003131 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003132 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003133 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003134
3135 /* Encode via the codec registry */
3136 v = PyCodec_Encode(unicode, encoding, errors);
3137 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003138 return NULL;
3139
3140 /* The normal path */
3141 if (PyBytes_Check(v))
3142 return v;
3143
3144 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003145 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003146 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003147 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003148
3149 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3150 "encoder %s returned bytearray instead of bytes",
3151 encoding);
3152 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003153 Py_DECREF(v);
3154 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003155 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003156
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003157 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3158 Py_DECREF(v);
3159 return b;
3160 }
3161
3162 PyErr_Format(PyExc_TypeError,
3163 "encoder did not return a bytes object (type=%.400s)",
3164 Py_TYPE(v)->tp_name);
3165 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003166 return NULL;
3167}
3168
Alexander Belopolsky40018472011-02-26 01:02:56 +00003169PyObject *
3170PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003171 const char *encoding,
3172 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003173{
3174 PyObject *v;
3175
3176 if (!PyUnicode_Check(unicode)) {
3177 PyErr_BadArgument();
3178 goto onError;
3179 }
3180
3181 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003182 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003183
3184 /* Encode via the codec registry */
3185 v = PyCodec_Encode(unicode, encoding, errors);
3186 if (v == NULL)
3187 goto onError;
3188 if (!PyUnicode_Check(v)) {
3189 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003190 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003191 Py_TYPE(v)->tp_name);
3192 Py_DECREF(v);
3193 goto onError;
3194 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003195 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003196
Benjamin Peterson29060642009-01-31 22:14:21 +00003197 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003198 return NULL;
3199}
3200
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003201PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003202PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003203 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003204 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3205}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003206
Christian Heimes5894ba72007-11-04 11:43:14 +00003207PyObject*
3208PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3209{
Victor Stinner99b95382011-07-04 14:23:54 +02003210#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003211 return PyUnicode_DecodeMBCS(s, size, NULL);
3212#elif defined(__APPLE__)
3213 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
3214#else
Victor Stinner793b5312011-04-27 00:24:21 +02003215 PyInterpreterState *interp = PyThreadState_GET()->interp;
3216 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3217 cannot use it to encode and decode filenames before it is loaded. Load
3218 the Python codec requires to encode at least its own filename. Use the C
3219 version of the locale codec until the codec registry is initialized and
3220 the Python codec is loaded.
3221
3222 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3223 cannot only rely on it: check also interp->fscodec_initialized for
3224 subinterpreters. */
3225 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003226 return PyUnicode_Decode(s, size,
3227 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003228 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003229 }
3230 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003231 /* locale encoding with surrogateescape */
3232 wchar_t *wchar;
3233 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00003234 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003235
3236 if (s[size] != '\0' || size != strlen(s)) {
3237 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3238 return NULL;
3239 }
3240
Victor Stinner168e1172010-10-16 23:16:16 +00003241 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003242 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00003243 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003244
Victor Stinner168e1172010-10-16 23:16:16 +00003245 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003246 PyMem_Free(wchar);
3247 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003248 }
Victor Stinnerad158722010-10-27 00:25:46 +00003249#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003250}
3251
Martin v. Löwis011e8422009-05-05 04:43:17 +00003252
3253int
3254PyUnicode_FSConverter(PyObject* arg, void* addr)
3255{
3256 PyObject *output = NULL;
3257 Py_ssize_t size;
3258 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003259 if (arg == NULL) {
3260 Py_DECREF(*(PyObject**)addr);
3261 return 1;
3262 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003263 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003264 output = arg;
3265 Py_INCREF(output);
3266 }
3267 else {
3268 arg = PyUnicode_FromObject(arg);
3269 if (!arg)
3270 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003271 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003272 Py_DECREF(arg);
3273 if (!output)
3274 return 0;
3275 if (!PyBytes_Check(output)) {
3276 Py_DECREF(output);
3277 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3278 return 0;
3279 }
3280 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003281 size = PyBytes_GET_SIZE(output);
3282 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003283 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003284 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003285 Py_DECREF(output);
3286 return 0;
3287 }
3288 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003289 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003290}
3291
3292
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003293int
3294PyUnicode_FSDecoder(PyObject* arg, void* addr)
3295{
3296 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003297 if (arg == NULL) {
3298 Py_DECREF(*(PyObject**)addr);
3299 return 1;
3300 }
3301 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003302 if (PyUnicode_READY(arg))
3303 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003304 output = arg;
3305 Py_INCREF(output);
3306 }
3307 else {
3308 arg = PyBytes_FromObject(arg);
3309 if (!arg)
3310 return 0;
3311 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3312 PyBytes_GET_SIZE(arg));
3313 Py_DECREF(arg);
3314 if (!output)
3315 return 0;
3316 if (!PyUnicode_Check(output)) {
3317 Py_DECREF(output);
3318 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3319 return 0;
3320 }
3321 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003322 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003323 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003324 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3325 Py_DECREF(output);
3326 return 0;
3327 }
3328 *(PyObject**)addr = output;
3329 return Py_CLEANUP_SUPPORTED;
3330}
3331
3332
Martin v. Löwis5b222132007-06-10 09:51:05 +00003333char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003334PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003335{
Christian Heimesf3863112007-11-22 07:46:41 +00003336 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003337 PyUnicodeObject *u = (PyUnicodeObject *)unicode;
3338
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003339 if (!PyUnicode_Check(unicode)) {
3340 PyErr_BadArgument();
3341 return NULL;
3342 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003343 if (PyUnicode_READY(u) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003344 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003345
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003346 if (PyUnicode_UTF8(unicode) == NULL) {
3347 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003348 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3349 if (bytes == NULL)
3350 return NULL;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003351 _PyUnicode_UTF8(u) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3352 if (_PyUnicode_UTF8(u) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003353 Py_DECREF(bytes);
3354 return NULL;
3355 }
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003356 _PyUnicode_UTF8_LENGTH(u) = PyBytes_GET_SIZE(bytes);
3357 Py_MEMCPY(_PyUnicode_UTF8(u), PyBytes_AS_STRING(bytes), _PyUnicode_UTF8_LENGTH(u) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003358 Py_DECREF(bytes);
3359 }
3360
3361 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003362 *psize = PyUnicode_UTF8_LENGTH(unicode);
3363 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003364}
3365
3366char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003367PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003368{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003369 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3370}
3371
#ifdef Py_DEBUG
/* Debug-build counter, incremented by PyUnicode_AsUnicodeAndSize() each time
   it has to build the wchar_t representation of a string. */
static int unicode_as_unicode_calls = 0;
#endif
3375
3376
3377Py_UNICODE *
3378PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3379{
3380 PyUnicodeObject *u;
3381 const unsigned char *one_byte;
3382#if SIZEOF_WCHAR_T == 4
3383 const Py_UCS2 *two_bytes;
3384#else
3385 const Py_UCS4 *four_bytes;
3386 const Py_UCS4 *ucs4_end;
3387 Py_ssize_t num_surrogates;
3388#endif
3389 wchar_t *w;
3390 wchar_t *wchar_end;
3391
3392 if (!PyUnicode_Check(unicode)) {
3393 PyErr_BadArgument();
3394 return NULL;
3395 }
3396 u = (PyUnicodeObject*)unicode;
3397 if (_PyUnicode_WSTR(u) == NULL) {
3398 /* Non-ASCII compact unicode object */
3399 assert(_PyUnicode_KIND(u) != 0);
3400 assert(PyUnicode_IS_READY(u));
3401
3402#ifdef Py_DEBUG
3403 ++unicode_as_unicode_calls;
3404#endif
3405
3406 if (PyUnicode_KIND(u) == PyUnicode_4BYTE_KIND) {
3407#if SIZEOF_WCHAR_T == 2
3408 four_bytes = PyUnicode_4BYTE_DATA(u);
3409 ucs4_end = four_bytes + _PyUnicode_LENGTH(u);
3410 num_surrogates = 0;
3411
3412 for (; four_bytes < ucs4_end; ++four_bytes) {
3413 if (*four_bytes > 0xFFFF)
3414 ++num_surrogates;
3415 }
3416
3417 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(
3418 sizeof(wchar_t) * (_PyUnicode_LENGTH(u) + 1 + num_surrogates));
3419 if (!_PyUnicode_WSTR(u)) {
3420 PyErr_NoMemory();
3421 return NULL;
3422 }
3423 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u) + num_surrogates;
3424
3425 w = _PyUnicode_WSTR(u);
3426 wchar_end = w + _PyUnicode_WSTR_LENGTH(u);
3427 four_bytes = PyUnicode_4BYTE_DATA(u);
3428 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3429 if (*four_bytes > 0xFFFF) {
3430 /* encode surrogate pair in this case */
3431 *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10);
3432 *w = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF);
3433 }
3434 else
3435 *w = *four_bytes;
3436
3437 if (w > wchar_end) {
3438 assert(0 && "Miscalculated string end");
3439 }
3440 }
3441 *w = 0;
3442#else
3443 /* sizeof(wchar_t) == 4 */
3444 Py_FatalError("Impossible unicode object state, wstr and str "
3445 "should share memory already.");
3446 return NULL;
3447#endif
3448 }
3449 else {
3450 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3451 (_PyUnicode_LENGTH(u) + 1));
3452 if (!_PyUnicode_WSTR(u)) {
3453 PyErr_NoMemory();
3454 return NULL;
3455 }
3456 if (!PyUnicode_IS_COMPACT_ASCII(u))
3457 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u);
3458 w = _PyUnicode_WSTR(u);
3459 wchar_end = w + _PyUnicode_LENGTH(u);
3460
3461 if (PyUnicode_KIND(u) == PyUnicode_1BYTE_KIND) {
3462 one_byte = PyUnicode_1BYTE_DATA(u);
3463 for (; w < wchar_end; ++one_byte, ++w)
3464 *w = *one_byte;
3465 /* null-terminate the wstr */
3466 *w = 0;
3467 }
3468 else if (PyUnicode_KIND(u) == PyUnicode_2BYTE_KIND) {
3469#if SIZEOF_WCHAR_T == 4
3470 two_bytes = PyUnicode_2BYTE_DATA(u);
3471 for (; w < wchar_end; ++two_bytes, ++w)
3472 *w = *two_bytes;
3473 /* null-terminate the wstr */
3474 *w = 0;
3475#else
3476 /* sizeof(wchar_t) == 2 */
3477 PyObject_FREE(_PyUnicode_WSTR(u));
3478 _PyUnicode_WSTR(u) = NULL;
3479 Py_FatalError("Impossible unicode object state, wstr "
3480 "and str should share memory already.");
3481 return NULL;
3482#endif
3483 }
3484 else {
3485 assert(0 && "This should never happen.");
3486 }
3487 }
3488 }
3489 if (size != NULL)
3490 *size = PyUnicode_WSTR_LENGTH(u);
3491 return _PyUnicode_WSTR(u);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003492}
3493
Alexander Belopolsky40018472011-02-26 01:02:56 +00003494Py_UNICODE *
3495PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003496{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003497 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003498}
3499
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003500
Alexander Belopolsky40018472011-02-26 01:02:56 +00003501Py_ssize_t
3502PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003503{
3504 if (!PyUnicode_Check(unicode)) {
3505 PyErr_BadArgument();
3506 goto onError;
3507 }
3508 return PyUnicode_GET_SIZE(unicode);
3509
Benjamin Peterson29060642009-01-31 22:14:21 +00003510 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003511 return -1;
3512}
3513
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003514Py_ssize_t
3515PyUnicode_GetLength(PyObject *unicode)
3516{
Victor Stinner5a706cf2011-10-02 00:36:53 +02003517 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003518 PyErr_BadArgument();
3519 return -1;
3520 }
3521
3522 return PyUnicode_GET_LENGTH(unicode);
3523}
3524
3525Py_UCS4
3526PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3527{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003528 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3529 PyErr_BadArgument();
3530 return (Py_UCS4)-1;
3531 }
3532 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3533 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003534 return (Py_UCS4)-1;
3535 }
3536 return PyUnicode_READ_CHAR(unicode, index);
3537}
3538
3539int
3540PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3541{
3542 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003543 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003544 return -1;
3545 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02003546 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3547 PyErr_SetString(PyExc_IndexError, "string index out of range");
3548 return -1;
3549 }
3550 if (_PyUnicode_Dirty(unicode))
3551 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003552 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3553 index, ch);
3554 return 0;
3555}
3556
/* Return the name of the default encoding; it is the constant "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3562
Victor Stinner554f3f02010-06-16 23:33:54 +00003563/* create or adjust a UnicodeDecodeError */
3564static void
3565make_decode_exception(PyObject **exceptionObject,
3566 const char *encoding,
3567 const char *input, Py_ssize_t length,
3568 Py_ssize_t startpos, Py_ssize_t endpos,
3569 const char *reason)
3570{
3571 if (*exceptionObject == NULL) {
3572 *exceptionObject = PyUnicodeDecodeError_Create(
3573 encoding, input, length, startpos, endpos, reason);
3574 }
3575 else {
3576 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3577 goto onError;
3578 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3579 goto onError;
3580 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3581 goto onError;
3582 }
3583 return;
3584
3585onError:
3586 Py_DECREF(*exceptionObject);
3587 *exceptionObject = NULL;
3588}
3589
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003590/* error handling callback helper:
3591 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00003592 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003593 and adjust various state variables.
3594 return 0 on success, -1 on error
3595*/
3596
Alexander Belopolsky40018472011-02-26 01:02:56 +00003597static int
3598unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003599 const char *encoding, const char *reason,
3600 const char **input, const char **inend, Py_ssize_t *startinpos,
3601 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
3602 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003603{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003604 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003605
3606 PyObject *restuple = NULL;
3607 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003608 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003609 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003610 Py_ssize_t requiredsize;
3611 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003612 const Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003613 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003614 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003615 int res = -1;
3616
3617 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003618 *errorHandler = PyCodec_LookupError(errors);
3619 if (*errorHandler == NULL)
3620 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003621 }
3622
Victor Stinner554f3f02010-06-16 23:33:54 +00003623 make_decode_exception(exceptionObject,
3624 encoding,
3625 *input, *inend - *input,
3626 *startinpos, *endinpos,
3627 reason);
3628 if (*exceptionObject == NULL)
3629 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003630
3631 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3632 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003633 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003634 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00003635 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00003636 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003637 }
3638 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003639 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003640
3641 /* Copy back the bytes variables, which might have been modified by the
3642 callback */
3643 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3644 if (!inputobj)
3645 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00003646 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003647 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00003648 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003649 *input = PyBytes_AS_STRING(inputobj);
3650 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003651 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00003652 /* we can DECREF safely, as the exception has another reference,
3653 so the object won't go away. */
3654 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003655
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003656 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003657 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003658 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003659 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3660 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003661 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003662
3663 /* need more space? (at least enough for what we
3664 have+the replacement+the rest of the string (starting
3665 at the new input position), so we won't have to check space
3666 when there are no errors in the rest of the string) */
3667 repptr = PyUnicode_AS_UNICODE(repunicode);
3668 repsize = PyUnicode_GET_SIZE(repunicode);
3669 requiredsize = *outpos + repsize + insize-newpos;
3670 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003671 if (requiredsize<2*outsize)
3672 requiredsize = 2*outsize;
Victor Stinnerfe226c02011-10-03 03:52:20 +02003673 if (PyUnicode_Resize((PyObject**)output, requiredsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003674 goto onError;
3675 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003676 }
3677 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003678 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003679 Py_UNICODE_COPY(*outptr, repptr, repsize);
3680 *outptr += repsize;
3681 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003682
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003683 /* we made it! */
3684 res = 0;
3685
Benjamin Peterson29060642009-01-31 22:14:21 +00003686 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003687 Py_XDECREF(restuple);
3688 return res;
3689}
3690
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003691/* --- UTF-7 Codec -------------------------------------------------------- */
3692
Antoine Pitrou244651a2009-05-04 18:56:13 +00003693/* See RFC2152 for details. We encode conservatively and decode liberally. */
3694
/* Base-64 helper macros for the UTF-7 codec.  Each macro may evaluate
   its argument more than once, so only pass side-effect-free
   expressions. */

/* Does c belong to the 64-character base-64 alphabet? */

#define IS_BASE64(c)                            \
    ((c) == '+' || (c) == '/' ||                \
     ((c) >= '0' && (c) <= '9') ||              \
     ((c) >= 'a' && (c) <= 'z') ||              \
     ((c) >= 'A' && (c) <= 'Z'))

/* Base-64 value (0..63) of c.  Only meaningful when IS_BASE64(c) holds;
   any other input falls through to 63. */

#define FROM_BASE64(c)                                  \
    (((c) == '+') ? 62 :                                \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :           \
     63)

/* Base-64 character that encodes the low 6 bits of n. */

#define TO_BASE64(n)                            \
    (("ABCDEFGHIJKLMNOPQRSTUVWXYZ"              \
      "abcdefghijklmnopqrstuvwxyz"              \
      "0123456789+/")[(n) & 0x3f])

/* DECODE_DIRECT: true when a byte met outside a shift sequence decodes
 * as itself.  Decoding is permissive: every ASCII byte qualifies except
 * '+', which opens a base-64 section. */

#define DECODE_DIRECT(c) \
    ((c) != '+' && (c) < 128)
3725
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code (0..127) and is only ever read,
 * hence it is declared const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
3759
/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 *   c        - the character to classify; evaluated several times, so it
 *              must be free of side effects.
 *   directO  - non-zero if "Set O" characters (category 1) are written as-is.
 *   directWS - non-zero if whitespace (category 2) is written as-is.
 *
 * NUL and anything >= 128 are never encoded directly.  The flag arguments
 * are parenthesized so that compound expressions (e.g. `a || b`) keep
 * their meaning when combined with && inside the macro. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003771
Alexander Belopolsky40018472011-02-26 01:02:56 +00003772PyObject *
3773PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003774 Py_ssize_t size,
3775 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003776{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003777 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3778}
3779
Antoine Pitrou244651a2009-05-04 18:56:13 +00003780/* The decoder. The only state we preserve is our read position,
3781 * i.e. how many characters we have consumed. So if we end in the
3782 * middle of a shift sequence we have to back off the read position
3783 * and the output to the beginning of the sequence, otherwise we lose
3784 * all the shift state (seen bits, number of bits seen, high
3785 * surrogate). */
3786
Alexander Belopolsky40018472011-02-26 01:02:56 +00003787PyObject *
3788PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003789 Py_ssize_t size,
3790 const char *errors,
3791 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003792{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003793 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003794 Py_ssize_t startinpos;
3795 Py_ssize_t endinpos;
3796 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003797 const char *e;
3798 PyUnicodeObject *unicode;
3799 Py_UNICODE *p;
3800 const char *errmsg = "";
3801 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003802 Py_UNICODE *shiftOutStart;
3803 unsigned int base64bits = 0;
3804 unsigned long base64buffer = 0;
3805 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003806 PyObject *errorHandler = NULL;
3807 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003808
3809 unicode = _PyUnicode_New(size);
3810 if (!unicode)
3811 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003812 if (size == 0) {
3813 if (consumed)
3814 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003815 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003816 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003817
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003818 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitrou244651a2009-05-04 18:56:13 +00003819 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003820 e = s + size;
3821
3822 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003823 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00003824 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00003825 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003826
Antoine Pitrou244651a2009-05-04 18:56:13 +00003827 if (inShift) { /* in a base-64 section */
3828 if (IS_BASE64(ch)) { /* consume a base-64 character */
3829 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3830 base64bits += 6;
3831 s++;
3832 if (base64bits >= 16) {
3833 /* we have enough bits for a UTF-16 value */
3834 Py_UNICODE outCh = (Py_UNICODE)
3835 (base64buffer >> (base64bits-16));
3836 base64bits -= 16;
3837 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3838 if (surrogate) {
3839 /* expecting a second surrogate */
3840 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3841#ifdef Py_UNICODE_WIDE
3842 *p++ = (((surrogate & 0x3FF)<<10)
3843 | (outCh & 0x3FF)) + 0x10000;
3844#else
3845 *p++ = surrogate;
3846 *p++ = outCh;
3847#endif
3848 surrogate = 0;
3849 }
3850 else {
3851 surrogate = 0;
3852 errmsg = "second surrogate missing";
3853 goto utf7Error;
3854 }
3855 }
3856 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
3857 /* first surrogate */
3858 surrogate = outCh;
3859 }
3860 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3861 errmsg = "unexpected second surrogate";
3862 goto utf7Error;
3863 }
3864 else {
3865 *p++ = outCh;
3866 }
3867 }
3868 }
3869 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003870 inShift = 0;
3871 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003872 if (surrogate) {
3873 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00003874 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003875 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003876 if (base64bits > 0) { /* left-over bits */
3877 if (base64bits >= 6) {
3878 /* We've seen at least one base-64 character */
3879 errmsg = "partial character in shift sequence";
3880 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003881 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003882 else {
3883 /* Some bits remain; they should be zero */
3884 if (base64buffer != 0) {
3885 errmsg = "non-zero padding bits in shift sequence";
3886 goto utf7Error;
3887 }
3888 }
3889 }
3890 if (ch != '-') {
3891 /* '-' is absorbed; other terminating
3892 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003893 *p++ = ch;
3894 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003895 }
3896 }
3897 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003898 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003899 s++; /* consume '+' */
3900 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003901 s++;
3902 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00003903 }
3904 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003905 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003906 shiftOutStart = p;
3907 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003908 }
3909 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003910 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003911 *p++ = ch;
3912 s++;
3913 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003914 else {
3915 startinpos = s-starts;
3916 s++;
3917 errmsg = "unexpected special character";
3918 goto utf7Error;
3919 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003920 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003921utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003922 outpos = p-PyUnicode_AS_UNICODE(unicode);
3923 endinpos = s-starts;
3924 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003925 errors, &errorHandler,
3926 "utf7", errmsg,
3927 &starts, &e, &startinpos, &endinpos, &exc, &s,
3928 &unicode, &outpos, &p))
3929 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003930 }
3931
Antoine Pitrou244651a2009-05-04 18:56:13 +00003932 /* end of string */
3933
3934 if (inShift && !consumed) { /* in shift sequence, no more to follow */
3935 /* if we're in an inconsistent state, that's an error */
3936 if (surrogate ||
3937 (base64bits >= 6) ||
3938 (base64bits > 0 && base64buffer != 0)) {
3939 outpos = p-PyUnicode_AS_UNICODE(unicode);
3940 endinpos = size;
3941 if (unicode_decode_call_errorhandler(
3942 errors, &errorHandler,
3943 "utf7", "unterminated shift sequence",
3944 &starts, &e, &startinpos, &endinpos, &exc, &s,
3945 &unicode, &outpos, &p))
3946 goto onError;
3947 if (s < e)
3948 goto restart;
3949 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003950 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003951
3952 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003953 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003954 if (inShift) {
3955 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003956 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003957 }
3958 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003959 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003960 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003961 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003962
Victor Stinnerfe226c02011-10-03 03:52:20 +02003963 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003964 goto onError;
3965
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003966 Py_XDECREF(errorHandler);
3967 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02003968#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02003969 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003970 Py_DECREF(unicode);
3971 return NULL;
3972 }
Victor Stinner17efeed2011-10-04 20:05:46 +02003973#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02003974 assert(_PyUnicode_CheckConsistency(unicode, 1));
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003975 return (PyObject *)unicode;
3976
Benjamin Peterson29060642009-01-31 22:14:21 +00003977 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003978 Py_XDECREF(errorHandler);
3979 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003980 Py_DECREF(unicode);
3981 return NULL;
3982}
3983
3984
Alexander Belopolsky40018472011-02-26 01:02:56 +00003985PyObject *
3986PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003987 Py_ssize_t size,
3988 int base64SetO,
3989 int base64WhiteSpace,
3990 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003991{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003992 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003993 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003994 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003995 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003996 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003997 unsigned int base64bits = 0;
3998 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003999 char * out;
4000 char * start;
4001
4002 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004003 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004004
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00004005 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004006 return PyErr_NoMemory();
4007
Antoine Pitrou244651a2009-05-04 18:56:13 +00004008 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004009 if (v == NULL)
4010 return NULL;
4011
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004012 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004013 for (;i < size; ++i) {
4014 Py_UNICODE ch = s[i];
4015
Antoine Pitrou244651a2009-05-04 18:56:13 +00004016 if (inShift) {
4017 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4018 /* shifting out */
4019 if (base64bits) { /* output remaining bits */
4020 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4021 base64buffer = 0;
4022 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004023 }
4024 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004025 /* Characters not in the BASE64 set implicitly unshift the sequence
4026 so no '-' is required, except if the character is itself a '-' */
4027 if (IS_BASE64(ch) || ch == '-') {
4028 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004029 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004030 *out++ = (char) ch;
4031 }
4032 else {
4033 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004034 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004035 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004036 else { /* not in a shift sequence */
4037 if (ch == '+') {
4038 *out++ = '+';
4039 *out++ = '-';
4040 }
4041 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4042 *out++ = (char) ch;
4043 }
4044 else {
4045 *out++ = '+';
4046 inShift = 1;
4047 goto encode_char;
4048 }
4049 }
4050 continue;
4051encode_char:
4052#ifdef Py_UNICODE_WIDE
4053 if (ch >= 0x10000) {
4054 /* code first surrogate */
4055 base64bits += 16;
4056 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
4057 while (base64bits >= 6) {
4058 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4059 base64bits -= 6;
4060 }
4061 /* prepare second surrogate */
4062 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
4063 }
4064#endif
4065 base64bits += 16;
4066 base64buffer = (base64buffer << 16) | ch;
4067 while (base64bits >= 6) {
4068 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4069 base64bits -= 6;
4070 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004071 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004072 if (base64bits)
4073 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4074 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004075 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004076 if (_PyBytes_Resize(&v, out - start) < 0)
4077 return NULL;
4078 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004079}
4080
Antoine Pitrou244651a2009-05-04 18:56:13 +00004081#undef IS_BASE64
4082#undef FROM_BASE64
4083#undef TO_BASE64
4084#undef DECODE_DIRECT
4085#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004086
Guido van Rossumd57fd912000-03-10 22:53:23 +00004087/* --- UTF-8 Codec -------------------------------------------------------- */
4088
/* Map UTF-8 encoded prefix byte to sequence length.  Zero means
   illegal prefix.  See RFC 3629 for details.
   The table is only ever read, hence it is declared const. */
static
const char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
4110
Alexander Belopolsky40018472011-02-26 01:02:56 +00004111PyObject *
4112PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004113 Py_ssize_t size,
4114 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004115{
Walter Dörwald69652032004-09-07 20:24:22 +00004116 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4117}
4118
Antoine Pitrouab868312009-01-10 15:40:25 +00004119/* Mask to check or force alignment of a pointer to C 'long' boundaries */
4120#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
4121
4122/* Mask to quickly check whether a C 'long' contains a
4123 non-ASCII, UTF8-encoded char. */
4124#if (SIZEOF_LONG == 8)
4125# define ASCII_CHAR_MASK 0x8080808080808080L
4126#elif (SIZEOF_LONG == 4)
4127# define ASCII_CHAR_MASK 0x80808080L
4128#else
4129# error C 'long' size should be either 4 or 8!
4130#endif
4131
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004132/* Scans a UTF-8 string and returns the maximum character to be expected,
4133 the size of the decoded unicode string and if any major errors were
4134 encountered.
4135
4136 This function does check basic UTF-8 sanity, it does however NOT CHECK
4137 if the string contains surrogates, and if all continuation bytes are
4138 within the correct ranges, these checks are performed in
4139 PyUnicode_DecodeUTF8Stateful.
4140
4141 If it sets has_errors to 1, it means the value of unicode_size and max_char
4142 will be bogus and you should not rely on useful information in them.
4143 */
4144static Py_UCS4
4145utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
4146 Py_ssize_t *unicode_size, Py_ssize_t* consumed,
4147 int *has_errors)
4148{
4149 Py_ssize_t n;
4150 Py_ssize_t char_count = 0;
4151 Py_UCS4 max_char = 127, new_max;
4152 Py_UCS4 upper_bound;
4153 const unsigned char *p = (const unsigned char *)s;
4154 const unsigned char *end = p + string_size;
4155 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
4156 int err = 0;
4157
4158 for (; p < end && !err; ++p, ++char_count) {
4159 /* Only check value if it's not a ASCII char... */
4160 if (*p < 0x80) {
4161 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
4162 an explanation. */
4163 if (!((size_t) p & LONG_PTR_MASK)) {
4164 /* Help register allocation */
4165 register const unsigned char *_p = p;
4166 while (_p < aligned_end) {
4167 unsigned long value = *(unsigned long *) _p;
4168 if (value & ASCII_CHAR_MASK)
4169 break;
4170 _p += SIZEOF_LONG;
4171 char_count += SIZEOF_LONG;
4172 }
4173 p = _p;
4174 if (p == end)
4175 break;
4176 }
4177 }
4178 if (*p >= 0x80) {
4179 n = utf8_code_length[*p];
4180 new_max = max_char;
4181 switch (n) {
4182 /* invalid start byte */
4183 case 0:
4184 err = 1;
4185 break;
4186 case 2:
4187 /* Code points between 0x00FF and 0x07FF inclusive.
4188 Approximate the upper bound of the code point,
4189 if this flips over 255 we can be sure it will be more
4190 than 255 and the string will need 2 bytes per code coint,
4191 if it stays under or equal to 255, we can be sure 1 byte
4192 is enough.
4193 ((*p & 0b00011111) << 6) | 0b00111111 */
4194 upper_bound = ((*p & 0x1F) << 6) | 0x3F;
4195 if (max_char < upper_bound)
4196 new_max = upper_bound;
4197 /* Ensure we track at least that we left ASCII space. */
4198 if (new_max < 128)
4199 new_max = 128;
4200 break;
4201 case 3:
4202 /* Between 0x0FFF and 0xFFFF inclusive, so values are
4203 always > 255 and <= 65535 and will always need 2 bytes. */
4204 if (max_char < 65535)
4205 new_max = 65535;
4206 break;
4207 case 4:
4208 /* Code point will be above 0xFFFF for sure in this case. */
4209 new_max = 65537;
4210 break;
4211 /* Internal error, this should be caught by the first if */
4212 case 1:
4213 default:
4214 assert(0 && "Impossible case in utf8_max_char_and_size");
4215 err = 1;
4216 }
4217 /* Instead of number of overall bytes for this code point,
Georg Brandl7597add2011-10-05 16:36:47 +02004218 n contains the number of following bytes: */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004219 --n;
4220 /* Check if the follow up chars are all valid continuation bytes */
4221 if (n >= 1) {
4222 const unsigned char *cont;
4223 if ((p + n) >= end) {
4224 if (consumed == 0)
4225 /* incomplete data, non-incremental decoding */
4226 err = 1;
4227 break;
4228 }
4229 for (cont = p + 1; cont < (p + n); ++cont) {
4230 if ((*cont & 0xc0) != 0x80) {
4231 err = 1;
4232 break;
4233 }
4234 }
4235 p += n;
4236 }
4237 else
4238 err = 1;
4239 max_char = new_max;
4240 }
4241 }
4242
4243 if (unicode_size)
4244 *unicode_size = char_count;
4245 if (has_errors)
4246 *has_errors = err;
4247 return max_char;
4248}
4249
/* Store value at position index of buf, where kind selects the element
   type of buf: Py_UNICODE for PyUnicode_WCHAR_KIND (the wstr field of the
   legacy representation), unsigned char for PyUnicode_1BYTE_KIND, Py_UCS2
   for PyUnicode_2BYTE_KIND and Py_UCS4 for any other kind.  In that
   respect it is like PyUnicode_WRITE extended to the wstr buffer.
   kind, index and value are each evaluated exactly once. */
#define WRITE_FLEXIBLE_OR_WSTR(kind, buf, index, value)                     \
    do {                                                                    \
        const int k_ = (kind);                                              \
        switch (k_) {                                                       \
        case PyUnicode_WCHAR_KIND:                                          \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value);           \
            break;                                                          \
        case PyUnicode_1BYTE_KIND:                                          \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value);     \
            break;                                                          \
        case PyUnicode_2BYTE_KIND:                                          \
            ((Py_UCS2 *)(buf))[(index)] = (Py_UCS2)(value);                 \
            break;                                                          \
        default:                                                            \
            ((Py_UCS4 *)(buf))[(index)] = (Py_UCS4)(value);                 \
            break;                                                          \
        }                                                                   \
    } while (0)
4264
Alexander Belopolsky40018472011-02-26 01:02:56 +00004265PyObject *
4266PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004267 Py_ssize_t size,
4268 const char *errors,
4269 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00004270{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004271 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004272 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00004273 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004274 Py_ssize_t startinpos;
4275 Py_ssize_t endinpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00004276 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004277 PyUnicodeObject *unicode;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004278 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004279 PyObject *errorHandler = NULL;
4280 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004281 Py_UCS4 maxchar = 0;
4282 Py_ssize_t unicode_size;
4283 Py_ssize_t i;
4284 int kind;
4285 void *data;
4286 int has_errors;
4287 Py_UNICODE *error_outptr;
4288#if SIZEOF_WCHAR_T == 2
4289 Py_ssize_t wchar_offset = 0;
4290#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00004291
Walter Dörwald69652032004-09-07 20:24:22 +00004292 if (size == 0) {
4293 if (consumed)
4294 *consumed = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004295 return (PyObject *)PyUnicode_New(0, 0);
Walter Dörwald69652032004-09-07 20:24:22 +00004296 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004297 maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size,
4298 consumed, &has_errors);
4299 if (has_errors) {
4300 unicode = _PyUnicode_New(size);
4301 if (!unicode)
4302 return NULL;
4303 kind = PyUnicode_WCHAR_KIND;
4304 data = PyUnicode_AS_UNICODE(unicode);
4305 assert(data != NULL);
4306 }
4307 else {
4308 unicode = (PyUnicodeObject *)PyUnicode_New(unicode_size, maxchar);
4309 if (!unicode)
4310 return NULL;
4311 /* When the string is ASCII only, just use memcpy and return.
4312 unicode_size may be != size if there is an incomplete UTF-8
4313 sequence at the end of the ASCII block. */
4314 if (maxchar < 128 && size == unicode_size) {
4315 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
4316 return (PyObject *)unicode;
4317 }
4318 kind = PyUnicode_KIND(unicode);
4319 data = PyUnicode_DATA(unicode);
4320 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004321 /* Unpack UTF-8 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004322 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004323 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00004324 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004325
4326 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004327 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004328
4329 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00004330 /* Fast path for runs of ASCII characters. Given that common UTF-8
4331 input will consist of an overwhelming majority of ASCII
4332 characters, we try to optimize for this case by checking
4333 as many characters as a C 'long' can contain.
4334 First, check if we can do an aligned read, as most CPUs have
4335 a penalty for unaligned reads.
4336 */
4337 if (!((size_t) s & LONG_PTR_MASK)) {
4338 /* Help register allocation */
4339 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004340 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004341 while (_s < aligned_end) {
4342 /* Read a whole long at a time (either 4 or 8 bytes),
4343 and do a fast unrolled copy if it only contains ASCII
4344 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004345 unsigned long value = *(unsigned long *) _s;
4346 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00004347 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004348 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+0, _s[0]);
4349 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+1, _s[1]);
4350 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+2, _s[2]);
4351 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004352#if (SIZEOF_LONG == 8)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004353 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+4, _s[4]);
4354 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+5, _s[5]);
4355 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+6, _s[6]);
4356 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004357#endif
4358 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004359 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00004360 }
4361 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004362 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004363 if (s == e)
4364 break;
4365 ch = (unsigned char)*s;
4366 }
4367 }
4368
4369 if (ch < 0x80) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004370 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004371 s++;
4372 continue;
4373 }
4374
4375 n = utf8_code_length[ch];
4376
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004377 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004378 if (consumed)
4379 break;
4380 else {
4381 errmsg = "unexpected end of data";
4382 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004383 endinpos = startinpos+1;
4384 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
4385 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004386 goto utf8Error;
4387 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004388 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004389
4390 switch (n) {
4391
4392 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00004393 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004394 startinpos = s-starts;
4395 endinpos = startinpos+1;
4396 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004397
4398 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004399 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00004400 startinpos = s-starts;
4401 endinpos = startinpos+1;
4402 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004403
4404 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004405 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00004406 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004407 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004408 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00004409 goto utf8Error;
4410 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004411 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004412 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004413 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004414 break;
4415
4416 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00004417 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4418 will result in surrogates in range d800-dfff. Surrogates are
4419 not valid UTF-8 so they are rejected.
4420 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4421 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00004422 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004423 (s[2] & 0xc0) != 0x80 ||
4424 ((unsigned char)s[0] == 0xE0 &&
4425 (unsigned char)s[1] < 0xA0) ||
4426 ((unsigned char)s[0] == 0xED &&
4427 (unsigned char)s[1] > 0x9F)) {
4428 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004429 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004430 endinpos = startinpos + 1;
4431
4432 /* if s[1] first two bits are 1 and 0, then the invalid
4433 continuation byte is s[2], so increment endinpos by 1,
4434 if not, s[1] is invalid and endinpos doesn't need to
4435 be incremented. */
4436 if ((s[1] & 0xC0) == 0x80)
4437 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004438 goto utf8Error;
4439 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004440 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004441 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004442 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004443 break;
4444
4445 case 4:
4446 if ((s[1] & 0xc0) != 0x80 ||
4447 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004448 (s[3] & 0xc0) != 0x80 ||
4449 ((unsigned char)s[0] == 0xF0 &&
4450 (unsigned char)s[1] < 0x90) ||
4451 ((unsigned char)s[0] == 0xF4 &&
4452 (unsigned char)s[1] > 0x8F)) {
4453 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004454 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004455 endinpos = startinpos + 1;
4456 if ((s[1] & 0xC0) == 0x80) {
4457 endinpos++;
4458 if ((s[2] & 0xC0) == 0x80)
4459 endinpos++;
4460 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004461 goto utf8Error;
4462 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004463 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00004464 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4465 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4466
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004467 /* If the string is flexible or we have native UCS-4, write
4468 directly.. */
4469 if (sizeof(Py_UNICODE) > 2 || kind != PyUnicode_WCHAR_KIND)
4470 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Tim Petersced69f82003-09-16 20:30:58 +00004471
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004472 else {
4473 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00004474
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004475 /* translate from 10000..10FFFF to 0..FFFF */
4476 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00004477
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004478 /* high surrogate = top 10 bits added to D800 */
4479 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4480 (Py_UNICODE)(0xD800 + (ch >> 10)));
4481
4482 /* low surrogate = bottom 10 bits added to DC00 */
4483 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4484 (Py_UNICODE)(0xDC00 + (ch & 0x03FF)));
4485 }
4486#if SIZEOF_WCHAR_T == 2
4487 wchar_offset++;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004488#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00004489 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004490 }
4491 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00004492 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004493
Benjamin Peterson29060642009-01-31 22:14:21 +00004494 utf8Error:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004495 /* If this is not yet a resizable string, make it one.. */
4496 if (kind != PyUnicode_WCHAR_KIND) {
4497 const Py_UNICODE *u;
4498 PyUnicodeObject *new_unicode = _PyUnicode_New(size);
4499 if (!new_unicode)
4500 goto onError;
4501 u = PyUnicode_AsUnicode((PyObject *)unicode);
4502 if (!u)
4503 goto onError;
4504#if SIZEOF_WCHAR_T == 2
4505 i += wchar_offset;
4506#endif
4507 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(new_unicode), u, i);
4508 Py_DECREF(unicode);
4509 unicode = new_unicode;
4510 kind = 0;
4511 data = PyUnicode_AS_UNICODE(new_unicode);
4512 assert(data != NULL);
4513 }
4514 error_outptr = PyUnicode_AS_UNICODE(unicode) + i;
Benjamin Peterson29060642009-01-31 22:14:21 +00004515 if (unicode_decode_call_errorhandler(
4516 errors, &errorHandler,
4517 "utf8", errmsg,
4518 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004519 &unicode, &i, &error_outptr))
Benjamin Peterson29060642009-01-31 22:14:21 +00004520 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004521 /* Update data because unicode_decode_call_errorhandler might have
4522 re-created or resized the unicode object. */
4523 data = PyUnicode_AS_UNICODE(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00004524 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004525 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004526 /* Ensure the unicode_size calculation above was correct: */
4527 assert(kind == PyUnicode_WCHAR_KIND || i == unicode_size);
4528
Walter Dörwald69652032004-09-07 20:24:22 +00004529 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004530 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004531
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004532 /* Adjust length and ready string when it contained errors and
4533 is of the old resizable kind. */
4534 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004535 if (PyUnicode_Resize((PyObject**)&unicode, i) < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004536 goto onError;
4537 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004538
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004539 Py_XDECREF(errorHandler);
4540 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02004541#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004542 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004543 Py_DECREF(unicode);
4544 return NULL;
4545 }
Victor Stinner17efeed2011-10-04 20:05:46 +02004546#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02004547 assert(_PyUnicode_CheckConsistency(unicode, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004548 return (PyObject *)unicode;
4549
Benjamin Peterson29060642009-01-31 22:14:21 +00004550 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004551 Py_XDECREF(errorHandler);
4552 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004553 Py_DECREF(unicode);
4554 return NULL;
4555}
4556
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004557#undef WRITE_FLEXIBLE_OR_WSTR
Antoine Pitrouab868312009-01-10 15:40:25 +00004558
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004559#ifdef __APPLE__
4560
4561/* Simplified UTF-8 decoder using surrogateescape error handler,
4562 used to decode the command line arguments on Mac OS X. */
4563
4564wchar_t*
4565_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
4566{
4567 int n;
4568 const char *e;
4569 wchar_t *unicode, *p;
4570
4571 /* Note: size will always be longer than the resulting Unicode
4572 character count */
4573 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
4574 PyErr_NoMemory();
4575 return NULL;
4576 }
4577 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
4578 if (!unicode)
4579 return NULL;
4580
4581 /* Unpack UTF-8 encoded data */
4582 p = unicode;
4583 e = s + size;
4584 while (s < e) {
4585 Py_UCS4 ch = (unsigned char)*s;
4586
4587 if (ch < 0x80) {
4588 *p++ = (wchar_t)ch;
4589 s++;
4590 continue;
4591 }
4592
4593 n = utf8_code_length[ch];
4594 if (s + n > e) {
4595 goto surrogateescape;
4596 }
4597
4598 switch (n) {
4599 case 0:
4600 case 1:
4601 goto surrogateescape;
4602
4603 case 2:
4604 if ((s[1] & 0xc0) != 0x80)
4605 goto surrogateescape;
4606 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
4607 assert ((ch > 0x007F) && (ch <= 0x07FF));
4608 *p++ = (wchar_t)ch;
4609 break;
4610
4611 case 3:
4612 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4613 will result in surrogates in range d800-dfff. Surrogates are
4614 not valid UTF-8 so they are rejected.
4615 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4616 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
4617 if ((s[1] & 0xc0) != 0x80 ||
4618 (s[2] & 0xc0) != 0x80 ||
4619 ((unsigned char)s[0] == 0xE0 &&
4620 (unsigned char)s[1] < 0xA0) ||
4621 ((unsigned char)s[0] == 0xED &&
4622 (unsigned char)s[1] > 0x9F)) {
4623
4624 goto surrogateescape;
4625 }
4626 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
4627 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004628 *p++ = (wchar_t)ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004629 break;
4630
4631 case 4:
4632 if ((s[1] & 0xc0) != 0x80 ||
4633 (s[2] & 0xc0) != 0x80 ||
4634 (s[3] & 0xc0) != 0x80 ||
4635 ((unsigned char)s[0] == 0xF0 &&
4636 (unsigned char)s[1] < 0x90) ||
4637 ((unsigned char)s[0] == 0xF4 &&
4638 (unsigned char)s[1] > 0x8F)) {
4639 goto surrogateescape;
4640 }
4641 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
4642 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4643 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4644
4645#if SIZEOF_WCHAR_T == 4
4646 *p++ = (wchar_t)ch;
4647#else
4648 /* compute and append the two surrogates: */
4649
4650 /* translate from 10000..10FFFF to 0..FFFF */
4651 ch -= 0x10000;
4652
4653 /* high surrogate = top 10 bits added to D800 */
4654 *p++ = (wchar_t)(0xD800 + (ch >> 10));
4655
4656 /* low surrogate = bottom 10 bits added to DC00 */
4657 *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
4658#endif
4659 break;
4660 }
4661 s += n;
4662 continue;
4663
4664 surrogateescape:
4665 *p++ = 0xDC00 + ch;
4666 s++;
4667 }
4668 *p = L'\0';
4669 return unicode;
4670}
4671
4672#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004673
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004674/* Primary internal function which creates utf8 encoded bytes objects.
4675
4676 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004677 and allocate exactly as much space needed at the end. Else allocate the
4678 maximum possible needed (4 result bytes per Unicode character), and return
4679 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004680*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004681PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004682_PyUnicode_AsUTF8String(PyObject *obj, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004683{
Tim Peters602f7402002-04-27 18:03:26 +00004684#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00004685
Guido van Rossum98297ee2007-11-06 21:34:58 +00004686 Py_ssize_t i; /* index into s of next input byte */
4687 PyObject *result; /* result string object */
4688 char *p; /* next free byte in output buffer */
4689 Py_ssize_t nallocated; /* number of result bytes allocated */
4690 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00004691 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004692 PyObject *errorHandler = NULL;
4693 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004694 int kind;
4695 void *data;
4696 Py_ssize_t size;
4697 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
4698#if SIZEOF_WCHAR_T == 2
4699 Py_ssize_t wchar_offset = 0;
4700#endif
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004701
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004702 if (!PyUnicode_Check(unicode)) {
4703 PyErr_BadArgument();
4704 return NULL;
4705 }
4706
4707 if (PyUnicode_READY(unicode) == -1)
4708 return NULL;
4709
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004710 if (PyUnicode_UTF8(unicode))
4711 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4712 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004713
4714 kind = PyUnicode_KIND(unicode);
4715 data = PyUnicode_DATA(unicode);
4716 size = PyUnicode_GET_LENGTH(unicode);
4717
Tim Peters602f7402002-04-27 18:03:26 +00004718 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004719
Tim Peters602f7402002-04-27 18:03:26 +00004720 if (size <= MAX_SHORT_UNICHARS) {
4721 /* Write into the stack buffer; nallocated can't overflow.
4722 * At the end, we'll allocate exactly as much heap space as it
4723 * turns out we need.
4724 */
4725 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004726 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00004727 p = stackbuf;
4728 }
4729 else {
4730 /* Overallocate on the heap, and give the excess back at the end. */
4731 nallocated = size * 4;
4732 if (nallocated / 4 != size) /* overflow! */
4733 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00004734 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004735 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00004736 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00004737 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004738 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004739
Tim Peters602f7402002-04-27 18:03:26 +00004740 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004741 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004742
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004743 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00004744 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004745 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004746
Guido van Rossumd57fd912000-03-10 22:53:23 +00004747 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00004748 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00004749 *p++ = (char)(0xc0 | (ch >> 6));
4750 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00004751 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004752 Py_ssize_t newpos;
4753 PyObject *rep;
4754 Py_ssize_t repsize, k, startpos;
4755 startpos = i-1;
4756#if SIZEOF_WCHAR_T == 2
4757 startpos += wchar_offset;
Victor Stinner445a6232010-04-22 20:01:57 +00004758#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004759 rep = unicode_encode_call_errorhandler(
4760 errors, &errorHandler, "utf-8", "surrogates not allowed",
4761 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
4762 &exc, startpos, startpos+1, &newpos);
4763 if (!rep)
4764 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004765
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004766 if (PyBytes_Check(rep))
4767 repsize = PyBytes_GET_SIZE(rep);
4768 else
4769 repsize = PyUnicode_GET_SIZE(rep);
4770
4771 if (repsize > 4) {
4772 Py_ssize_t offset;
4773
4774 if (result == NULL)
4775 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004776 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004777 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004778
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004779 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4780 /* integer overflow */
4781 PyErr_NoMemory();
4782 goto error;
4783 }
4784 nallocated += repsize - 4;
4785 if (result != NULL) {
4786 if (_PyBytes_Resize(&result, nallocated) < 0)
4787 goto error;
4788 } else {
4789 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004790 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004791 goto error;
4792 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4793 }
4794 p = PyBytes_AS_STRING(result) + offset;
4795 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004796
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004797 if (PyBytes_Check(rep)) {
4798 char *prep = PyBytes_AS_STRING(rep);
4799 for(k = repsize; k > 0; k--)
4800 *p++ = *prep++;
4801 } else /* rep is unicode */ {
4802 const Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
4803 Py_UNICODE c;
4804
4805 for(k=0; k<repsize; k++) {
4806 c = prep[k];
4807 if (0x80 <= c) {
4808 raise_encode_exception(&exc, "utf-8",
4809 PyUnicode_AS_UNICODE(unicode),
4810 size, i-1, i,
4811 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004812 goto error;
4813 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004814 *p++ = (char)prep[k];
Victor Stinner31be90b2010-04-22 19:38:16 +00004815 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004816 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004817 Py_DECREF(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004818 } else if (ch < 0x10000) {
4819 *p++ = (char)(0xe0 | (ch >> 12));
4820 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4821 *p++ = (char)(0x80 | (ch & 0x3f));
4822 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00004823 /* Encode UCS4 Unicode ordinals */
4824 *p++ = (char)(0xf0 | (ch >> 18));
4825 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4826 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4827 *p++ = (char)(0x80 | (ch & 0x3f));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004828#if SIZEOF_WCHAR_T == 2
4829 wchar_offset++;
4830#endif
Tim Peters602f7402002-04-27 18:03:26 +00004831 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004832 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004833
Guido van Rossum98297ee2007-11-06 21:34:58 +00004834 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004835 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004836 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004837 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004838 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004839 }
4840 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004841 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004842 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004843 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004844 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004845 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004846
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004847 Py_XDECREF(errorHandler);
4848 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004849 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004850 error:
4851 Py_XDECREF(errorHandler);
4852 Py_XDECREF(exc);
4853 Py_XDECREF(result);
4854 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004855
Tim Peters602f7402002-04-27 18:03:26 +00004856#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004857}
4858
Alexander Belopolsky40018472011-02-26 01:02:56 +00004859PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004860PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4861 Py_ssize_t size,
4862 const char *errors)
4863{
4864 PyObject *v, *unicode;
4865
4866 unicode = PyUnicode_FromUnicode(s, size);
4867 if (unicode == NULL)
4868 return NULL;
4869 v = _PyUnicode_AsUTF8String(unicode, errors);
4870 Py_DECREF(unicode);
4871 return v;
4872}
4873
4874PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004875PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004876{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004877 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004878}
4879
Walter Dörwald41980ca2007-08-16 21:55:45 +00004880/* --- UTF-32 Codec ------------------------------------------------------- */
4881
4882PyObject *
4883PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004884 Py_ssize_t size,
4885 const char *errors,
4886 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004887{
4888 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4889}
4890
4891PyObject *
4892PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004893 Py_ssize_t size,
4894 const char *errors,
4895 int *byteorder,
4896 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004897{
4898 const char *starts = s;
4899 Py_ssize_t startinpos;
4900 Py_ssize_t endinpos;
4901 Py_ssize_t outpos;
4902 PyUnicodeObject *unicode;
4903 Py_UNICODE *p;
4904#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004905 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004906 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004907#else
4908 const int pairs = 0;
4909#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00004910 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004911 int bo = 0; /* assume native ordering by default */
4912 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004913 /* Offsets from q for retrieving bytes in the right order. */
4914#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4915 int iorder[] = {0, 1, 2, 3};
4916#else
4917 int iorder[] = {3, 2, 1, 0};
4918#endif
4919 PyObject *errorHandler = NULL;
4920 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004921
Walter Dörwald41980ca2007-08-16 21:55:45 +00004922 q = (unsigned char *)s;
4923 e = q + size;
4924
4925 if (byteorder)
4926 bo = *byteorder;
4927
4928 /* Check for BOM marks (U+FEFF) in the input and adjust current
4929 byte order setting accordingly. In native mode, the leading BOM
4930 mark is skipped, in all other modes, it is copied to the output
4931 stream as-is (giving a ZWNBSP character). */
4932 if (bo == 0) {
4933 if (size >= 4) {
4934 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004935 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004936#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004937 if (bom == 0x0000FEFF) {
4938 q += 4;
4939 bo = -1;
4940 }
4941 else if (bom == 0xFFFE0000) {
4942 q += 4;
4943 bo = 1;
4944 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004945#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004946 if (bom == 0x0000FEFF) {
4947 q += 4;
4948 bo = 1;
4949 }
4950 else if (bom == 0xFFFE0000) {
4951 q += 4;
4952 bo = -1;
4953 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004954#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004955 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004956 }
4957
4958 if (bo == -1) {
4959 /* force LE */
4960 iorder[0] = 0;
4961 iorder[1] = 1;
4962 iorder[2] = 2;
4963 iorder[3] = 3;
4964 }
4965 else if (bo == 1) {
4966 /* force BE */
4967 iorder[0] = 3;
4968 iorder[1] = 2;
4969 iorder[2] = 1;
4970 iorder[3] = 0;
4971 }
4972
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004973 /* On narrow builds we split characters outside the BMP into two
4974 codepoints => count how much extra space we need. */
4975#ifndef Py_UNICODE_WIDE
4976 for (qq = q; qq < e; qq += 4)
4977 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
4978 pairs++;
4979#endif
4980
4981 /* This might be one to much, because of a BOM */
4982 unicode = _PyUnicode_New((size+3)/4+pairs);
4983 if (!unicode)
4984 return NULL;
4985 if (size == 0)
4986 return (PyObject *)unicode;
4987
4988 /* Unpack UTF-32 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004989 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004990
Walter Dörwald41980ca2007-08-16 21:55:45 +00004991 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004992 Py_UCS4 ch;
4993 /* remaining bytes at the end? (size should be divisible by 4) */
4994 if (e-q<4) {
4995 if (consumed)
4996 break;
4997 errmsg = "truncated data";
4998 startinpos = ((const char *)q)-starts;
4999 endinpos = ((const char *)e)-starts;
5000 goto utf32Error;
5001 /* The remaining input chars are ignored if the callback
5002 chooses to skip the input */
5003 }
5004 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
5005 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005006
Benjamin Peterson29060642009-01-31 22:14:21 +00005007 if (ch >= 0x110000)
5008 {
5009 errmsg = "codepoint not in range(0x110000)";
5010 startinpos = ((const char *)q)-starts;
5011 endinpos = startinpos+4;
5012 goto utf32Error;
5013 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005014#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005015 if (ch >= 0x10000)
5016 {
5017 *p++ = 0xD800 | ((ch-0x10000) >> 10);
5018 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
5019 }
5020 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00005021#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005022 *p++ = ch;
5023 q += 4;
5024 continue;
5025 utf32Error:
5026 outpos = p-PyUnicode_AS_UNICODE(unicode);
5027 if (unicode_decode_call_errorhandler(
5028 errors, &errorHandler,
5029 "utf32", errmsg,
5030 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
5031 &unicode, &outpos, &p))
5032 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005033 }
5034
5035 if (byteorder)
5036 *byteorder = bo;
5037
5038 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005039 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005040
5041 /* Adjust length */
Victor Stinnerfe226c02011-10-03 03:52:20 +02005042 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005043 goto onError;
5044
5045 Py_XDECREF(errorHandler);
5046 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02005047#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005048 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005049 Py_DECREF(unicode);
5050 return NULL;
5051 }
Victor Stinner17efeed2011-10-04 20:05:46 +02005052#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02005053 assert(_PyUnicode_CheckConsistency(unicode, 1));
Walter Dörwald41980ca2007-08-16 21:55:45 +00005054 return (PyObject *)unicode;
5055
Benjamin Peterson29060642009-01-31 22:14:21 +00005056 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00005057 Py_DECREF(unicode);
5058 Py_XDECREF(errorHandler);
5059 Py_XDECREF(exc);
5060 return NULL;
5061}
5062
5063PyObject *
5064PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005065 Py_ssize_t size,
5066 const char *errors,
5067 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005068{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005069 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005070 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005071 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005072#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005073 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005074#else
5075 const int pairs = 0;
5076#endif
5077 /* Offsets from p for storing byte pairs in the right order. */
5078#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5079 int iorder[] = {0, 1, 2, 3};
5080#else
5081 int iorder[] = {3, 2, 1, 0};
5082#endif
5083
Benjamin Peterson29060642009-01-31 22:14:21 +00005084#define STORECHAR(CH) \
5085 do { \
5086 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5087 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5088 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5089 p[iorder[0]] = (CH) & 0xff; \
5090 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00005091 } while(0)
5092
5093 /* In narrow builds we can output surrogate pairs as one codepoint,
5094 so we need less space. */
5095#ifndef Py_UNICODE_WIDE
5096 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00005097 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
5098 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
5099 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005100#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005101 nsize = (size - pairs + (byteorder == 0));
5102 bytesize = nsize * 4;
5103 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005104 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005105 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005106 if (v == NULL)
5107 return NULL;
5108
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005109 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005110 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005111 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005112 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005113 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005114
5115 if (byteorder == -1) {
5116 /* force LE */
5117 iorder[0] = 0;
5118 iorder[1] = 1;
5119 iorder[2] = 2;
5120 iorder[3] = 3;
5121 }
5122 else if (byteorder == 1) {
5123 /* force BE */
5124 iorder[0] = 3;
5125 iorder[1] = 2;
5126 iorder[2] = 1;
5127 iorder[3] = 0;
5128 }
5129
5130 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005131 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005132#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005133 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
5134 Py_UCS4 ch2 = *s;
5135 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
5136 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
5137 s++;
5138 size--;
5139 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005140 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005141#endif
5142 STORECHAR(ch);
5143 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005144
5145 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005146 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005147#undef STORECHAR
5148}
5149
Alexander Belopolsky40018472011-02-26 01:02:56 +00005150PyObject *
5151PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005152{
5153 if (!PyUnicode_Check(unicode)) {
5154 PyErr_BadArgument();
5155 return NULL;
5156 }
5157 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005158 PyUnicode_GET_SIZE(unicode),
5159 NULL,
5160 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005161}
5162
Guido van Rossumd57fd912000-03-10 22:53:23 +00005163/* --- UTF-16 Codec ------------------------------------------------------- */
5164
Tim Peters772747b2001-08-09 22:21:55 +00005165PyObject *
5166PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005167 Py_ssize_t size,
5168 const char *errors,
5169 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005170{
Walter Dörwald69652032004-09-07 20:24:22 +00005171 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5172}
5173
Antoine Pitrouab868312009-01-10 15:40:25 +00005174/* Two masks for fast checking of whether a C 'long' may contain
5175 UTF16-encoded surrogate characters. This is an efficient heuristic,
5176 assuming that non-surrogate characters with a code point >= 0x8000 are
5177 rare in most input.
5178 FAST_CHAR_MASK is used when the input is in native byte ordering,
5179 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00005180*/
Antoine Pitrouab868312009-01-10 15:40:25 +00005181#if (SIZEOF_LONG == 8)
5182# define FAST_CHAR_MASK 0x8000800080008000L
5183# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
5184#elif (SIZEOF_LONG == 4)
5185# define FAST_CHAR_MASK 0x80008000L
5186# define SWAPPED_FAST_CHAR_MASK 0x00800080L
5187#else
5188# error C 'long' size should be either 4 or 8!
5189#endif
5190
Walter Dörwald69652032004-09-07 20:24:22 +00005191PyObject *
5192PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005193 Py_ssize_t size,
5194 const char *errors,
5195 int *byteorder,
5196 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005197{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005198 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005199 Py_ssize_t startinpos;
5200 Py_ssize_t endinpos;
5201 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005202 PyUnicodeObject *unicode;
5203 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00005204 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00005205 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00005206 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005207 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00005208 /* Offsets from q for retrieving byte pairs in the right order. */
5209#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5210 int ihi = 1, ilo = 0;
5211#else
5212 int ihi = 0, ilo = 1;
5213#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005214 PyObject *errorHandler = NULL;
5215 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005216
5217 /* Note: size will always be longer than the resulting Unicode
5218 character count */
5219 unicode = _PyUnicode_New(size);
5220 if (!unicode)
5221 return NULL;
5222 if (size == 0)
5223 return (PyObject *)unicode;
5224
5225 /* Unpack UTF-16 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005226 p = PyUnicode_AS_UNICODE(unicode);
Tim Peters772747b2001-08-09 22:21:55 +00005227 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00005228 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005229
5230 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005231 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005232
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005233 /* Check for BOM marks (U+FEFF) in the input and adjust current
5234 byte order setting accordingly. In native mode, the leading BOM
5235 mark is skipped, in all other modes, it is copied to the output
5236 stream as-is (giving a ZWNBSP character). */
5237 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00005238 if (size >= 2) {
5239 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005240#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005241 if (bom == 0xFEFF) {
5242 q += 2;
5243 bo = -1;
5244 }
5245 else if (bom == 0xFFFE) {
5246 q += 2;
5247 bo = 1;
5248 }
Tim Petersced69f82003-09-16 20:30:58 +00005249#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005250 if (bom == 0xFEFF) {
5251 q += 2;
5252 bo = 1;
5253 }
5254 else if (bom == 0xFFFE) {
5255 q += 2;
5256 bo = -1;
5257 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005258#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005259 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005260 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005261
Tim Peters772747b2001-08-09 22:21:55 +00005262 if (bo == -1) {
5263 /* force LE */
5264 ihi = 1;
5265 ilo = 0;
5266 }
5267 else if (bo == 1) {
5268 /* force BE */
5269 ihi = 0;
5270 ilo = 1;
5271 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005272#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5273 native_ordering = ilo < ihi;
5274#else
5275 native_ordering = ilo > ihi;
5276#endif
Tim Peters772747b2001-08-09 22:21:55 +00005277
Antoine Pitrouab868312009-01-10 15:40:25 +00005278 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00005279 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005280 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00005281 /* First check for possible aligned read of a C 'long'. Unaligned
5282 reads are more expensive, better to defer to another iteration. */
5283 if (!((size_t) q & LONG_PTR_MASK)) {
5284 /* Fast path for runs of non-surrogate chars. */
5285 register const unsigned char *_q = q;
5286 Py_UNICODE *_p = p;
5287 if (native_ordering) {
5288 /* Native ordering is simple: as long as the input cannot
5289 possibly contain a surrogate char, do an unrolled copy
5290 of several 16-bit code points to the target object.
5291 The non-surrogate check is done on several input bytes
5292 at a time (as many as a C 'long' can contain). */
5293 while (_q < aligned_end) {
5294 unsigned long data = * (unsigned long *) _q;
5295 if (data & FAST_CHAR_MASK)
5296 break;
5297 _p[0] = ((unsigned short *) _q)[0];
5298 _p[1] = ((unsigned short *) _q)[1];
5299#if (SIZEOF_LONG == 8)
5300 _p[2] = ((unsigned short *) _q)[2];
5301 _p[3] = ((unsigned short *) _q)[3];
5302#endif
5303 _q += SIZEOF_LONG;
5304 _p += SIZEOF_LONG / 2;
5305 }
5306 }
5307 else {
5308 /* Byteswapped ordering is similar, but we must decompose
5309 the copy bytewise, and take care of zero'ing out the
5310 upper bytes if the target object is in 32-bit units
5311 (that is, in UCS-4 builds). */
5312 while (_q < aligned_end) {
5313 unsigned long data = * (unsigned long *) _q;
5314 if (data & SWAPPED_FAST_CHAR_MASK)
5315 break;
5316 /* Zero upper bytes in UCS-4 builds */
5317#if (Py_UNICODE_SIZE > 2)
5318 _p[0] = 0;
5319 _p[1] = 0;
5320#if (SIZEOF_LONG == 8)
5321 _p[2] = 0;
5322 _p[3] = 0;
5323#endif
5324#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00005325 /* Issue #4916; UCS-4 builds on big endian machines must
5326 fill the two last bytes of each 4-byte unit. */
5327#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
5328# define OFF 2
5329#else
5330# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00005331#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00005332 ((unsigned char *) _p)[OFF + 1] = _q[0];
5333 ((unsigned char *) _p)[OFF + 0] = _q[1];
5334 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
5335 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
5336#if (SIZEOF_LONG == 8)
5337 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
5338 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
5339 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
5340 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
5341#endif
5342#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00005343 _q += SIZEOF_LONG;
5344 _p += SIZEOF_LONG / 2;
5345 }
5346 }
5347 p = _p;
5348 q = _q;
5349 if (q >= e)
5350 break;
5351 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005352 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005353
Benjamin Peterson14339b62009-01-31 16:36:08 +00005354 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00005355
5356 if (ch < 0xD800 || ch > 0xDFFF) {
5357 *p++ = ch;
5358 continue;
5359 }
5360
5361 /* UTF-16 code pair: */
5362 if (q > e) {
5363 errmsg = "unexpected end of data";
5364 startinpos = (((const char *)q) - 2) - starts;
5365 endinpos = ((const char *)e) + 1 - starts;
5366 goto utf16Error;
5367 }
5368 if (0xD800 <= ch && ch <= 0xDBFF) {
5369 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
5370 q += 2;
5371 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00005372#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005373 *p++ = ch;
5374 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005375#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005376 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005377#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005378 continue;
5379 }
5380 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005381 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00005382 startinpos = (((const char *)q)-4)-starts;
5383 endinpos = startinpos+2;
5384 goto utf16Error;
5385 }
5386
Benjamin Peterson14339b62009-01-31 16:36:08 +00005387 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005388 errmsg = "illegal encoding";
5389 startinpos = (((const char *)q)-2)-starts;
5390 endinpos = startinpos+2;
5391 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005392
Benjamin Peterson29060642009-01-31 22:14:21 +00005393 utf16Error:
5394 outpos = p - PyUnicode_AS_UNICODE(unicode);
5395 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005396 errors,
5397 &errorHandler,
5398 "utf16", errmsg,
5399 &starts,
5400 (const char **)&e,
5401 &startinpos,
5402 &endinpos,
5403 &exc,
5404 (const char **)&q,
5405 &unicode,
5406 &outpos,
5407 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00005408 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005409 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005410 /* remaining byte at the end? (size should be even) */
5411 if (e == q) {
5412 if (!consumed) {
5413 errmsg = "truncated data";
5414 startinpos = ((const char *)q) - starts;
5415 endinpos = ((const char *)e) + 1 - starts;
5416 outpos = p - PyUnicode_AS_UNICODE(unicode);
5417 if (unicode_decode_call_errorhandler(
5418 errors,
5419 &errorHandler,
5420 "utf16", errmsg,
5421 &starts,
5422 (const char **)&e,
5423 &startinpos,
5424 &endinpos,
5425 &exc,
5426 (const char **)&q,
5427 &unicode,
5428 &outpos,
5429 &p))
5430 goto onError;
5431 /* The remaining input chars are ignored if the callback
5432 chooses to skip the input */
5433 }
5434 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005435
5436 if (byteorder)
5437 *byteorder = bo;
5438
Walter Dörwald69652032004-09-07 20:24:22 +00005439 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005440 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005441
Guido van Rossumd57fd912000-03-10 22:53:23 +00005442 /* Adjust length */
Victor Stinnerfe226c02011-10-03 03:52:20 +02005443 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005444 goto onError;
5445
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005446 Py_XDECREF(errorHandler);
5447 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02005448#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005449 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005450 Py_DECREF(unicode);
5451 return NULL;
5452 }
Victor Stinner17efeed2011-10-04 20:05:46 +02005453#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02005454 assert(_PyUnicode_CheckConsistency(unicode, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005455 return (PyObject *)unicode;
5456
Benjamin Peterson29060642009-01-31 22:14:21 +00005457 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005458 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005459 Py_XDECREF(errorHandler);
5460 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005461 return NULL;
5462}
5463
Antoine Pitrouab868312009-01-10 15:40:25 +00005464#undef FAST_CHAR_MASK
5465#undef SWAPPED_FAST_CHAR_MASK
5466
Tim Peters772747b2001-08-09 22:21:55 +00005467PyObject *
5468PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005469 Py_ssize_t size,
5470 const char *errors,
5471 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005472{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005473 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00005474 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005475 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005476#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005477 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005478#else
5479 const int pairs = 0;
5480#endif
Tim Peters772747b2001-08-09 22:21:55 +00005481 /* Offsets from p for storing byte pairs in the right order. */
5482#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5483 int ihi = 1, ilo = 0;
5484#else
5485 int ihi = 0, ilo = 1;
5486#endif
5487
Benjamin Peterson29060642009-01-31 22:14:21 +00005488#define STORECHAR(CH) \
5489 do { \
5490 p[ihi] = ((CH) >> 8) & 0xff; \
5491 p[ilo] = (CH) & 0xff; \
5492 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00005493 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005494
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005495#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005496 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00005497 if (s[i] >= 0x10000)
5498 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005499#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005500 /* 2 * (size + pairs + (byteorder == 0)) */
5501 if (size > PY_SSIZE_T_MAX ||
5502 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005503 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005504 nsize = size + pairs + (byteorder == 0);
5505 bytesize = nsize * 2;
5506 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005507 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005508 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005509 if (v == NULL)
5510 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005511
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005512 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005513 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005514 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005515 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005516 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005517
5518 if (byteorder == -1) {
5519 /* force LE */
5520 ihi = 1;
5521 ilo = 0;
5522 }
5523 else if (byteorder == 1) {
5524 /* force BE */
5525 ihi = 0;
5526 ilo = 1;
5527 }
5528
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005529 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005530 Py_UNICODE ch = *s++;
5531 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005532#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005533 if (ch >= 0x10000) {
5534 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
5535 ch = 0xD800 | ((ch-0x10000) >> 10);
5536 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005537#endif
Tim Peters772747b2001-08-09 22:21:55 +00005538 STORECHAR(ch);
5539 if (ch2)
5540 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005541 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005542
5543 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005544 return v;
Tim Peters772747b2001-08-09 22:21:55 +00005545#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005546}
5547
Alexander Belopolsky40018472011-02-26 01:02:56 +00005548PyObject *
5549PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005550{
5551 if (!PyUnicode_Check(unicode)) {
5552 PyErr_BadArgument();
5553 return NULL;
5554 }
5555 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005556 PyUnicode_GET_SIZE(unicode),
5557 NULL,
5558 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005559}
5560
5561/* --- Unicode Escape Codec ----------------------------------------------- */
5562
/* Helper function for PyUnicode_DecodeUnicodeEscape: determines whether
   the escaped input decodes to a pure ASCII string and, if so, how many
   characters the decoded string has.

   s    - the escaped input bytes
   size - number of bytes at s

   Returns the decoded length, or -1 when the length cannot be predicted
   as ASCII: size is negative, a byte is > 127, a backslash is the last
   byte, or an escape is one of the numeric/named forms (octal, \x, \u,
   \U, \N) whose result is not guaranteed to be ASCII.
 */
static Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *p;
    const unsigned char *end;
    Py_ssize_t length = 0;

    /* Validate size before forming s + size: pointer arithmetic with a
       negative offset would step outside the buffer. */
    if (size < 0)
        return -1;

    p = (const unsigned char *)s;
    end = p + size;

    for (; p < end; ++p) {
        if (*p > 127) {
            /* Non-ASCII */
            return -1;
        }
        else if (*p != '\\') {
            /* Normal character */
            ++length;
        }
        else {
            /* Backslash-escape, check next char */
            ++p;
            /* Escape sequence reaches till end of string or
               non-ASCII follow-up. */
            if (p >= end || *p > 127)
                return -1;
            switch (*p) {
            case '\n':
                /* backslash + \n result in zero characters */
                break;
            case '\\': case '\'': case '\"':
            case 'b': case 'f': case 't':
            case 'n': case 'r': case 'v': case 'a':
                /* single-character escapes decode to one character */
                ++length;
                break;
            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7':
            case 'x': case 'u': case 'U': case 'N':
                /* these do not guarantee ASCII characters */
                return -1;
            default:
                /* unknown escape: the decoder keeps both the backslash
                   and the other character */
                length += 2;
                break;
            }
        }
    }
    return length;
}
5617
/* Similar to PyUnicode_WRITE but either write into wstr field
   or treat string as ASCII.

   kind  - PyUnicode_WCHAR_KIND selects Py_UNICODE-wide storage; any other
           kind stores one byte per character
   buf   - start of the destination buffer
   index - position to write (evaluated exactly once)
   value - character to store, cast to the destination unit type */
#define WRITE_ASCII_OR_WSTR(kind, buf, index, value)                    \
    do {                                                                \
        if ((kind) != PyUnicode_WCHAR_KIND)                             \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
        else                                                            \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value);       \
    } while (0)

/* Store `value` at buf[index] of a Py_UNICODE buffer.  Relies on a
   variable named `kind` being in scope at the point of use and asserts
   that it is PyUnicode_WCHAR_KIND.  The whole expansion is parenthesized
   so the macro behaves as a single expression wherever it appears. */
#define WRITE_WSTR(buf, index, value)                                   \
    (assert(kind == PyUnicode_WCHAR_KIND),                              \
     ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value))
5631
5632
/* Cached pointer to the unicodedata name-lookup C API.  It starts out NULL
   and is filled in by PyUnicode_DecodeUnicodeEscape via PyCapsule_Import()
   the first time a \N{...} escape has to be resolved. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00005633static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005634
Alexander Belopolsky40018472011-02-26 01:02:56 +00005635PyObject *
5636PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005637 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005638 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005639{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005640 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005641 Py_ssize_t startinpos;
5642 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005643 int j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005644 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005645 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005646 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005647 char* message;
5648 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005649 PyObject *errorHandler = NULL;
5650 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005651 Py_ssize_t ascii_length;
5652 Py_ssize_t i;
5653 int kind;
5654 void *data;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005655
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005656 ascii_length = length_of_escaped_ascii_string(s, size);
5657
5658 /* After length_of_escaped_ascii_string() there are two alternatives,
5659 either the string is pure ASCII with named escapes like \n, etc.
5660 and we determined it's exact size (common case)
5661 or it contains \x, \u, ... escape sequences. then we create a
5662 legacy wchar string and resize it at the end of this function. */
5663 if (ascii_length >= 0) {
5664 v = (PyUnicodeObject *)PyUnicode_New(ascii_length, 127);
5665 if (!v)
5666 goto onError;
5667 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
5668 kind = PyUnicode_1BYTE_KIND;
5669 data = PyUnicode_DATA(v);
5670 }
5671 else {
5672 /* Escaped strings will always be longer than the resulting
5673 Unicode string, so we start with size here and then reduce the
5674 length after conversion to the true value.
5675 (but if the error callback returns a long replacement string
5676 we'll have to allocate more space) */
5677 v = _PyUnicode_New(size);
5678 if (!v)
5679 goto onError;
5680 kind = PyUnicode_WCHAR_KIND;
5681 data = PyUnicode_AS_UNICODE(v);
5682 }
5683
Guido van Rossumd57fd912000-03-10 22:53:23 +00005684 if (size == 0)
5685 return (PyObject *)v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005686 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005687 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005688
Guido van Rossumd57fd912000-03-10 22:53:23 +00005689 while (s < end) {
5690 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005691 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005692 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005693
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005694 if (kind == PyUnicode_WCHAR_KIND) {
5695 assert(i < _PyUnicode_WSTR_LENGTH(v));
5696 }
5697 else {
5698 /* The only case in which i == ascii_length is a backslash
5699 followed by a newline. */
5700 assert(i <= ascii_length);
5701 }
5702
Guido van Rossumd57fd912000-03-10 22:53:23 +00005703 /* Non-escape characters are interpreted as Unicode ordinals */
5704 if (*s != '\\') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005705 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char) *s++);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005706 continue;
5707 }
5708
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005709 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005710 /* \ - Escapes */
5711 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005712 c = *s++;
5713 if (s > end)
5714 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005715
5716 if (kind == PyUnicode_WCHAR_KIND) {
5717 assert(i < _PyUnicode_WSTR_LENGTH(v));
5718 }
5719 else {
5720 /* The only case in which i == ascii_length is a backslash
5721 followed by a newline. */
5722 assert(i < ascii_length || (i == ascii_length && c == '\n'));
5723 }
5724
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005725 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005726
Benjamin Peterson29060642009-01-31 22:14:21 +00005727 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005728 case '\n': break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005729 case '\\': WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); break;
5730 case '\'': WRITE_ASCII_OR_WSTR(kind, data, i++, '\''); break;
5731 case '\"': WRITE_ASCII_OR_WSTR(kind, data, i++, '\"'); break;
5732 case 'b': WRITE_ASCII_OR_WSTR(kind, data, i++, '\b'); break;
5733 /* FF */
5734 case 'f': WRITE_ASCII_OR_WSTR(kind, data, i++, '\014'); break;
5735 case 't': WRITE_ASCII_OR_WSTR(kind, data, i++, '\t'); break;
5736 case 'n': WRITE_ASCII_OR_WSTR(kind, data, i++, '\n'); break;
5737 case 'r': WRITE_ASCII_OR_WSTR(kind, data, i++, '\r'); break;
5738 /* VT */
5739 case 'v': WRITE_ASCII_OR_WSTR(kind, data, i++, '\013'); break;
5740 /* BEL, not classic C */
5741 case 'a': WRITE_ASCII_OR_WSTR(kind, data, i++, '\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005742
Benjamin Peterson29060642009-01-31 22:14:21 +00005743 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005744 case '0': case '1': case '2': case '3':
5745 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005746 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005747 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005748 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005749 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005750 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005751 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005752 WRITE_WSTR(data, i++, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005753 break;
5754
Benjamin Peterson29060642009-01-31 22:14:21 +00005755 /* hex escapes */
5756 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005757 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005758 digits = 2;
5759 message = "truncated \\xXX escape";
5760 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005761
Benjamin Peterson29060642009-01-31 22:14:21 +00005762 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005763 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005764 digits = 4;
5765 message = "truncated \\uXXXX escape";
5766 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005767
Benjamin Peterson29060642009-01-31 22:14:21 +00005768 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005769 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005770 digits = 8;
5771 message = "truncated \\UXXXXXXXX escape";
5772 hexescape:
5773 chr = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005774 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005775 if (s+digits>end) {
5776 endinpos = size;
5777 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005778 errors, &errorHandler,
5779 "unicodeescape", "end of string in escape sequence",
5780 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005781 &v, &i, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005782 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005783 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005784 goto nextByte;
5785 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005786 for (j = 0; j < digits; ++j) {
5787 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005788 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005789 endinpos = (s+j+1)-starts;
5790 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005791 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005792 errors, &errorHandler,
5793 "unicodeescape", message,
5794 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005795 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005796 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005797 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005798 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005799 }
5800 chr = (chr<<4) & ~0xF;
5801 if (c >= '0' && c <= '9')
5802 chr += c - '0';
5803 else if (c >= 'a' && c <= 'f')
5804 chr += 10 + c - 'a';
5805 else
5806 chr += 10 + c - 'A';
5807 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005808 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005809 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005810 /* _decoding_error will have already written into the
5811 target buffer. */
5812 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005813 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005814 /* when we get here, chr is a 32-bit unicode character */
5815 if (chr <= 0xffff)
5816 /* UCS-2 character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005817 WRITE_WSTR(data, i++, chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005818 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005819 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00005820 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00005821#ifdef Py_UNICODE_WIDE
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005822 WRITE_WSTR(data, i++, chr);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005823#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00005824 chr -= 0x10000L;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005825 WRITE_WSTR(data, i++, 0xD800 + (Py_UNICODE) (chr >> 10));
5826 WRITE_WSTR(data, i++, 0xDC00 + (Py_UNICODE) (chr & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005827#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00005828 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005829 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005830 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005831 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005832 errors, &errorHandler,
5833 "unicodeescape", "illegal Unicode character",
5834 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005835 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005836 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005837 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005838 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005839 break;
5840
Benjamin Peterson29060642009-01-31 22:14:21 +00005841 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005842 case 'N':
5843 message = "malformed \\N character escape";
5844 if (ucnhash_CAPI == NULL) {
5845 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005846 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5847 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005848 if (ucnhash_CAPI == NULL)
5849 goto ucnhashError;
5850 }
5851 if (*s == '{') {
5852 const char *start = s+1;
5853 /* look for the closing brace */
5854 while (*s != '}' && s < end)
5855 s++;
5856 if (s > start && s < end && *s == '}') {
5857 /* found a name. look it up in the unicode database */
5858 message = "unknown Unicode character name";
5859 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005860 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
5861 &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005862 goto store;
5863 }
5864 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005865 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005866 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005867 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005868 errors, &errorHandler,
5869 "unicodeescape", message,
5870 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005871 &v, &i, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005872 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005873 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005874 break;
5875
5876 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005877 if (s > end) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005878 assert(kind == PyUnicode_WCHAR_KIND);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005879 message = "\\ at end of string";
5880 s--;
5881 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005882 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005883 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005884 errors, &errorHandler,
5885 "unicodeescape", message,
5886 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005887 &v, &i, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00005888 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005889 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005890 }
5891 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005892 WRITE_ASCII_OR_WSTR(kind, data, i++, '\\');
5893 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005894 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005895 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005896 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005897 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005898 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005899 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005900 /* Ensure the length prediction worked in case of ASCII strings */
5901 assert(kind == PyUnicode_WCHAR_KIND || i == ascii_length);
5902
Victor Stinnerfe226c02011-10-03 03:52:20 +02005903 if (kind == PyUnicode_WCHAR_KIND)
5904 {
5905 if (PyUnicode_Resize((PyObject**)&v, i) < 0)
5906 goto onError;
Victor Stinnerfe226c02011-10-03 03:52:20 +02005907 }
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005908 Py_XDECREF(errorHandler);
5909 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02005910#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005911 if (_PyUnicode_READY_REPLACE(&v)) {
5912 Py_DECREF(v);
5913 return NULL;
5914 }
Victor Stinner17efeed2011-10-04 20:05:46 +02005915#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02005916 assert(_PyUnicode_CheckConsistency(v, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005917 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00005918
Benjamin Peterson29060642009-01-31 22:14:21 +00005919 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005920 PyErr_SetString(
5921 PyExc_UnicodeError,
5922 "\\N escapes not supported (can't load unicodedata module)"
5923 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005924 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005925 Py_XDECREF(errorHandler);
5926 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005927 return NULL;
5928
Benjamin Peterson29060642009-01-31 22:14:21 +00005929 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005930 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005931 Py_XDECREF(errorHandler);
5932 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005933 return NULL;
5934}
5935
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005936#undef WRITE_ASCII_OR_WSTR
5937#undef WRITE_WSTR
5938
Guido van Rossumd57fd912000-03-10 22:53:23 +00005939/* Return a Unicode-Escape string version of the Unicode object.
5940
5941 If quotes is true, the string is enclosed in u"" or u'' quotes as
5942 appropriate.
5943
5944*/
5945
Alexander Belopolsky40018472011-02-26 01:02:56 +00005946PyObject *
5947PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005948 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005949{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005950 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005951 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005952
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005953#ifdef Py_UNICODE_WIDE
5954 const Py_ssize_t expandsize = 10;
5955#else
5956 const Py_ssize_t expandsize = 6;
5957#endif
5958
Thomas Wouters89f507f2006-12-13 04:49:30 +00005959 /* XXX(nnorwitz): rather than over-allocating, it would be
5960 better to choose a different scheme. Perhaps scan the
5961 first N-chars of the string and allocate based on that size.
5962 */
5963 /* Initial allocation is based on the longest-possible unichr
5964 escape.
5965
5966 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5967 unichr, so in this case it's the longest unichr escape. In
5968 narrow (UTF-16) builds this is five chars per source unichr
5969 since there are two unichrs in the surrogate pair, so in narrow
5970 (UTF-16) builds it's not the longest unichr escape.
5971
5972 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5973 so in the narrow (UTF-16) build case it's the longest unichr
5974 escape.
5975 */
5976
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005977 if (size == 0)
5978 return PyBytes_FromStringAndSize(NULL, 0);
5979
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005980 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005981 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005982
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005983 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005984 2
5985 + expandsize*size
5986 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005987 if (repr == NULL)
5988 return NULL;
5989
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005990 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005991
Guido van Rossumd57fd912000-03-10 22:53:23 +00005992 while (size-- > 0) {
5993 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005994
Walter Dörwald79e913e2007-05-12 11:08:06 +00005995 /* Escape backslashes */
5996 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005997 *p++ = '\\';
5998 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005999 continue;
Tim Petersced69f82003-09-16 20:30:58 +00006000 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006001
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00006002#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006003 /* Map 21-bit characters to '\U00xxxxxx' */
6004 else if (ch >= 0x10000) {
6005 *p++ = '\\';
6006 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006007 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
6008 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
6009 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6010 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6011 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6012 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6013 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6014 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00006015 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006016 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00006017#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006018 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
6019 else if (ch >= 0xD800 && ch < 0xDC00) {
6020 Py_UNICODE ch2;
6021 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00006022
Benjamin Peterson29060642009-01-31 22:14:21 +00006023 ch2 = *s++;
6024 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00006025 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006026 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
6027 *p++ = '\\';
6028 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006029 *p++ = Py_hexdigits[(ucs >> 28) & 0x0000000F];
6030 *p++ = Py_hexdigits[(ucs >> 24) & 0x0000000F];
6031 *p++ = Py_hexdigits[(ucs >> 20) & 0x0000000F];
6032 *p++ = Py_hexdigits[(ucs >> 16) & 0x0000000F];
6033 *p++ = Py_hexdigits[(ucs >> 12) & 0x0000000F];
6034 *p++ = Py_hexdigits[(ucs >> 8) & 0x0000000F];
6035 *p++ = Py_hexdigits[(ucs >> 4) & 0x0000000F];
6036 *p++ = Py_hexdigits[ucs & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00006037 continue;
6038 }
6039 /* Fall through: isolated surrogates are copied as-is */
6040 s--;
6041 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006042 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00006043#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00006044
Guido van Rossumd57fd912000-03-10 22:53:23 +00006045 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00006046 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006047 *p++ = '\\';
6048 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006049 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6050 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6051 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6052 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006053 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006054
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006055 /* Map special whitespace to '\t', \n', '\r' */
6056 else if (ch == '\t') {
6057 *p++ = '\\';
6058 *p++ = 't';
6059 }
6060 else if (ch == '\n') {
6061 *p++ = '\\';
6062 *p++ = 'n';
6063 }
6064 else if (ch == '\r') {
6065 *p++ = '\\';
6066 *p++ = 'r';
6067 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006068
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006069 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00006070 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006071 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006072 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006073 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6074 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00006075 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006076
Guido van Rossumd57fd912000-03-10 22:53:23 +00006077 /* Copy everything else as-is */
6078 else
6079 *p++ = (char) ch;
6080 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006081
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006082 assert(p - PyBytes_AS_STRING(repr) > 0);
6083 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
6084 return NULL;
6085 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006086}
6087
Alexander Belopolsky40018472011-02-26 01:02:56 +00006088PyObject *
6089PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006090{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006091 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006092 if (!PyUnicode_Check(unicode)) {
6093 PyErr_BadArgument();
6094 return NULL;
6095 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00006096 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
6097 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006098 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006099}
6100
6101/* --- Raw Unicode Escape Codec ------------------------------------------- */
6102
Alexander Belopolsky40018472011-02-26 01:02:56 +00006103PyObject *
6104PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006105 Py_ssize_t size,
6106 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006107{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006108 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006109 Py_ssize_t startinpos;
6110 Py_ssize_t endinpos;
6111 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006112 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006113 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006114 const char *end;
6115 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006116 PyObject *errorHandler = NULL;
6117 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006118
Guido van Rossumd57fd912000-03-10 22:53:23 +00006119 /* Escaped strings will always be longer than the resulting
6120 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006121 length after conversion to the true value. (But decoding error
6122 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006123 v = _PyUnicode_New(size);
6124 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006125 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006126 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006127 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006128 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006129 end = s + size;
6130 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006131 unsigned char c;
6132 Py_UCS4 x;
6133 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006134 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006135
Benjamin Peterson29060642009-01-31 22:14:21 +00006136 /* Non-escape characters are interpreted as Unicode ordinals */
6137 if (*s != '\\') {
6138 *p++ = (unsigned char)*s++;
6139 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006140 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006141 startinpos = s-starts;
6142
6143 /* \u-escapes are only interpreted iff the number of leading
6144 backslashes if odd */
6145 bs = s;
6146 for (;s < end;) {
6147 if (*s != '\\')
6148 break;
6149 *p++ = (unsigned char)*s++;
6150 }
6151 if (((s - bs) & 1) == 0 ||
6152 s >= end ||
6153 (*s != 'u' && *s != 'U')) {
6154 continue;
6155 }
6156 p--;
6157 count = *s=='u' ? 4 : 8;
6158 s++;
6159
6160 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
6161 outpos = p-PyUnicode_AS_UNICODE(v);
6162 for (x = 0, i = 0; i < count; ++i, ++s) {
6163 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006164 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006165 endinpos = s-starts;
6166 if (unicode_decode_call_errorhandler(
6167 errors, &errorHandler,
6168 "rawunicodeescape", "truncated \\uXXXX",
6169 &starts, &end, &startinpos, &endinpos, &exc, &s,
6170 &v, &outpos, &p))
6171 goto onError;
6172 goto nextByte;
6173 }
6174 x = (x<<4) & ~0xF;
6175 if (c >= '0' && c <= '9')
6176 x += c - '0';
6177 else if (c >= 'a' && c <= 'f')
6178 x += 10 + c - 'a';
6179 else
6180 x += 10 + c - 'A';
6181 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00006182 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00006183 /* UCS-2 character */
6184 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006185 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006186 /* UCS-4 character. Either store directly, or as
6187 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00006188#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006189 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006190#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006191 x -= 0x10000L;
6192 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
6193 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00006194#endif
6195 } else {
6196 endinpos = s-starts;
6197 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006198 if (unicode_decode_call_errorhandler(
6199 errors, &errorHandler,
6200 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006201 &starts, &end, &startinpos, &endinpos, &exc, &s,
6202 &v, &outpos, &p))
6203 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006204 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006205 nextByte:
6206 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006207 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02006208 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006209 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006210 Py_XDECREF(errorHandler);
6211 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02006212#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006213 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006214 Py_DECREF(v);
6215 return NULL;
6216 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006217#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006218 assert(_PyUnicode_CheckConsistency(v, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006219 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006220
Benjamin Peterson29060642009-01-31 22:14:21 +00006221 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006222 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006223 Py_XDECREF(errorHandler);
6224 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006225 return NULL;
6226}
6227
Alexander Belopolsky40018472011-02-26 01:02:56 +00006228PyObject *
6229PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006230 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006231{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006232 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006233 char *p;
6234 char *q;
6235
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006236#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006237 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006238#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006239 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006240#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00006241
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006242 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006243 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006244
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006245 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006246 if (repr == NULL)
6247 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00006248 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006249 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006250
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006251 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006252 while (size-- > 0) {
6253 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006254#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006255 /* Map 32-bit characters to '\Uxxxxxxxx' */
6256 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006257 *p++ = '\\';
6258 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006259 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6260 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6261 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6262 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6263 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6264 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6265 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6266 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006267 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006268 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00006269#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006270 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
6271 if (ch >= 0xD800 && ch < 0xDC00) {
6272 Py_UNICODE ch2;
6273 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006274
Benjamin Peterson29060642009-01-31 22:14:21 +00006275 ch2 = *s++;
6276 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00006277 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006278 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
6279 *p++ = '\\';
6280 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006281 *p++ = Py_hexdigits[(ucs >> 28) & 0xf];
6282 *p++ = Py_hexdigits[(ucs >> 24) & 0xf];
6283 *p++ = Py_hexdigits[(ucs >> 20) & 0xf];
6284 *p++ = Py_hexdigits[(ucs >> 16) & 0xf];
6285 *p++ = Py_hexdigits[(ucs >> 12) & 0xf];
6286 *p++ = Py_hexdigits[(ucs >> 8) & 0xf];
6287 *p++ = Py_hexdigits[(ucs >> 4) & 0xf];
6288 *p++ = Py_hexdigits[ucs & 0xf];
Benjamin Peterson29060642009-01-31 22:14:21 +00006289 continue;
6290 }
6291 /* Fall through: isolated surrogates are copied as-is */
6292 s--;
6293 size++;
6294 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006295#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00006296 /* Map 16-bit characters to '\uxxxx' */
6297 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006298 *p++ = '\\';
6299 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006300 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6301 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6302 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6303 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006304 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006305 /* Copy everything else as-is */
6306 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006307 *p++ = (char) ch;
6308 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006309 size = p - q;
6310
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006311 assert(size > 0);
6312 if (_PyBytes_Resize(&repr, size) < 0)
6313 return NULL;
6314 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006315}
6316
Alexander Belopolsky40018472011-02-26 01:02:56 +00006317PyObject *
6318PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006319{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006320 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006321 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00006322 PyErr_BadArgument();
6323 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006324 }
Walter Dörwald711005d2007-05-12 12:03:26 +00006325 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
6326 PyUnicode_GET_SIZE(unicode));
6327
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006328 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006329}
6330
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006331/* --- Unicode Internal Codec ------------------------------------------- */
6332
Alexander Belopolsky40018472011-02-26 01:02:56 +00006333PyObject *
6334_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006335 Py_ssize_t size,
6336 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006337{
6338 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006339 Py_ssize_t startinpos;
6340 Py_ssize_t endinpos;
6341 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006342 PyUnicodeObject *v;
6343 Py_UNICODE *p;
6344 const char *end;
6345 const char *reason;
6346 PyObject *errorHandler = NULL;
6347 PyObject *exc = NULL;
6348
Neal Norwitzd43069c2006-01-08 01:12:10 +00006349#ifdef Py_UNICODE_WIDE
6350 Py_UNICODE unimax = PyUnicode_GetMax();
6351#endif
6352
Thomas Wouters89f507f2006-12-13 04:49:30 +00006353 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006354 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
6355 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006356 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006357 /* Intentionally PyUnicode_GET_SIZE instead of PyUnicode_GET_LENGTH
6358 as string was created with the old API. */
6359 if (PyUnicode_GET_SIZE(v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006360 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006361 p = PyUnicode_AS_UNICODE(v);
6362 end = s + size;
6363
6364 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006365 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006366 /* We have to sanity check the raw data, otherwise doom looms for
6367 some malformed UCS-4 data. */
6368 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006369#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006370 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006371#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006372 end-s < Py_UNICODE_SIZE
6373 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006374 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006375 startinpos = s - starts;
6376 if (end-s < Py_UNICODE_SIZE) {
6377 endinpos = end-starts;
6378 reason = "truncated input";
6379 }
6380 else {
6381 endinpos = s - starts + Py_UNICODE_SIZE;
6382 reason = "illegal code point (> 0x10FFFF)";
6383 }
6384 outpos = p - PyUnicode_AS_UNICODE(v);
6385 if (unicode_decode_call_errorhandler(
6386 errors, &errorHandler,
6387 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006388 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00006389 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006390 goto onError;
6391 }
6392 }
6393 else {
6394 p++;
6395 s += Py_UNICODE_SIZE;
6396 }
6397 }
6398
Victor Stinnerfe226c02011-10-03 03:52:20 +02006399 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006400 goto onError;
6401 Py_XDECREF(errorHandler);
6402 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02006403#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006404 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006405 Py_DECREF(v);
6406 return NULL;
6407 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006408#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006409 assert(_PyUnicode_CheckConsistency(v, 1));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006410 return (PyObject *)v;
6411
Benjamin Peterson29060642009-01-31 22:14:21 +00006412 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006413 Py_XDECREF(v);
6414 Py_XDECREF(errorHandler);
6415 Py_XDECREF(exc);
6416 return NULL;
6417}
6418
Guido van Rossumd57fd912000-03-10 22:53:23 +00006419/* --- Latin-1 Codec ------------------------------------------------------ */
6420
Alexander Belopolsky40018472011-02-26 01:02:56 +00006421PyObject *
6422PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006423 Py_ssize_t size,
6424 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006425{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006426 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006427 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006428}
6429
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006430/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006431static void
6432make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006433 const char *encoding,
6434 const Py_UNICODE *unicode, Py_ssize_t size,
6435 Py_ssize_t startpos, Py_ssize_t endpos,
6436 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006437{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006438 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006439 *exceptionObject = PyUnicodeEncodeError_Create(
6440 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006441 }
6442 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006443 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6444 goto onError;
6445 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6446 goto onError;
6447 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6448 goto onError;
6449 return;
6450 onError:
6451 Py_DECREF(*exceptionObject);
6452 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006453 }
6454}
6455
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006456/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006457static void
6458raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006459 const char *encoding,
6460 const Py_UNICODE *unicode, Py_ssize_t size,
6461 Py_ssize_t startpos, Py_ssize_t endpos,
6462 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006463{
6464 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006465 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006466 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006467 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006468}
6469
6470/* error handling callback helper:
6471 build arguments, call the callback and check the arguments,
6472 put the result into newpos and return the replacement string, which
6473 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006474static PyObject *
6475unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006476 PyObject **errorHandler,
6477 const char *encoding, const char *reason,
6478 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
6479 Py_ssize_t startpos, Py_ssize_t endpos,
6480 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006481{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006482 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006483
6484 PyObject *restuple;
6485 PyObject *resunicode;
6486
6487 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006488 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006489 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006490 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006491 }
6492
6493 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006494 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006495 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006496 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006497
6498 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006499 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006500 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006501 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006502 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006503 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006504 Py_DECREF(restuple);
6505 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006506 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006507 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006508 &resunicode, newpos)) {
6509 Py_DECREF(restuple);
6510 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006511 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006512 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6513 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6514 Py_DECREF(restuple);
6515 return NULL;
6516 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006517 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006518 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006519 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006520 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6521 Py_DECREF(restuple);
6522 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006523 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006524 Py_INCREF(resunicode);
6525 Py_DECREF(restuple);
6526 return resunicode;
6527}
6528
Alexander Belopolsky40018472011-02-26 01:02:56 +00006529static PyObject *
6530unicode_encode_ucs1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006531 Py_ssize_t size,
6532 const char *errors,
6533 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006534{
6535 /* output object */
6536 PyObject *res;
6537 /* pointers to the beginning and end+1 of input */
6538 const Py_UNICODE *startp = p;
6539 const Py_UNICODE *endp = p + size;
6540 /* pointer to the beginning of the unencodable characters */
6541 /* const Py_UNICODE *badp = NULL; */
6542 /* pointer into the output */
6543 char *str;
6544 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006545 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006546 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6547 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006548 PyObject *errorHandler = NULL;
6549 PyObject *exc = NULL;
6550 /* the following variable is used for caching string comparisons
6551 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6552 int known_errorHandler = -1;
6553
6554 /* allocate enough for a simple encoding without
6555 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006556 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006557 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006558 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006559 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006560 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006561 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006562 ressize = size;
6563
6564 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006565 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006566
Benjamin Peterson29060642009-01-31 22:14:21 +00006567 /* can we encode this? */
6568 if (c<limit) {
6569 /* no overflow check, because we know that the space is enough */
6570 *str++ = (char)c;
6571 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006572 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006573 else {
6574 Py_ssize_t unicodepos = p-startp;
6575 Py_ssize_t requiredsize;
6576 PyObject *repunicode;
6577 Py_ssize_t repsize;
6578 Py_ssize_t newpos;
6579 Py_ssize_t respos;
6580 Py_UNICODE *uni2;
6581 /* startpos for collecting unencodable chars */
6582 const Py_UNICODE *collstart = p;
6583 const Py_UNICODE *collend = p;
6584 /* find all unecodable characters */
6585 while ((collend < endp) && ((*collend)>=limit))
6586 ++collend;
6587 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6588 if (known_errorHandler==-1) {
6589 if ((errors==NULL) || (!strcmp(errors, "strict")))
6590 known_errorHandler = 1;
6591 else if (!strcmp(errors, "replace"))
6592 known_errorHandler = 2;
6593 else if (!strcmp(errors, "ignore"))
6594 known_errorHandler = 3;
6595 else if (!strcmp(errors, "xmlcharrefreplace"))
6596 known_errorHandler = 4;
6597 else
6598 known_errorHandler = 0;
6599 }
6600 switch (known_errorHandler) {
6601 case 1: /* strict */
6602 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
6603 goto onError;
6604 case 2: /* replace */
6605 while (collstart++<collend)
6606 *str++ = '?'; /* fall through */
6607 case 3: /* ignore */
6608 p = collend;
6609 break;
6610 case 4: /* xmlcharrefreplace */
6611 respos = str - PyBytes_AS_STRING(res);
6612 /* determine replacement size (temporarily (mis)uses p) */
6613 for (p = collstart, repsize = 0; p < collend; ++p) {
6614 if (*p<10)
6615 repsize += 2+1+1;
6616 else if (*p<100)
6617 repsize += 2+2+1;
6618 else if (*p<1000)
6619 repsize += 2+3+1;
6620 else if (*p<10000)
6621 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006622#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006623 else
6624 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006625#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006626 else if (*p<100000)
6627 repsize += 2+5+1;
6628 else if (*p<1000000)
6629 repsize += 2+6+1;
6630 else
6631 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00006632#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00006633 }
6634 requiredsize = respos+repsize+(endp-collend);
6635 if (requiredsize > ressize) {
6636 if (requiredsize<2*ressize)
6637 requiredsize = 2*ressize;
6638 if (_PyBytes_Resize(&res, requiredsize))
6639 goto onError;
6640 str = PyBytes_AS_STRING(res) + respos;
6641 ressize = requiredsize;
6642 }
6643 /* generate replacement (temporarily (mis)uses p) */
6644 for (p = collstart; p < collend; ++p) {
6645 str += sprintf(str, "&#%d;", (int)*p);
6646 }
6647 p = collend;
6648 break;
6649 default:
6650 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6651 encoding, reason, startp, size, &exc,
6652 collstart-startp, collend-startp, &newpos);
6653 if (repunicode == NULL)
6654 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006655 if (PyBytes_Check(repunicode)) {
6656 /* Directly copy bytes result to output. */
6657 repsize = PyBytes_Size(repunicode);
6658 if (repsize > 1) {
6659 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006660 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006661 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6662 Py_DECREF(repunicode);
6663 goto onError;
6664 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006665 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006666 ressize += repsize-1;
6667 }
6668 memcpy(str, PyBytes_AsString(repunicode), repsize);
6669 str += repsize;
6670 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006671 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006672 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006673 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006674 /* need more space? (at least enough for what we
6675 have+the replacement+the rest of the string, so
6676 we won't have to check space for encodable characters) */
6677 respos = str - PyBytes_AS_STRING(res);
6678 repsize = PyUnicode_GET_SIZE(repunicode);
6679 requiredsize = respos+repsize+(endp-collend);
6680 if (requiredsize > ressize) {
6681 if (requiredsize<2*ressize)
6682 requiredsize = 2*ressize;
6683 if (_PyBytes_Resize(&res, requiredsize)) {
6684 Py_DECREF(repunicode);
6685 goto onError;
6686 }
6687 str = PyBytes_AS_STRING(res) + respos;
6688 ressize = requiredsize;
6689 }
6690 /* check if there is anything unencodable in the replacement
6691 and copy it to the output */
6692 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
6693 c = *uni2;
6694 if (c >= limit) {
6695 raise_encode_exception(&exc, encoding, startp, size,
6696 unicodepos, unicodepos+1, reason);
6697 Py_DECREF(repunicode);
6698 goto onError;
6699 }
6700 *str = (char)c;
6701 }
6702 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006703 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006704 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006705 }
6706 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006707 /* Resize if we allocated to much */
6708 size = str - PyBytes_AS_STRING(res);
6709 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006710 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006711 if (_PyBytes_Resize(&res, size) < 0)
6712 goto onError;
6713 }
6714
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006715 Py_XDECREF(errorHandler);
6716 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006717 return res;
6718
6719 onError:
6720 Py_XDECREF(res);
6721 Py_XDECREF(errorHandler);
6722 Py_XDECREF(exc);
6723 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006724}
6725
Alexander Belopolsky40018472011-02-26 01:02:56 +00006726PyObject *
6727PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006728 Py_ssize_t size,
6729 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006730{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006731 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006732}
6733
Alexander Belopolsky40018472011-02-26 01:02:56 +00006734PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006735_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006736{
6737 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006738 PyErr_BadArgument();
6739 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006740 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006741 if (PyUnicode_READY(unicode) == -1)
6742 return NULL;
6743 /* Fast path: if it is a one-byte string, construct
6744 bytes object directly. */
6745 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6746 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6747 PyUnicode_GET_LENGTH(unicode));
6748 /* Non-Latin-1 characters present. Defer to above function to
6749 raise the exception. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006750 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006751 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006752 errors);
6753}
6754
6755PyObject*
6756PyUnicode_AsLatin1String(PyObject *unicode)
6757{
6758 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006759}
6760
6761/* --- 7-bit ASCII Codec -------------------------------------------------- */
6762
Alexander Belopolsky40018472011-02-26 01:02:56 +00006763PyObject *
6764PyUnicode_DecodeASCII(const char *s,
6765 Py_ssize_t size,
6766 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006767{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006768 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006769 PyUnicodeObject *v;
Victor Stinner702c7342011-10-05 13:50:52 +02006770 Py_UNICODE *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006771 Py_ssize_t startinpos;
6772 Py_ssize_t endinpos;
6773 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006774 const char *e;
Victor Stinner702c7342011-10-05 13:50:52 +02006775 int has_error;
6776 const unsigned char *p = (const unsigned char *)s;
6777 const unsigned char *end = p + size;
6778 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006779 PyObject *errorHandler = NULL;
6780 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006781
Guido van Rossumd57fd912000-03-10 22:53:23 +00006782 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006783 if (size == 1 && (unsigned char)s[0] < 128)
6784 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006785
Victor Stinner702c7342011-10-05 13:50:52 +02006786 has_error = 0;
6787 while (p < end && !has_error) {
6788 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
6789 an explanation. */
6790 if (!((size_t) p & LONG_PTR_MASK)) {
6791 /* Help register allocation */
6792 register const unsigned char *_p = p;
6793 while (_p < aligned_end) {
6794 unsigned long value = *(unsigned long *) _p;
6795 if (value & ASCII_CHAR_MASK) {
6796 has_error = 1;
6797 break;
6798 }
6799 _p += SIZEOF_LONG;
6800 }
6801 if (_p == end)
6802 break;
6803 if (has_error)
6804 break;
6805 p = _p;
6806 }
6807 if (*p & 0x80) {
6808 has_error = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006809 break;
Victor Stinner702c7342011-10-05 13:50:52 +02006810 }
6811 else {
6812 ++p;
6813 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006814 }
Victor Stinner702c7342011-10-05 13:50:52 +02006815 if (!has_error)
6816 return unicode_fromascii((const unsigned char *)s, size);
Tim Petersced69f82003-09-16 20:30:58 +00006817
Guido van Rossumd57fd912000-03-10 22:53:23 +00006818 v = _PyUnicode_New(size);
6819 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006820 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006821 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006822 return (PyObject *)v;
Victor Stinner702c7342011-10-05 13:50:52 +02006823 u = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006824 e = s + size;
6825 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006826 register unsigned char c = (unsigned char)*s;
6827 if (c < 128) {
Victor Stinner702c7342011-10-05 13:50:52 +02006828 *u++ = c;
Benjamin Peterson29060642009-01-31 22:14:21 +00006829 ++s;
6830 }
6831 else {
6832 startinpos = s-starts;
6833 endinpos = startinpos + 1;
Victor Stinner702c7342011-10-05 13:50:52 +02006834 outpos = u - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006835 if (unicode_decode_call_errorhandler(
6836 errors, &errorHandler,
6837 "ascii", "ordinal not in range(128)",
6838 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinner702c7342011-10-05 13:50:52 +02006839 &v, &outpos, &u))
Benjamin Peterson29060642009-01-31 22:14:21 +00006840 goto onError;
6841 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006842 }
Victor Stinner702c7342011-10-05 13:50:52 +02006843 if (u - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
6844 if (PyUnicode_Resize((PyObject**)&v, u - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006845 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006846 Py_XDECREF(errorHandler);
6847 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02006848#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006849 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006850 Py_DECREF(v);
6851 return NULL;
6852 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006853#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006854 assert(_PyUnicode_CheckConsistency(v, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006855 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006856
Benjamin Peterson29060642009-01-31 22:14:21 +00006857 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006858 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006859 Py_XDECREF(errorHandler);
6860 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006861 return NULL;
6862}
6863
Alexander Belopolsky40018472011-02-26 01:02:56 +00006864PyObject *
6865PyUnicode_EncodeASCII(const Py_UNICODE *p,
6866 Py_ssize_t size,
6867 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006868{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006869 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006870}
6871
Alexander Belopolsky40018472011-02-26 01:02:56 +00006872PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006873_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006874{
6875 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006876 PyErr_BadArgument();
6877 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006878 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006879 if (PyUnicode_READY(unicode) == -1)
6880 return NULL;
6881 /* Fast path: if it is an ASCII-only string, construct bytes object
6882 directly. Else defer to above function to raise the exception. */
6883 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6884 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6885 PyUnicode_GET_LENGTH(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006886 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006887 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006888 errors);
6889}
6890
6891PyObject *
6892PyUnicode_AsASCIIString(PyObject *unicode)
6893{
6894 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006895}
6896
Victor Stinner99b95382011-07-04 14:23:54 +02006897#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006898
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006899/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006900
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006901#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006902#define NEED_RETRY
6903#endif
6904
6905/* XXX This code is limited to "true" double-byte encodings, as
6906 a) it assumes an incomplete character consists of a single byte, and
6907 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00006908 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006909
/* Return non-zero when the byte at s[offset] is a DBCS lead byte that
   starts a character, 0 otherwise.

   IsDBCSLeadByte() alone is not enough, so the previous character
   boundary reported by CharPrev() is checked as well: the byte counts
   as a lead byte when there is no previous character, when the previous
   character does not begin with a lead byte, or when the previous
   character is exactly two bytes long. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    return (prev == curr) || !IsDBCSLeadByte(*prev) || (curr - prev == 2);
}
6921
6922/*
6923 * Decode MBCS string into unicode object. If 'final' is set, converts
6924 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
6925 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006926static int
6927decode_mbcs(PyUnicodeObject **v,
6928 const char *s, /* MBCS string */
6929 int size, /* sizeof MBCS string */
6930 int final,
6931 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006932{
6933 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00006934 Py_ssize_t n;
6935 DWORD usize;
6936 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006937
6938 assert(size >= 0);
6939
Victor Stinner554f3f02010-06-16 23:33:54 +00006940 /* check and handle 'errors' arg */
6941 if (errors==NULL || strcmp(errors, "strict")==0)
6942 flags = MB_ERR_INVALID_CHARS;
6943 else if (strcmp(errors, "ignore")==0)
6944 flags = 0;
6945 else {
6946 PyErr_Format(PyExc_ValueError,
6947 "mbcs encoding does not support errors='%s'",
6948 errors);
6949 return -1;
6950 }
6951
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006952 /* Skip trailing lead-byte unless 'final' is set */
6953 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006954 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006955
6956 /* First get the size of the result */
6957 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006958 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
6959 if (usize==0)
6960 goto mbcs_decode_error;
6961 } else
6962 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006963
6964 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006965 /* Create unicode object */
6966 *v = _PyUnicode_New(usize);
6967 if (*v == NULL)
6968 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006969 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006970 }
6971 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006972 /* Extend unicode object */
6973 n = PyUnicode_GET_SIZE(*v);
Victor Stinner2fd82272011-10-03 04:06:05 +02006974 if (PyUnicode_Resize((PyObject**)v, n + usize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006975 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006976 }
6977
6978 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00006979 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006980 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006981 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
6982 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00006983 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006984 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006985 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00006986
6987mbcs_decode_error:
6988 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
6989 we raise a UnicodeDecodeError - else it is a 'generic'
6990 windows error
6991 */
6992 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
6993 /* Ideally, we should get reason from FormatMessage - this
6994 is the Windows 2000 English version of the message
6995 */
6996 PyObject *exc = NULL;
6997 const char *reason = "No mapping for the Unicode character exists "
6998 "in the target multi-byte code page.";
6999 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
7000 if (exc != NULL) {
7001 PyCodec_StrictErrors(exc);
7002 Py_DECREF(exc);
7003 }
7004 } else {
7005 PyErr_SetFromWindowsErrWithFilename(0, NULL);
7006 }
7007 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007008}
7009
Alexander Belopolsky40018472011-02-26 01:02:56 +00007010PyObject *
7011PyUnicode_DecodeMBCSStateful(const char *s,
7012 Py_ssize_t size,
7013 const char *errors,
7014 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007015{
7016 PyUnicodeObject *v = NULL;
7017 int done;
7018
7019 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007020 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007021
7022#ifdef NEED_RETRY
7023 retry:
7024 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00007025 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007026 else
7027#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00007028 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007029
7030 if (done < 0) {
7031 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00007032 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007033 }
7034
7035 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007036 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007037
7038#ifdef NEED_RETRY
7039 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007040 s += done;
7041 size -= done;
7042 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007043 }
7044#endif
Victor Stinner17efeed2011-10-04 20:05:46 +02007045#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02007046 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007047 Py_DECREF(v);
7048 return NULL;
7049 }
Victor Stinner17efeed2011-10-04 20:05:46 +02007050#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02007051 assert(_PyUnicode_CheckConsistency(v, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007052 return (PyObject *)v;
7053}
7054
Alexander Belopolsky40018472011-02-26 01:02:56 +00007055PyObject *
7056PyUnicode_DecodeMBCS(const char *s,
7057 Py_ssize_t size,
7058 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007059{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007060 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7061}
7062
7063/*
7064 * Convert unicode into string object (MBCS).
7065 * Returns 0 if succeed, -1 otherwise.
7066 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007067static int
7068encode_mbcs(PyObject **repr,
7069 const Py_UNICODE *p, /* unicode */
7070 int size, /* size of unicode */
7071 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007072{
Victor Stinner554f3f02010-06-16 23:33:54 +00007073 BOOL usedDefaultChar = FALSE;
7074 BOOL *pusedDefaultChar;
7075 int mbcssize;
7076 Py_ssize_t n;
7077 PyObject *exc = NULL;
7078 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007079
7080 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007081
Victor Stinner554f3f02010-06-16 23:33:54 +00007082 /* check and handle 'errors' arg */
7083 if (errors==NULL || strcmp(errors, "strict")==0) {
7084 flags = WC_NO_BEST_FIT_CHARS;
7085 pusedDefaultChar = &usedDefaultChar;
7086 } else if (strcmp(errors, "replace")==0) {
7087 flags = 0;
7088 pusedDefaultChar = NULL;
7089 } else {
7090 PyErr_Format(PyExc_ValueError,
7091 "mbcs encoding does not support errors='%s'",
7092 errors);
7093 return -1;
7094 }
7095
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007096 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007097 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00007098 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
7099 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00007100 if (mbcssize == 0) {
7101 PyErr_SetFromWindowsErrWithFilename(0, NULL);
7102 return -1;
7103 }
Victor Stinner554f3f02010-06-16 23:33:54 +00007104 /* If we used a default char, then we failed! */
7105 if (pusedDefaultChar && *pusedDefaultChar)
7106 goto mbcs_encode_error;
7107 } else {
7108 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007109 }
7110
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007111 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007112 /* Create string object */
7113 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
7114 if (*repr == NULL)
7115 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00007116 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007117 }
7118 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007119 /* Extend string object */
7120 n = PyBytes_Size(*repr);
7121 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
7122 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007123 }
7124
7125 /* Do the conversion */
7126 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007127 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00007128 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
7129 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007130 PyErr_SetFromWindowsErrWithFilename(0, NULL);
7131 return -1;
7132 }
Victor Stinner554f3f02010-06-16 23:33:54 +00007133 if (pusedDefaultChar && *pusedDefaultChar)
7134 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007135 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007136 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007137
7138mbcs_encode_error:
7139 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
7140 Py_XDECREF(exc);
7141 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007142}
7143
Alexander Belopolsky40018472011-02-26 01:02:56 +00007144PyObject *
7145PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7146 Py_ssize_t size,
7147 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007148{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007149 PyObject *repr = NULL;
7150 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00007151
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007152#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00007153 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007154 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00007155 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007156 else
7157#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00007158 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007159
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007160 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007161 Py_XDECREF(repr);
7162 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007163 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007164
7165#ifdef NEED_RETRY
7166 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007167 p += INT_MAX;
7168 size -= INT_MAX;
7169 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007170 }
7171#endif
7172
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007173 return repr;
7174}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007175
Alexander Belopolsky40018472011-02-26 01:02:56 +00007176PyObject *
7177PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007178{
7179 if (!PyUnicode_Check(unicode)) {
7180 PyErr_BadArgument();
7181 return NULL;
7182 }
7183 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00007184 PyUnicode_GET_SIZE(unicode),
7185 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007186}
7187
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007188#undef NEED_RETRY
7189
Victor Stinner99b95382011-07-04 14:23:54 +02007190#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007191
Guido van Rossumd57fd912000-03-10 22:53:23 +00007192/* --- Character Mapping Codec -------------------------------------------- */
7193
Alexander Belopolsky40018472011-02-26 01:02:56 +00007194PyObject *
7195PyUnicode_DecodeCharmap(const char *s,
7196 Py_ssize_t size,
7197 PyObject *mapping,
7198 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007199{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007200 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007201 Py_ssize_t startinpos;
7202 Py_ssize_t endinpos;
7203 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007204 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007205 PyUnicodeObject *v;
7206 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007207 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007208 PyObject *errorHandler = NULL;
7209 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007210 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007211 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00007212
Guido van Rossumd57fd912000-03-10 22:53:23 +00007213 /* Default to Latin-1 */
7214 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007215 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007216
7217 v = _PyUnicode_New(size);
7218 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007219 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007220 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007221 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007222 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007223 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007224 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007225 mapstring = PyUnicode_AS_UNICODE(mapping);
7226 maplen = PyUnicode_GET_SIZE(mapping);
7227 while (s < e) {
7228 unsigned char ch = *s;
7229 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007230
Benjamin Peterson29060642009-01-31 22:14:21 +00007231 if (ch < maplen)
7232 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00007233
Benjamin Peterson29060642009-01-31 22:14:21 +00007234 if (x == 0xfffe) {
7235 /* undefined mapping */
7236 outpos = p-PyUnicode_AS_UNICODE(v);
7237 startinpos = s-starts;
7238 endinpos = startinpos+1;
7239 if (unicode_decode_call_errorhandler(
7240 errors, &errorHandler,
7241 "charmap", "character maps to <undefined>",
7242 &starts, &e, &startinpos, &endinpos, &exc, &s,
7243 &v, &outpos, &p)) {
7244 goto onError;
7245 }
7246 continue;
7247 }
7248 *p++ = x;
7249 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007250 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007251 }
7252 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007253 while (s < e) {
7254 unsigned char ch = *s;
7255 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007256
Benjamin Peterson29060642009-01-31 22:14:21 +00007257 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7258 w = PyLong_FromLong((long)ch);
7259 if (w == NULL)
7260 goto onError;
7261 x = PyObject_GetItem(mapping, w);
7262 Py_DECREF(w);
7263 if (x == NULL) {
7264 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7265 /* No mapping found means: mapping is undefined. */
7266 PyErr_Clear();
7267 x = Py_None;
7268 Py_INCREF(x);
7269 } else
7270 goto onError;
7271 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007272
Benjamin Peterson29060642009-01-31 22:14:21 +00007273 /* Apply mapping */
7274 if (PyLong_Check(x)) {
7275 long value = PyLong_AS_LONG(x);
7276 if (value < 0 || value > 65535) {
7277 PyErr_SetString(PyExc_TypeError,
7278 "character mapping must be in range(65536)");
7279 Py_DECREF(x);
7280 goto onError;
7281 }
7282 *p++ = (Py_UNICODE)value;
7283 }
7284 else if (x == Py_None) {
7285 /* undefined mapping */
7286 outpos = p-PyUnicode_AS_UNICODE(v);
7287 startinpos = s-starts;
7288 endinpos = startinpos+1;
7289 if (unicode_decode_call_errorhandler(
7290 errors, &errorHandler,
7291 "charmap", "character maps to <undefined>",
7292 &starts, &e, &startinpos, &endinpos, &exc, &s,
7293 &v, &outpos, &p)) {
7294 Py_DECREF(x);
7295 goto onError;
7296 }
7297 Py_DECREF(x);
7298 continue;
7299 }
7300 else if (PyUnicode_Check(x)) {
7301 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007302
Benjamin Peterson29060642009-01-31 22:14:21 +00007303 if (targetsize == 1)
7304 /* 1-1 mapping */
7305 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007306
Benjamin Peterson29060642009-01-31 22:14:21 +00007307 else if (targetsize > 1) {
7308 /* 1-n mapping */
7309 if (targetsize > extrachars) {
7310 /* resize first */
7311 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
7312 Py_ssize_t needed = (targetsize - extrachars) + \
7313 (targetsize << 2);
7314 extrachars += needed;
7315 /* XXX overflow detection missing */
Victor Stinnerfe226c02011-10-03 03:52:20 +02007316 if (PyUnicode_Resize((PyObject**)&v,
Benjamin Peterson29060642009-01-31 22:14:21 +00007317 PyUnicode_GET_SIZE(v) + needed) < 0) {
7318 Py_DECREF(x);
7319 goto onError;
7320 }
7321 p = PyUnicode_AS_UNICODE(v) + oldpos;
7322 }
7323 Py_UNICODE_COPY(p,
7324 PyUnicode_AS_UNICODE(x),
7325 targetsize);
7326 p += targetsize;
7327 extrachars -= targetsize;
7328 }
7329 /* 1-0 mapping: skip the character */
7330 }
7331 else {
7332 /* wrong return value */
7333 PyErr_SetString(PyExc_TypeError,
7334 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007335 Py_DECREF(x);
7336 goto onError;
7337 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007338 Py_DECREF(x);
7339 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007340 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007341 }
7342 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Victor Stinnerfe226c02011-10-03 03:52:20 +02007343 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007344 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007345 Py_XDECREF(errorHandler);
7346 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02007347#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02007348 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007349 Py_DECREF(v);
7350 return NULL;
7351 }
Victor Stinner17efeed2011-10-04 20:05:46 +02007352#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02007353 assert(_PyUnicode_CheckConsistency(v, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007354 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00007355
Benjamin Peterson29060642009-01-31 22:14:21 +00007356 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007357 Py_XDECREF(errorHandler);
7358 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007359 Py_XDECREF(v);
7360 return NULL;
7361}
7362
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007363/* Charmap encoding: the lookup table */
7364
Alexander Belopolsky40018472011-02-26 01:02:56 +00007365struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007366 PyObject_HEAD
7367 unsigned char level1[32];
7368 int count2, count3;
7369 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007370};
7371
7372static PyObject*
7373encoding_map_size(PyObject *obj, PyObject* args)
7374{
7375 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007376 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007377 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007378}
7379
7380static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007381 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007382 PyDoc_STR("Return the size (in bytes) of this object") },
7383 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007384};
7385
7386static void
7387encoding_map_dealloc(PyObject* o)
7388{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007389 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007390}
7391
7392static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007393 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007394 "EncodingMap", /*tp_name*/
7395 sizeof(struct encoding_map), /*tp_basicsize*/
7396 0, /*tp_itemsize*/
7397 /* methods */
7398 encoding_map_dealloc, /*tp_dealloc*/
7399 0, /*tp_print*/
7400 0, /*tp_getattr*/
7401 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007402 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007403 0, /*tp_repr*/
7404 0, /*tp_as_number*/
7405 0, /*tp_as_sequence*/
7406 0, /*tp_as_mapping*/
7407 0, /*tp_hash*/
7408 0, /*tp_call*/
7409 0, /*tp_str*/
7410 0, /*tp_getattro*/
7411 0, /*tp_setattro*/
7412 0, /*tp_as_buffer*/
7413 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7414 0, /*tp_doc*/
7415 0, /*tp_traverse*/
7416 0, /*tp_clear*/
7417 0, /*tp_richcompare*/
7418 0, /*tp_weaklistoffset*/
7419 0, /*tp_iter*/
7420 0, /*tp_iternext*/
7421 encoding_map_methods, /*tp_methods*/
7422 0, /*tp_members*/
7423 0, /*tp_getset*/
7424 0, /*tp_base*/
7425 0, /*tp_dict*/
7426 0, /*tp_descr_get*/
7427 0, /*tp_descr_set*/
7428 0, /*tp_dictoffset*/
7429 0, /*tp_init*/
7430 0, /*tp_alloc*/
7431 0, /*tp_new*/
7432 0, /*tp_free*/
7433 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007434};
7435
7436PyObject*
7437PyUnicode_BuildEncodingMap(PyObject* string)
7438{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007439 PyObject *result;
7440 struct encoding_map *mresult;
7441 int i;
7442 int need_dict = 0;
7443 unsigned char level1[32];
7444 unsigned char level2[512];
7445 unsigned char *mlevel1, *mlevel2, *mlevel3;
7446 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007447 int kind;
7448 void *data;
7449 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007450
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007451 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007452 PyErr_BadArgument();
7453 return NULL;
7454 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007455 kind = PyUnicode_KIND(string);
7456 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007457 memset(level1, 0xFF, sizeof level1);
7458 memset(level2, 0xFF, sizeof level2);
7459
7460 /* If there isn't a one-to-one mapping of NULL to \0,
7461 or if there are non-BMP characters, we need to use
7462 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007463 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007464 need_dict = 1;
7465 for (i = 1; i < 256; i++) {
7466 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007467 ch = PyUnicode_READ(kind, data, i);
7468 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007469 need_dict = 1;
7470 break;
7471 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007472 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007473 /* unmapped character */
7474 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007475 l1 = ch >> 11;
7476 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007477 if (level1[l1] == 0xFF)
7478 level1[l1] = count2++;
7479 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007480 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007481 }
7482
7483 if (count2 >= 0xFF || count3 >= 0xFF)
7484 need_dict = 1;
7485
7486 if (need_dict) {
7487 PyObject *result = PyDict_New();
7488 PyObject *key, *value;
7489 if (!result)
7490 return NULL;
7491 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007492 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007493 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007494 if (!key || !value)
7495 goto failed1;
7496 if (PyDict_SetItem(result, key, value) == -1)
7497 goto failed1;
7498 Py_DECREF(key);
7499 Py_DECREF(value);
7500 }
7501 return result;
7502 failed1:
7503 Py_XDECREF(key);
7504 Py_XDECREF(value);
7505 Py_DECREF(result);
7506 return NULL;
7507 }
7508
7509 /* Create a three-level trie */
7510 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7511 16*count2 + 128*count3 - 1);
7512 if (!result)
7513 return PyErr_NoMemory();
7514 PyObject_Init(result, &EncodingMapType);
7515 mresult = (struct encoding_map*)result;
7516 mresult->count2 = count2;
7517 mresult->count3 = count3;
7518 mlevel1 = mresult->level1;
7519 mlevel2 = mresult->level23;
7520 mlevel3 = mresult->level23 + 16*count2;
7521 memcpy(mlevel1, level1, 32);
7522 memset(mlevel2, 0xFF, 16*count2);
7523 memset(mlevel3, 0, 128*count3);
7524 count3 = 0;
7525 for (i = 1; i < 256; i++) {
7526 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007527 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007528 /* unmapped character */
7529 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007530 o1 = PyUnicode_READ(kind, data, i)>>11;
7531 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007532 i2 = 16*mlevel1[o1] + o2;
7533 if (mlevel2[i2] == 0xFF)
7534 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007535 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007536 i3 = 128*mlevel2[i2] + o3;
7537 mlevel3[i3] = i;
7538 }
7539 return result;
7540}
7541
7542static int
7543encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
7544{
7545 struct encoding_map *map = (struct encoding_map*)mapping;
7546 int l1 = c>>11;
7547 int l2 = (c>>7) & 0xF;
7548 int l3 = c & 0x7F;
7549 int i;
7550
7551#ifdef Py_UNICODE_WIDE
7552 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007553 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007554 }
7555#endif
7556 if (c == 0)
7557 return 0;
7558 /* level 1*/
7559 i = map->level1[l1];
7560 if (i == 0xFF) {
7561 return -1;
7562 }
7563 /* level 2*/
7564 i = map->level23[16*i+l2];
7565 if (i == 0xFF) {
7566 return -1;
7567 }
7568 /* level 3 */
7569 i = map->level23[16*map->count2 + 128*i + l3];
7570 if (i == 0) {
7571 return -1;
7572 }
7573 return i;
7574}
7575
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007576/* Lookup the character ch in the mapping. If the character
7577 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007578 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007579static PyObject *
7580charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007581{
Christian Heimes217cfd12007-12-02 14:31:20 +00007582 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007583 PyObject *x;
7584
7585 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007586 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007587 x = PyObject_GetItem(mapping, w);
7588 Py_DECREF(w);
7589 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007590 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7591 /* No mapping found means: mapping is undefined. */
7592 PyErr_Clear();
7593 x = Py_None;
7594 Py_INCREF(x);
7595 return x;
7596 } else
7597 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007598 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007599 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007600 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007601 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007602 long value = PyLong_AS_LONG(x);
7603 if (value < 0 || value > 255) {
7604 PyErr_SetString(PyExc_TypeError,
7605 "character mapping must be in range(256)");
7606 Py_DECREF(x);
7607 return NULL;
7608 }
7609 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007610 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007611 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007612 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007613 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007614 /* wrong return value */
7615 PyErr_Format(PyExc_TypeError,
7616 "character mapping must return integer, bytes or None, not %.400s",
7617 x->ob_type->tp_name);
7618 Py_DECREF(x);
7619 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007620 }
7621}
7622
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007623static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00007624charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007625{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007626 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7627 /* exponentially overallocate to minimize reallocations */
7628 if (requiredsize < 2*outsize)
7629 requiredsize = 2*outsize;
7630 if (_PyBytes_Resize(outobj, requiredsize))
7631 return -1;
7632 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007633}
7634
/* Outcome of charmapencode_output() */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* the lookup or the output resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007638/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00007639 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007640 space is available. Return a new reference to the object that
7641 was put in the output buffer, or Py_None, if the mapping was undefined
7642 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00007643 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007644static charmapencode_result
7645charmapencode_output(Py_UNICODE c, PyObject *mapping,
7646 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007647{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007648 PyObject *rep;
7649 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00007650 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007651
Christian Heimes90aa7642007-12-19 02:45:37 +00007652 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007653 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007654 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007655 if (res == -1)
7656 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00007657 if (outsize<requiredsize)
7658 if (charmapencode_resize(outobj, outpos, requiredsize))
7659 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00007660 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007661 outstart[(*outpos)++] = (char)res;
7662 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007663 }
7664
7665 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007666 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007667 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007668 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007669 Py_DECREF(rep);
7670 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007671 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007672 if (PyLong_Check(rep)) {
7673 Py_ssize_t requiredsize = *outpos+1;
7674 if (outsize<requiredsize)
7675 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7676 Py_DECREF(rep);
7677 return enc_EXCEPTION;
7678 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007679 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007680 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007681 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007682 else {
7683 const char *repchars = PyBytes_AS_STRING(rep);
7684 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
7685 Py_ssize_t requiredsize = *outpos+repsize;
7686 if (outsize<requiredsize)
7687 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7688 Py_DECREF(rep);
7689 return enc_EXCEPTION;
7690 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007691 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007692 memcpy(outstart + *outpos, repchars, repsize);
7693 *outpos += repsize;
7694 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007695 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007696 Py_DECREF(rep);
7697 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007698}
7699
7700/* handle an error in PyUnicode_EncodeCharmap
7701 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007702static int
7703charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00007704 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007705 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00007706 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00007707 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007708{
7709 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007710 Py_ssize_t repsize;
7711 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007712 Py_UNICODE *uni2;
7713 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007714 Py_ssize_t collstartpos = *inpos;
7715 Py_ssize_t collendpos = *inpos+1;
7716 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007717 char *encoding = "charmap";
7718 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007719 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007720
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007721 /* find all unencodable characters */
7722 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007723 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00007724 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007725 int res = encoding_map_lookup(p[collendpos], mapping);
7726 if (res != -1)
7727 break;
7728 ++collendpos;
7729 continue;
7730 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007731
Benjamin Peterson29060642009-01-31 22:14:21 +00007732 rep = charmapencode_lookup(p[collendpos], mapping);
7733 if (rep==NULL)
7734 return -1;
7735 else if (rep!=Py_None) {
7736 Py_DECREF(rep);
7737 break;
7738 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007739 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00007740 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007741 }
7742 /* cache callback name lookup
7743 * (if not done yet, i.e. it's the first error) */
7744 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007745 if ((errors==NULL) || (!strcmp(errors, "strict")))
7746 *known_errorHandler = 1;
7747 else if (!strcmp(errors, "replace"))
7748 *known_errorHandler = 2;
7749 else if (!strcmp(errors, "ignore"))
7750 *known_errorHandler = 3;
7751 else if (!strcmp(errors, "xmlcharrefreplace"))
7752 *known_errorHandler = 4;
7753 else
7754 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007755 }
7756 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007757 case 1: /* strict */
7758 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7759 return -1;
7760 case 2: /* replace */
7761 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007762 x = charmapencode_output('?', mapping, res, respos);
7763 if (x==enc_EXCEPTION) {
7764 return -1;
7765 }
7766 else if (x==enc_FAILED) {
7767 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7768 return -1;
7769 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007770 }
7771 /* fall through */
7772 case 3: /* ignore */
7773 *inpos = collendpos;
7774 break;
7775 case 4: /* xmlcharrefreplace */
7776 /* generate replacement (temporarily (mis)uses p) */
7777 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007778 char buffer[2+29+1+1];
7779 char *cp;
7780 sprintf(buffer, "&#%d;", (int)p[collpos]);
7781 for (cp = buffer; *cp; ++cp) {
7782 x = charmapencode_output(*cp, mapping, res, respos);
7783 if (x==enc_EXCEPTION)
7784 return -1;
7785 else if (x==enc_FAILED) {
7786 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7787 return -1;
7788 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007789 }
7790 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007791 *inpos = collendpos;
7792 break;
7793 default:
7794 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00007795 encoding, reason, p, size, exceptionObject,
7796 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007797 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007798 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00007799 if (PyBytes_Check(repunicode)) {
7800 /* Directly copy bytes result to output. */
7801 Py_ssize_t outsize = PyBytes_Size(*res);
7802 Py_ssize_t requiredsize;
7803 repsize = PyBytes_Size(repunicode);
7804 requiredsize = *respos + repsize;
7805 if (requiredsize > outsize)
7806 /* Make room for all additional bytes. */
7807 if (charmapencode_resize(res, respos, requiredsize)) {
7808 Py_DECREF(repunicode);
7809 return -1;
7810 }
7811 memcpy(PyBytes_AsString(*res) + *respos,
7812 PyBytes_AsString(repunicode), repsize);
7813 *respos += repsize;
7814 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007815 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00007816 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007817 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007818 /* generate replacement */
7819 repsize = PyUnicode_GET_SIZE(repunicode);
7820 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007821 x = charmapencode_output(*uni2, mapping, res, respos);
7822 if (x==enc_EXCEPTION) {
7823 return -1;
7824 }
7825 else if (x==enc_FAILED) {
7826 Py_DECREF(repunicode);
7827 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7828 return -1;
7829 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007830 }
7831 *inpos = newpos;
7832 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007833 }
7834 return 0;
7835}
7836
Alexander Belopolsky40018472011-02-26 01:02:56 +00007837PyObject *
7838PyUnicode_EncodeCharmap(const Py_UNICODE *p,
7839 Py_ssize_t size,
7840 PyObject *mapping,
7841 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007842{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007843 /* output object */
7844 PyObject *res = NULL;
7845 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007846 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007847 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007848 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007849 PyObject *errorHandler = NULL;
7850 PyObject *exc = NULL;
7851 /* the following variable is used for caching string comparisons
7852 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7853 * 3=ignore, 4=xmlcharrefreplace */
7854 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007855
7856 /* Default to Latin-1 */
7857 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007858 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007859
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007860 /* allocate enough for a simple encoding without
7861 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00007862 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007863 if (res == NULL)
7864 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00007865 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007866 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007867
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007868 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007869 /* try to encode it */
7870 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
7871 if (x==enc_EXCEPTION) /* error */
7872 goto onError;
7873 if (x==enc_FAILED) { /* unencodable character */
7874 if (charmap_encoding_error(p, size, &inpos, mapping,
7875 &exc,
7876 &known_errorHandler, &errorHandler, errors,
7877 &res, &respos)) {
7878 goto onError;
7879 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007880 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007881 else
7882 /* done with this character => adjust input position */
7883 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007884 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007885
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007886 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00007887 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007888 if (_PyBytes_Resize(&res, respos) < 0)
7889 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00007890
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007891 Py_XDECREF(exc);
7892 Py_XDECREF(errorHandler);
7893 return res;
7894
Benjamin Peterson29060642009-01-31 22:14:21 +00007895 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007896 Py_XDECREF(res);
7897 Py_XDECREF(exc);
7898 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007899 return NULL;
7900}
7901
Alexander Belopolsky40018472011-02-26 01:02:56 +00007902PyObject *
7903PyUnicode_AsCharmapString(PyObject *unicode,
7904 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007905{
7906 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007907 PyErr_BadArgument();
7908 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007909 }
7910 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00007911 PyUnicode_GET_SIZE(unicode),
7912 mapping,
7913 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007914}
7915
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007916/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007917static void
7918make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007919 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007920 Py_ssize_t startpos, Py_ssize_t endpos,
7921 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007922{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007923 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007924 *exceptionObject = _PyUnicodeTranslateError_Create(
7925 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007926 }
7927 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007928 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
7929 goto onError;
7930 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
7931 goto onError;
7932 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
7933 goto onError;
7934 return;
7935 onError:
7936 Py_DECREF(*exceptionObject);
7937 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007938 }
7939}
7940
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007941/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007942static void
7943raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007944 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007945 Py_ssize_t startpos, Py_ssize_t endpos,
7946 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007947{
7948 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007949 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007950 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007951 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007952}
7953
7954/* error handling callback helper:
7955 build arguments, call the callback and check the arguments,
7956 put the result into newpos and return the replacement string, which
7957 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007958static PyObject *
7959unicode_translate_call_errorhandler(const char *errors,
7960 PyObject **errorHandler,
7961 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007962 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007963 Py_ssize_t startpos, Py_ssize_t endpos,
7964 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007965{
Benjamin Peterson142957c2008-07-04 19:55:29 +00007966 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007967
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007968 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007969 PyObject *restuple;
7970 PyObject *resunicode;
7971
7972 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007973 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007974 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007975 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007976 }
7977
7978 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007979 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007980 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007981 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007982
7983 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00007984 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007985 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007986 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007987 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00007988 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00007989 Py_DECREF(restuple);
7990 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007991 }
7992 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00007993 &resunicode, &i_newpos)) {
7994 Py_DECREF(restuple);
7995 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007996 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007997 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007998 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007999 else
8000 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008001 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008002 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8003 Py_DECREF(restuple);
8004 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008005 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008006 Py_INCREF(resunicode);
8007 Py_DECREF(restuple);
8008 return resunicode;
8009}
8010
8011/* Lookup the character ch in the mapping and put the result in result,
8012 which must be decrefed by the caller.
8013 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008014static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008015charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008016{
Christian Heimes217cfd12007-12-02 14:31:20 +00008017 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008018 PyObject *x;
8019
8020 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008021 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008022 x = PyObject_GetItem(mapping, w);
8023 Py_DECREF(w);
8024 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008025 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8026 /* No mapping found means: use 1:1 mapping. */
8027 PyErr_Clear();
8028 *result = NULL;
8029 return 0;
8030 } else
8031 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008032 }
8033 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008034 *result = x;
8035 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008036 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008037 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008038 long value = PyLong_AS_LONG(x);
8039 long max = PyUnicode_GetMax();
8040 if (value < 0 || value > max) {
8041 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008042 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008043 Py_DECREF(x);
8044 return -1;
8045 }
8046 *result = x;
8047 return 0;
8048 }
8049 else if (PyUnicode_Check(x)) {
8050 *result = x;
8051 return 0;
8052 }
8053 else {
8054 /* wrong return value */
8055 PyErr_SetString(PyExc_TypeError,
8056 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008057 Py_DECREF(x);
8058 return -1;
8059 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008060}
8061/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008062 if not reallocate and adjust various state variables.
8063 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008064static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008065charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008066 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008067{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008068 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00008069 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008070 /* exponentially overallocate to minimize reallocations */
8071 if (requiredsize < 2 * oldsize)
8072 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008073 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8074 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008075 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008076 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008077 }
8078 return 0;
8079}
8080/* lookup the character, put the result in the output string and adjust
8081 various state variables. Return a new reference to the object that
8082 was put in the output buffer in *result, or Py_None, if the mapping was
8083 undefined (in which case no character was written).
8084 The called must decref result.
8085 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008086static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008087charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8088 PyObject *mapping, Py_UCS4 **output,
8089 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008090 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008091{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008092 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8093 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008094 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008095 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008096 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008097 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008098 }
8099 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008100 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008101 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008102 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008103 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008104 }
8105 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008106 Py_ssize_t repsize;
8107 if (PyUnicode_READY(*res) == -1)
8108 return -1;
8109 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008110 if (repsize==1) {
8111 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008112 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008113 }
8114 else if (repsize!=0) {
8115 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008116 Py_ssize_t requiredsize = *opos +
8117 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008118 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008119 Py_ssize_t i;
8120 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008121 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008122 for(i = 0; i < repsize; i++)
8123 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008124 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008125 }
8126 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008127 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008128 return 0;
8129}
8130
Alexander Belopolsky40018472011-02-26 01:02:56 +00008131PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008132_PyUnicode_TranslateCharmap(PyObject *input,
8133 PyObject *mapping,
8134 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008135{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008136 /* input object */
8137 char *idata;
8138 Py_ssize_t size, i;
8139 int kind;
8140 /* output buffer */
8141 Py_UCS4 *output = NULL;
8142 Py_ssize_t osize;
8143 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008144 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008145 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008146 char *reason = "character maps to <undefined>";
8147 PyObject *errorHandler = NULL;
8148 PyObject *exc = NULL;
8149 /* the following variable is used for caching string comparisons
8150 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8151 * 3=ignore, 4=xmlcharrefreplace */
8152 int known_errorHandler = -1;
8153
Guido van Rossumd57fd912000-03-10 22:53:23 +00008154 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008155 PyErr_BadArgument();
8156 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008157 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008158
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008159 if (PyUnicode_READY(input) == -1)
8160 return NULL;
8161 idata = (char*)PyUnicode_DATA(input);
8162 kind = PyUnicode_KIND(input);
8163 size = PyUnicode_GET_LENGTH(input);
8164 i = 0;
8165
8166 if (size == 0) {
8167 Py_INCREF(input);
8168 return input;
8169 }
8170
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008171 /* allocate enough for a simple 1:1 translation without
8172 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008173 osize = size;
8174 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8175 opos = 0;
8176 if (output == NULL) {
8177 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008178 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008179 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008180
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008181 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008182 /* try to encode it */
8183 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008184 if (charmaptranslate_output(input, i, mapping,
8185 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008186 Py_XDECREF(x);
8187 goto onError;
8188 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008189 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008190 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008191 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008192 else { /* untranslatable character */
8193 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8194 Py_ssize_t repsize;
8195 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008196 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008197 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008198 Py_ssize_t collstart = i;
8199 Py_ssize_t collend = i+1;
8200 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008201
Benjamin Peterson29060642009-01-31 22:14:21 +00008202 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008203 while (collend < size) {
8204 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008205 goto onError;
8206 Py_XDECREF(x);
8207 if (x!=Py_None)
8208 break;
8209 ++collend;
8210 }
8211 /* cache callback name lookup
8212 * (if not done yet, i.e. it's the first error) */
8213 if (known_errorHandler==-1) {
8214 if ((errors==NULL) || (!strcmp(errors, "strict")))
8215 known_errorHandler = 1;
8216 else if (!strcmp(errors, "replace"))
8217 known_errorHandler = 2;
8218 else if (!strcmp(errors, "ignore"))
8219 known_errorHandler = 3;
8220 else if (!strcmp(errors, "xmlcharrefreplace"))
8221 known_errorHandler = 4;
8222 else
8223 known_errorHandler = 0;
8224 }
8225 switch (known_errorHandler) {
8226 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008227 raise_translate_exception(&exc, input, collstart,
8228 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008229 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008230 case 2: /* replace */
8231 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008232 for (coll = collstart; coll<collend; coll++)
8233 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008234 /* fall through */
8235 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008236 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008237 break;
8238 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008239 /* generate replacement (temporarily (mis)uses i) */
8240 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008241 char buffer[2+29+1+1];
8242 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008243 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8244 if (charmaptranslate_makespace(&output, &osize,
8245 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008246 goto onError;
8247 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008248 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008249 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008250 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008251 break;
8252 default:
8253 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008254 reason, input, &exc,
8255 collstart, collend, &newpos);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02008256 if (repunicode == NULL || _PyUnicode_READY_REPLACE(&repunicode))
Benjamin Peterson29060642009-01-31 22:14:21 +00008257 goto onError;
8258 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008259 repsize = PyUnicode_GET_LENGTH(repunicode);
8260 if (charmaptranslate_makespace(&output, &osize,
8261 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008262 Py_DECREF(repunicode);
8263 goto onError;
8264 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008265 for (uni2 = 0; repsize-->0; ++uni2)
8266 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8267 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008268 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008269 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008270 }
8271 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008272 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8273 if (!res)
8274 goto onError;
8275 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008276 Py_XDECREF(exc);
8277 Py_XDECREF(errorHandler);
8278 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008279
Benjamin Peterson29060642009-01-31 22:14:21 +00008280 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008281 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008282 Py_XDECREF(exc);
8283 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008284 return NULL;
8285}
8286
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008287/* Deprecated. Use PyUnicode_Translate instead. */
8288PyObject *
8289PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8290 Py_ssize_t size,
8291 PyObject *mapping,
8292 const char *errors)
8293{
8294 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8295 if (!unicode)
8296 return NULL;
8297 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8298}
8299
Alexander Belopolsky40018472011-02-26 01:02:56 +00008300PyObject *
8301PyUnicode_Translate(PyObject *str,
8302 PyObject *mapping,
8303 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008304{
8305 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008306
Guido van Rossumd57fd912000-03-10 22:53:23 +00008307 str = PyUnicode_FromObject(str);
8308 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008309 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008310 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008311 Py_DECREF(str);
8312 return result;
Tim Petersced69f82003-09-16 20:30:58 +00008313
Benjamin Peterson29060642009-01-31 22:14:21 +00008314 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008315 Py_XDECREF(str);
8316 return NULL;
8317}
Tim Petersced69f82003-09-16 20:30:58 +00008318
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008319static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008320fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008321{
8322 /* No need to call PyUnicode_READY(self) because this function is only
8323 called as a callback from fixup() which does it already. */
8324 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8325 const int kind = PyUnicode_KIND(self);
8326 void *data = PyUnicode_DATA(self);
8327 Py_UCS4 maxchar = 0, ch, fixed;
8328 Py_ssize_t i;
8329
8330 for (i = 0; i < len; ++i) {
8331 ch = PyUnicode_READ(kind, data, i);
8332 fixed = 0;
8333 if (ch > 127) {
8334 if (Py_UNICODE_ISSPACE(ch))
8335 fixed = ' ';
8336 else {
8337 const int decimal = Py_UNICODE_TODECIMAL(ch);
8338 if (decimal >= 0)
8339 fixed = '0' + decimal;
8340 }
8341 if (fixed != 0) {
8342 if (fixed > maxchar)
8343 maxchar = fixed;
8344 PyUnicode_WRITE(kind, data, i, fixed);
8345 }
8346 else if (ch > maxchar)
8347 maxchar = ch;
8348 }
8349 else if (ch > maxchar)
8350 maxchar = ch;
8351 }
8352
8353 return maxchar;
8354}
8355
8356PyObject *
8357_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8358{
8359 if (!PyUnicode_Check(unicode)) {
8360 PyErr_BadInternalCall();
8361 return NULL;
8362 }
8363 if (PyUnicode_READY(unicode) == -1)
8364 return NULL;
8365 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8366 /* If the string is already ASCII, just return the same string */
8367 Py_INCREF(unicode);
8368 return unicode;
8369 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008370 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008371}
8372
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008373PyObject *
8374PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8375 Py_ssize_t length)
8376{
8377 PyObject *result;
8378 Py_UNICODE *p; /* write pointer into result */
8379 Py_ssize_t i;
8380 /* Copy to a new string */
8381 result = (PyObject *)_PyUnicode_New(length);
8382 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
8383 if (result == NULL)
8384 return result;
8385 p = PyUnicode_AS_UNICODE(result);
8386 /* Iterate over code points */
8387 for (i = 0; i < length; i++) {
8388 Py_UNICODE ch =s[i];
8389 if (ch > 127) {
8390 int decimal = Py_UNICODE_TODECIMAL(ch);
8391 if (decimal >= 0)
8392 p[i] = '0' + decimal;
8393 }
8394 }
Victor Stinner17efeed2011-10-04 20:05:46 +02008395#ifndef DONT_MAKE_RESULT_READY
8396 if (_PyUnicode_READY_REPLACE(&result)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008397 Py_DECREF(result);
8398 return NULL;
8399 }
Victor Stinner17efeed2011-10-04 20:05:46 +02008400#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02008401 assert(_PyUnicode_CheckConsistency(result, 1));
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008402 return result;
8403}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008404/* --- Decimal Encoder ---------------------------------------------------- */
8405
Alexander Belopolsky40018472011-02-26 01:02:56 +00008406int
8407PyUnicode_EncodeDecimal(Py_UNICODE *s,
8408 Py_ssize_t length,
8409 char *output,
8410 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008411{
8412 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008413 PyObject *errorHandler = NULL;
8414 PyObject *exc = NULL;
8415 const char *encoding = "decimal";
8416 const char *reason = "invalid decimal Unicode string";
8417 /* the following variable is used for caching string comparisons
8418 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
8419 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008420
8421 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008422 PyErr_BadArgument();
8423 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008424 }
8425
8426 p = s;
8427 end = s + length;
8428 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008429 register Py_UNICODE ch = *p;
8430 int decimal;
8431 PyObject *repunicode;
8432 Py_ssize_t repsize;
8433 Py_ssize_t newpos;
8434 Py_UNICODE *uni2;
8435 Py_UNICODE *collstart;
8436 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00008437
Benjamin Peterson29060642009-01-31 22:14:21 +00008438 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008439 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00008440 ++p;
8441 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008442 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008443 decimal = Py_UNICODE_TODECIMAL(ch);
8444 if (decimal >= 0) {
8445 *output++ = '0' + decimal;
8446 ++p;
8447 continue;
8448 }
8449 if (0 < ch && ch < 256) {
8450 *output++ = (char)ch;
8451 ++p;
8452 continue;
8453 }
8454 /* All other characters are considered unencodable */
8455 collstart = p;
8456 collend = p+1;
8457 while (collend < end) {
8458 if ((0 < *collend && *collend < 256) ||
8459 !Py_UNICODE_ISSPACE(*collend) ||
8460 Py_UNICODE_TODECIMAL(*collend))
8461 break;
8462 }
8463 /* cache callback name lookup
8464 * (if not done yet, i.e. it's the first error) */
8465 if (known_errorHandler==-1) {
8466 if ((errors==NULL) || (!strcmp(errors, "strict")))
8467 known_errorHandler = 1;
8468 else if (!strcmp(errors, "replace"))
8469 known_errorHandler = 2;
8470 else if (!strcmp(errors, "ignore"))
8471 known_errorHandler = 3;
8472 else if (!strcmp(errors, "xmlcharrefreplace"))
8473 known_errorHandler = 4;
8474 else
8475 known_errorHandler = 0;
8476 }
8477 switch (known_errorHandler) {
8478 case 1: /* strict */
8479 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
8480 goto onError;
8481 case 2: /* replace */
8482 for (p = collstart; p < collend; ++p)
8483 *output++ = '?';
8484 /* fall through */
8485 case 3: /* ignore */
8486 p = collend;
8487 break;
8488 case 4: /* xmlcharrefreplace */
8489 /* generate replacement (temporarily (mis)uses p) */
8490 for (p = collstart; p < collend; ++p)
8491 output += sprintf(output, "&#%d;", (int)*p);
8492 p = collend;
8493 break;
8494 default:
8495 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
8496 encoding, reason, s, length, &exc,
8497 collstart-s, collend-s, &newpos);
8498 if (repunicode == NULL)
8499 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008500 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00008501 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008502 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
8503 Py_DECREF(repunicode);
8504 goto onError;
8505 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008506 /* generate replacement */
8507 repsize = PyUnicode_GET_SIZE(repunicode);
8508 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
8509 Py_UNICODE ch = *uni2;
8510 if (Py_UNICODE_ISSPACE(ch))
8511 *output++ = ' ';
8512 else {
8513 decimal = Py_UNICODE_TODECIMAL(ch);
8514 if (decimal >= 0)
8515 *output++ = '0' + decimal;
8516 else if (0 < ch && ch < 256)
8517 *output++ = (char)ch;
8518 else {
8519 Py_DECREF(repunicode);
8520 raise_encode_exception(&exc, encoding,
8521 s, length, collstart-s, collend-s, reason);
8522 goto onError;
8523 }
8524 }
8525 }
8526 p = s + newpos;
8527 Py_DECREF(repunicode);
8528 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00008529 }
8530 /* 0-terminate the output string */
8531 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008532 Py_XDECREF(exc);
8533 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008534 return 0;
8535
Benjamin Peterson29060642009-01-31 22:14:21 +00008536 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008537 Py_XDECREF(exc);
8538 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008539 return -1;
8540}
8541
Guido van Rossumd57fd912000-03-10 22:53:23 +00008542/* --- Helpers ------------------------------------------------------------ */
8543
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008544static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02008545any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008546 Py_ssize_t start,
8547 Py_ssize_t end)
8548{
8549 int kind1, kind2, kind;
8550 void *buf1, *buf2;
8551 Py_ssize_t len1, len2, result;
8552
8553 kind1 = PyUnicode_KIND(s1);
8554 kind2 = PyUnicode_KIND(s2);
8555 kind = kind1 > kind2 ? kind1 : kind2;
8556 buf1 = PyUnicode_DATA(s1);
8557 buf2 = PyUnicode_DATA(s2);
8558 if (kind1 != kind)
8559 buf1 = _PyUnicode_AsKind(s1, kind);
8560 if (!buf1)
8561 return -2;
8562 if (kind2 != kind)
8563 buf2 = _PyUnicode_AsKind(s2, kind);
8564 if (!buf2) {
8565 if (kind1 != kind) PyMem_Free(buf1);
8566 return -2;
8567 }
8568 len1 = PyUnicode_GET_LENGTH(s1);
8569 len2 = PyUnicode_GET_LENGTH(s2);
8570
Victor Stinner794d5672011-10-10 03:21:36 +02008571 if (direction > 0) {
8572 switch(kind) {
8573 case PyUnicode_1BYTE_KIND:
8574 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8575 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
8576 else
8577 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
8578 break;
8579 case PyUnicode_2BYTE_KIND:
8580 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
8581 break;
8582 case PyUnicode_4BYTE_KIND:
8583 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
8584 break;
8585 default:
8586 assert(0); result = -2;
8587 }
8588 }
8589 else {
8590 switch(kind) {
8591 case PyUnicode_1BYTE_KIND:
8592 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8593 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
8594 else
8595 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8596 break;
8597 case PyUnicode_2BYTE_KIND:
8598 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8599 break;
8600 case PyUnicode_4BYTE_KIND:
8601 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8602 break;
8603 default:
8604 assert(0); result = -2;
8605 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008606 }
8607
8608 if (kind1 != kind)
8609 PyMem_Free(buf1);
8610 if (kind2 != kind)
8611 PyMem_Free(buf2);
8612
8613 return result;
8614}
8615
8616Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02008617_PyUnicode_InsertThousandsGrouping(PyObject *unicode, int kind, void *data,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008618 Py_ssize_t n_buffer,
8619 void *digits, Py_ssize_t n_digits,
8620 Py_ssize_t min_width,
8621 const char *grouping,
8622 const char *thousands_sep)
8623{
8624 switch(kind) {
8625 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008626 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
8627 return _PyUnicode_ascii_InsertThousandsGrouping(
8628 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8629 min_width, grouping, thousands_sep);
8630 else
8631 return _PyUnicode_ucs1_InsertThousandsGrouping(
8632 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8633 min_width, grouping, thousands_sep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008634 case PyUnicode_2BYTE_KIND:
8635 return _PyUnicode_ucs2_InsertThousandsGrouping(
8636 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
8637 min_width, grouping, thousands_sep);
8638 case PyUnicode_4BYTE_KIND:
8639 return _PyUnicode_ucs4_InsertThousandsGrouping(
8640 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
8641 min_width, grouping, thousands_sep);
8642 }
8643 assert(0);
8644 return -1;
8645}
8646
8647
/* helper macro to fixup start/end slice values
 *
 * `start` and `end` must be modifiable lvalues; they are clipped in place
 * against `len`: end is capped at len, negative values count from the end,
 * and anything still negative becomes 0.  Arguments may be evaluated more
 * than once.  Wrapped in do { } while (0) so that the expansion is a single
 * statement (safe inside an unbraced if/else); use it with a trailing
 * semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00008662
Alexander Belopolsky40018472011-02-26 01:02:56 +00008663Py_ssize_t
8664PyUnicode_Count(PyObject *str,
8665 PyObject *substr,
8666 Py_ssize_t start,
8667 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008668{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008669 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008670 PyUnicodeObject* str_obj;
8671 PyUnicodeObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008672 int kind1, kind2, kind;
8673 void *buf1 = NULL, *buf2 = NULL;
8674 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00008675
Thomas Wouters477c8d52006-05-27 19:21:47 +00008676 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008677 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008678 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008679 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
Victor Stinnere9a29352011-10-01 02:14:59 +02008680 if (!sub_obj || PyUnicode_READY(sub_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008681 Py_DECREF(str_obj);
8682 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008683 }
Tim Petersced69f82003-09-16 20:30:58 +00008684
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008685 kind1 = PyUnicode_KIND(str_obj);
8686 kind2 = PyUnicode_KIND(sub_obj);
8687 kind = kind1 > kind2 ? kind1 : kind2;
8688 buf1 = PyUnicode_DATA(str_obj);
8689 if (kind1 != kind)
8690 buf1 = _PyUnicode_AsKind((PyObject*)str_obj, kind);
8691 if (!buf1)
8692 goto onError;
8693 buf2 = PyUnicode_DATA(sub_obj);
8694 if (kind2 != kind)
8695 buf2 = _PyUnicode_AsKind((PyObject*)sub_obj, kind);
8696 if (!buf2)
8697 goto onError;
8698 len1 = PyUnicode_GET_LENGTH(str_obj);
8699 len2 = PyUnicode_GET_LENGTH(sub_obj);
8700
8701 ADJUST_INDICES(start, end, len1);
8702 switch(kind) {
8703 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008704 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
8705 result = asciilib_count(
8706 ((Py_UCS1*)buf1) + start, end - start,
8707 buf2, len2, PY_SSIZE_T_MAX
8708 );
8709 else
8710 result = ucs1lib_count(
8711 ((Py_UCS1*)buf1) + start, end - start,
8712 buf2, len2, PY_SSIZE_T_MAX
8713 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008714 break;
8715 case PyUnicode_2BYTE_KIND:
8716 result = ucs2lib_count(
8717 ((Py_UCS2*)buf1) + start, end - start,
8718 buf2, len2, PY_SSIZE_T_MAX
8719 );
8720 break;
8721 case PyUnicode_4BYTE_KIND:
8722 result = ucs4lib_count(
8723 ((Py_UCS4*)buf1) + start, end - start,
8724 buf2, len2, PY_SSIZE_T_MAX
8725 );
8726 break;
8727 default:
8728 assert(0); result = 0;
8729 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00008730
8731 Py_DECREF(sub_obj);
8732 Py_DECREF(str_obj);
8733
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008734 if (kind1 != kind)
8735 PyMem_Free(buf1);
8736 if (kind2 != kind)
8737 PyMem_Free(buf2);
8738
Guido van Rossumd57fd912000-03-10 22:53:23 +00008739 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008740 onError:
8741 Py_DECREF(sub_obj);
8742 Py_DECREF(str_obj);
8743 if (kind1 != kind && buf1)
8744 PyMem_Free(buf1);
8745 if (kind2 != kind && buf2)
8746 PyMem_Free(buf2);
8747 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008748}
8749
Alexander Belopolsky40018472011-02-26 01:02:56 +00008750Py_ssize_t
8751PyUnicode_Find(PyObject *str,
8752 PyObject *sub,
8753 Py_ssize_t start,
8754 Py_ssize_t end,
8755 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008756{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008757 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008758
Guido van Rossumd57fd912000-03-10 22:53:23 +00008759 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008760 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008761 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008762 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008763 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008764 Py_DECREF(str);
8765 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008766 }
Tim Petersced69f82003-09-16 20:30:58 +00008767
Victor Stinner794d5672011-10-10 03:21:36 +02008768 result = any_find_slice(direction,
8769 str, sub, start, end
8770 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00008771
Guido van Rossumd57fd912000-03-10 22:53:23 +00008772 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008773 Py_DECREF(sub);
8774
Guido van Rossumd57fd912000-03-10 22:53:23 +00008775 return result;
8776}
8777
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008778Py_ssize_t
8779PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
8780 Py_ssize_t start, Py_ssize_t end,
8781 int direction)
8782{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008783 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02008784 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008785 if (PyUnicode_READY(str) == -1)
8786 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02008787 if (start < 0 || end < 0) {
8788 PyErr_SetString(PyExc_IndexError, "string index out of range");
8789 return -2;
8790 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008791 if (end > PyUnicode_GET_LENGTH(str))
8792 end = PyUnicode_GET_LENGTH(str);
8793 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02008794 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
8795 kind, end-start, ch, direction);
8796 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008797 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02008798 else
8799 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008800}
8801
Alexander Belopolsky40018472011-02-26 01:02:56 +00008802static int
8803tailmatch(PyUnicodeObject *self,
8804 PyUnicodeObject *substring,
8805 Py_ssize_t start,
8806 Py_ssize_t end,
8807 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008808{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008809 int kind_self;
8810 int kind_sub;
8811 void *data_self;
8812 void *data_sub;
8813 Py_ssize_t offset;
8814 Py_ssize_t i;
8815 Py_ssize_t end_sub;
8816
8817 if (PyUnicode_READY(self) == -1 ||
8818 PyUnicode_READY(substring) == -1)
8819 return 0;
8820
8821 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008822 return 1;
8823
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008824 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
8825 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008826 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00008827 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008828
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008829 kind_self = PyUnicode_KIND(self);
8830 data_self = PyUnicode_DATA(self);
8831 kind_sub = PyUnicode_KIND(substring);
8832 data_sub = PyUnicode_DATA(substring);
8833 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
8834
8835 if (direction > 0)
8836 offset = end;
8837 else
8838 offset = start;
8839
8840 if (PyUnicode_READ(kind_self, data_self, offset) ==
8841 PyUnicode_READ(kind_sub, data_sub, 0) &&
8842 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
8843 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
8844 /* If both are of the same kind, memcmp is sufficient */
8845 if (kind_self == kind_sub) {
8846 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02008847 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008848 data_sub,
8849 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02008850 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008851 }
8852 /* otherwise we have to compare each character by first accesing it */
8853 else {
8854 /* We do not need to compare 0 and len(substring)-1 because
8855 the if statement above ensured already that they are equal
8856 when we end up here. */
8857 // TODO: honor direction and do a forward or backwards search
8858 for (i = 1; i < end_sub; ++i) {
8859 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
8860 PyUnicode_READ(kind_sub, data_sub, i))
8861 return 0;
8862 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008863 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008864 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008865 }
8866
8867 return 0;
8868}
8869
Alexander Belopolsky40018472011-02-26 01:02:56 +00008870Py_ssize_t
8871PyUnicode_Tailmatch(PyObject *str,
8872 PyObject *substr,
8873 Py_ssize_t start,
8874 Py_ssize_t end,
8875 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008876{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008877 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008878
Guido van Rossumd57fd912000-03-10 22:53:23 +00008879 str = PyUnicode_FromObject(str);
8880 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008881 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008882 substr = PyUnicode_FromObject(substr);
8883 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008884 Py_DECREF(str);
8885 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008886 }
Tim Petersced69f82003-09-16 20:30:58 +00008887
Guido van Rossumd57fd912000-03-10 22:53:23 +00008888 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00008889 (PyUnicodeObject *)substr,
8890 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008891 Py_DECREF(str);
8892 Py_DECREF(substr);
8893 return result;
8894}
8895
Guido van Rossumd57fd912000-03-10 22:53:23 +00008896/* Apply fixfct filter to the Unicode object self and return a
8897 reference to the modified object */
8898
Alexander Belopolsky40018472011-02-26 01:02:56 +00008899static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02008900fixup(PyObject *self,
8901 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008902{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008903 PyObject *u;
8904 Py_UCS4 maxchar_old, maxchar_new = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008905
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008906 if (PyUnicode_READY(self) == -1)
8907 return NULL;
8908 maxchar_old = PyUnicode_MAX_CHAR_VALUE(self);
8909 u = PyUnicode_New(PyUnicode_GET_LENGTH(self),
8910 maxchar_old);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008911 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008912 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008913
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008914 Py_MEMCPY(PyUnicode_1BYTE_DATA(u), PyUnicode_1BYTE_DATA(self),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02008915 PyUnicode_GET_LENGTH(u) * PyUnicode_KIND(u));
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008916
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008917 /* fix functions return the new maximum character in a string,
8918 if the kind of the resulting unicode object does not change,
8919 everything is fine. Otherwise we need to change the string kind
8920 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02008921 maxchar_new = fixfct(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008922 if (maxchar_new == 0)
8923 /* do nothing, keep maxchar_new at 0 which means no changes. */;
8924 else if (maxchar_new <= 127)
8925 maxchar_new = 127;
8926 else if (maxchar_new <= 255)
8927 maxchar_new = 255;
8928 else if (maxchar_new <= 65535)
8929 maxchar_new = 65535;
8930 else
8931 maxchar_new = 1114111; /* 0x10ffff */
8932
8933 if (!maxchar_new && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008934 /* fixfct should return TRUE if it modified the buffer. If
8935 FALSE, return a reference to the original buffer instead
8936 (to save space, not time) */
8937 Py_INCREF(self);
8938 Py_DECREF(u);
8939 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008940 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008941 else if (maxchar_new == maxchar_old) {
8942 return u;
8943 }
8944 else {
8945 /* In case the maximum character changed, we need to
8946 convert the string to the new category. */
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008947 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008948 if (v == NULL) {
8949 Py_DECREF(u);
8950 return NULL;
8951 }
8952 if (maxchar_new > maxchar_old) {
8953 /* If the maxchar increased so that the kind changed, not all
8954 characters are representable anymore and we need to fix the
8955 string again. This only happens in very few cases. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02008956 copy_characters(v, 0, self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinner9310abb2011-10-05 00:59:23 +02008957 maxchar_old = fixfct(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008958 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
8959 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008960 else {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02008961 copy_characters(v, 0, u, 0, PyUnicode_GET_LENGTH(self));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008962 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008963
8964 Py_DECREF(u);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02008965 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008966 return v;
8967 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008968}
8969
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008970static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008971fixupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008972{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008973 /* No need to call PyUnicode_READY(self) because this function is only
8974 called as a callback from fixup() which does it already. */
8975 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8976 const int kind = PyUnicode_KIND(self);
8977 void *data = PyUnicode_DATA(self);
8978 int touched = 0;
8979 Py_UCS4 maxchar = 0;
8980 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008981
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008982 for (i = 0; i < len; ++i) {
8983 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8984 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
8985 if (up != ch) {
8986 if (up > maxchar)
8987 maxchar = up;
8988 PyUnicode_WRITE(kind, data, i, up);
8989 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008990 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008991 else if (ch > maxchar)
8992 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008993 }
8994
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008995 if (touched)
8996 return maxchar;
8997 else
8998 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008999}
9000
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009001static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009002fixlower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009003{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009004 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9005 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9006 const int kind = PyUnicode_KIND(self);
9007 void *data = PyUnicode_DATA(self);
9008 int touched = 0;
9009 Py_UCS4 maxchar = 0;
9010 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009011
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009012 for(i = 0; i < len; ++i) {
9013 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9014 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
9015 if (lo != ch) {
9016 if (lo > maxchar)
9017 maxchar = lo;
9018 PyUnicode_WRITE(kind, data, i, lo);
9019 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00009020 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009021 else if (ch > maxchar)
9022 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009023 }
9024
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009025 if (touched)
9026 return maxchar;
9027 else
9028 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009029}
9030
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009031static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009032fixswapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009033{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009034 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9035 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9036 const int kind = PyUnicode_KIND(self);
9037 void *data = PyUnicode_DATA(self);
9038 int touched = 0;
9039 Py_UCS4 maxchar = 0;
9040 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009041
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009042 for(i = 0; i < len; ++i) {
9043 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9044 Py_UCS4 nu = 0;
9045
9046 if (Py_UNICODE_ISUPPER(ch))
9047 nu = Py_UNICODE_TOLOWER(ch);
9048 else if (Py_UNICODE_ISLOWER(ch))
9049 nu = Py_UNICODE_TOUPPER(ch);
9050
9051 if (nu != 0) {
9052 if (nu > maxchar)
9053 maxchar = nu;
9054 PyUnicode_WRITE(kind, data, i, nu);
9055 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009056 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009057 else if (ch > maxchar)
9058 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009059 }
9060
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009061 if (touched)
9062 return maxchar;
9063 else
9064 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009065}
9066
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009067static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009068fixcapitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009069{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009070 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9071 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9072 const int kind = PyUnicode_KIND(self);
9073 void *data = PyUnicode_DATA(self);
9074 int touched = 0;
9075 Py_UCS4 maxchar = 0;
9076 Py_ssize_t i = 0;
9077 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00009078
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009079 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009080 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009081
9082 ch = PyUnicode_READ(kind, data, i);
9083 if (!Py_UNICODE_ISUPPER(ch)) {
9084 maxchar = Py_UNICODE_TOUPPER(ch);
9085 PyUnicode_WRITE(kind, data, i, maxchar);
9086 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009087 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009088 ++i;
9089 for(; i < len; ++i) {
9090 ch = PyUnicode_READ(kind, data, i);
9091 if (!Py_UNICODE_ISLOWER(ch)) {
9092 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
9093 if (lo > maxchar)
9094 maxchar = lo;
9095 PyUnicode_WRITE(kind, data, i, lo);
9096 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009097 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009098 else if (ch > maxchar)
9099 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009100 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009101
9102 if (touched)
9103 return maxchar;
9104 else
9105 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009106}
9107
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009108static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009109fixtitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009110{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009111 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9112 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9113 const int kind = PyUnicode_KIND(self);
9114 void *data = PyUnicode_DATA(self);
9115 Py_UCS4 maxchar = 0;
9116 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009117 int previous_is_cased;
9118
9119 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009120 if (len == 1) {
9121 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9122 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
9123 if (ti != ch) {
9124 PyUnicode_WRITE(kind, data, i, ti);
9125 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00009126 }
9127 else
9128 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009129 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009130 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009131 for(; i < len; ++i) {
9132 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9133 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00009134
Benjamin Peterson29060642009-01-31 22:14:21 +00009135 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009136 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00009137 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009138 nu = Py_UNICODE_TOTITLE(ch);
9139
9140 if (nu > maxchar)
9141 maxchar = nu;
9142 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00009143
Benjamin Peterson29060642009-01-31 22:14:21 +00009144 if (Py_UNICODE_ISLOWER(ch) ||
9145 Py_UNICODE_ISUPPER(ch) ||
9146 Py_UNICODE_ISTITLE(ch))
9147 previous_is_cased = 1;
9148 else
9149 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009150 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009151 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009152}
9153
Tim Peters8ce9f162004-08-27 01:49:32 +00009154PyObject *
9155PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009156{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009157 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009158 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009159 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009160 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009161 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9162 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009163 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009164 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009165 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009166 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009167 int use_memcpy;
9168 unsigned char *res_data = NULL, *sep_data = NULL;
9169 PyObject *last_obj;
9170 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009171
Tim Peters05eba1f2004-08-27 21:32:02 +00009172 fseq = PySequence_Fast(seq, "");
9173 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009174 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009175 }
9176
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009177 /* NOTE: the following code can't call back into Python code,
9178 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009179 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009180
Tim Peters05eba1f2004-08-27 21:32:02 +00009181 seqlen = PySequence_Fast_GET_SIZE(fseq);
9182 /* If empty sequence, return u"". */
9183 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009184 Py_DECREF(fseq);
9185 Py_INCREF(unicode_empty);
9186 res = unicode_empty;
9187 return res;
Tim Peters05eba1f2004-08-27 21:32:02 +00009188 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009189
Tim Peters05eba1f2004-08-27 21:32:02 +00009190 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009191 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009192 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009193 if (seqlen == 1) {
9194 if (PyUnicode_CheckExact(items[0])) {
9195 res = items[0];
9196 Py_INCREF(res);
9197 Py_DECREF(fseq);
9198 return res;
9199 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009200 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009201 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009202 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009203 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009204 /* Set up sep and seplen */
9205 if (separator == NULL) {
9206 /* fall back to a blank space separator */
9207 sep = PyUnicode_FromOrdinal(' ');
9208 if (!sep)
9209 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009210 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009211 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009212 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009213 else {
9214 if (!PyUnicode_Check(separator)) {
9215 PyErr_Format(PyExc_TypeError,
9216 "separator: expected str instance,"
9217 " %.80s found",
9218 Py_TYPE(separator)->tp_name);
9219 goto onError;
9220 }
9221 if (PyUnicode_READY(separator))
9222 goto onError;
9223 sep = separator;
9224 seplen = PyUnicode_GET_LENGTH(separator);
9225 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9226 /* inc refcount to keep this code path symmetric with the
9227 above case of a blank separator */
9228 Py_INCREF(sep);
9229 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009230 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009231 }
9232
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009233 /* There are at least two things to join, or else we have a subclass
9234 * of str in the sequence.
9235 * Do a pre-pass to figure out the total amount of space we'll
9236 * need (sz), and see whether all argument are strings.
9237 */
9238 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009239#ifdef Py_DEBUG
9240 use_memcpy = 0;
9241#else
9242 use_memcpy = 1;
9243#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009244 for (i = 0; i < seqlen; i++) {
9245 const Py_ssize_t old_sz = sz;
9246 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009247 if (!PyUnicode_Check(item)) {
9248 PyErr_Format(PyExc_TypeError,
9249 "sequence item %zd: expected str instance,"
9250 " %.80s found",
9251 i, Py_TYPE(item)->tp_name);
9252 goto onError;
9253 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009254 if (PyUnicode_READY(item) == -1)
9255 goto onError;
9256 sz += PyUnicode_GET_LENGTH(item);
9257 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009258 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009259 if (i != 0)
9260 sz += seplen;
9261 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9262 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009263 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009264 goto onError;
9265 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009266 if (use_memcpy && last_obj != NULL) {
9267 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9268 use_memcpy = 0;
9269 }
9270 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009271 }
Tim Petersced69f82003-09-16 20:30:58 +00009272
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009273 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009274 if (res == NULL)
9275 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009276
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009277 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009278#ifdef Py_DEBUG
9279 use_memcpy = 0;
9280#else
9281 if (use_memcpy) {
9282 res_data = PyUnicode_1BYTE_DATA(res);
9283 kind = PyUnicode_KIND(res);
9284 if (seplen != 0)
9285 sep_data = PyUnicode_1BYTE_DATA(sep);
9286 }
9287#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009288 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009289 Py_ssize_t itemlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009290 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009291 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02009292 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009293 if (use_memcpy) {
9294 Py_MEMCPY(res_data,
9295 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009296 kind * seplen);
9297 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009298 }
9299 else {
9300 copy_characters(res, res_offset, sep, 0, seplen);
9301 res_offset += seplen;
9302 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009303 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009304 itemlen = PyUnicode_GET_LENGTH(item);
9305 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009306 if (use_memcpy) {
9307 Py_MEMCPY(res_data,
9308 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009309 kind * itemlen);
9310 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009311 }
9312 else {
9313 copy_characters(res, res_offset, item, 0, itemlen);
9314 res_offset += itemlen;
9315 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009316 }
Tim Peters05eba1f2004-08-27 21:32:02 +00009317 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009318 if (use_memcpy)
9319 assert(res_data == PyUnicode_1BYTE_DATA(res)
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009320 + kind * PyUnicode_GET_LENGTH(res));
Victor Stinnerdd077322011-10-07 17:02:31 +02009321 else
9322 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00009323
Tim Peters05eba1f2004-08-27 21:32:02 +00009324 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009325 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009326 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009327 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009328
Benjamin Peterson29060642009-01-31 22:14:21 +00009329 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009330 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009331 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009332 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009333 return NULL;
9334}
9335
/* FILL(kind, data, value, start, length): store `value` into `length`
   consecutive characters of the buffer `data`, beginning at character
   index `start`.  `kind` selects the character width: 1-byte buffers are
   filled with memset(), 2-byte buffers are written as Py_UCS2, and any
   other kind is written as Py_UCS4.  PyUnicode_WCHAR_KIND is rejected by
   an assertion.
   All arguments are parenthesized at their point of use so that arbitrary
   expressions can be passed safely; note that some of them are evaluated
   more than once. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        } \
    } while (0)
9358
Victor Stinner9310abb2011-10-05 00:59:23 +02009359static PyObject *
9360pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009361 Py_ssize_t left,
9362 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009363 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009364{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009365 PyObject *u;
9366 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009367 int kind;
9368 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009369
9370 if (left < 0)
9371 left = 0;
9372 if (right < 0)
9373 right = 0;
9374
Tim Peters7a29bd52001-09-12 03:03:31 +00009375 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009376 Py_INCREF(self);
9377 return self;
9378 }
9379
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009380 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9381 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009382 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9383 return NULL;
9384 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009385 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9386 if (fill > maxchar)
9387 maxchar = fill;
9388 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009389 if (!u)
9390 return NULL;
9391
9392 kind = PyUnicode_KIND(u);
9393 data = PyUnicode_DATA(u);
9394 if (left)
9395 FILL(kind, data, fill, 0, left);
9396 if (right)
9397 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009398 copy_characters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009399 assert(_PyUnicode_CheckConsistency(u, 1));
9400 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009401}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009402#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00009403
Alexander Belopolsky40018472011-02-26 01:02:56 +00009404PyObject *
9405PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009406{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009407 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009408
9409 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009410 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009411 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009412
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009413 switch(PyUnicode_KIND(string)) {
9414 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009415 if (PyUnicode_IS_ASCII(string))
9416 list = asciilib_splitlines(
9417 (PyObject*) string, PyUnicode_1BYTE_DATA(string),
9418 PyUnicode_GET_LENGTH(string), keepends);
9419 else
9420 list = ucs1lib_splitlines(
9421 (PyObject*) string, PyUnicode_1BYTE_DATA(string),
9422 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009423 break;
9424 case PyUnicode_2BYTE_KIND:
9425 list = ucs2lib_splitlines(
9426 (PyObject*) string, PyUnicode_2BYTE_DATA(string),
9427 PyUnicode_GET_LENGTH(string), keepends);
9428 break;
9429 case PyUnicode_4BYTE_KIND:
9430 list = ucs4lib_splitlines(
9431 (PyObject*) string, PyUnicode_4BYTE_DATA(string),
9432 PyUnicode_GET_LENGTH(string), keepends);
9433 break;
9434 default:
9435 assert(0);
9436 list = 0;
9437 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009438 Py_DECREF(string);
9439 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009440}
9441
Alexander Belopolsky40018472011-02-26 01:02:56 +00009442static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009443split(PyObject *self,
9444 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009445 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009446{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009447 int kind1, kind2, kind;
9448 void *buf1, *buf2;
9449 Py_ssize_t len1, len2;
9450 PyObject* out;
9451
Guido van Rossumd57fd912000-03-10 22:53:23 +00009452 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009453 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009454
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009455 if (PyUnicode_READY(self) == -1)
9456 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009457
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009458 if (substring == NULL)
9459 switch(PyUnicode_KIND(self)) {
9460 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009461 if (PyUnicode_IS_ASCII(self))
9462 return asciilib_split_whitespace(
9463 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9464 PyUnicode_GET_LENGTH(self), maxcount
9465 );
9466 else
9467 return ucs1lib_split_whitespace(
9468 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9469 PyUnicode_GET_LENGTH(self), maxcount
9470 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009471 case PyUnicode_2BYTE_KIND:
9472 return ucs2lib_split_whitespace(
9473 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
9474 PyUnicode_GET_LENGTH(self), maxcount
9475 );
9476 case PyUnicode_4BYTE_KIND:
9477 return ucs4lib_split_whitespace(
9478 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
9479 PyUnicode_GET_LENGTH(self), maxcount
9480 );
9481 default:
9482 assert(0);
9483 return NULL;
9484 }
9485
9486 if (PyUnicode_READY(substring) == -1)
9487 return NULL;
9488
9489 kind1 = PyUnicode_KIND(self);
9490 kind2 = PyUnicode_KIND(substring);
9491 kind = kind1 > kind2 ? kind1 : kind2;
9492 buf1 = PyUnicode_DATA(self);
9493 buf2 = PyUnicode_DATA(substring);
9494 if (kind1 != kind)
9495 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9496 if (!buf1)
9497 return NULL;
9498 if (kind2 != kind)
9499 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9500 if (!buf2) {
9501 if (kind1 != kind) PyMem_Free(buf1);
9502 return NULL;
9503 }
9504 len1 = PyUnicode_GET_LENGTH(self);
9505 len2 = PyUnicode_GET_LENGTH(substring);
9506
9507 switch(kind) {
9508 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009509 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9510 out = asciilib_split(
9511 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9512 else
9513 out = ucs1lib_split(
9514 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009515 break;
9516 case PyUnicode_2BYTE_KIND:
9517 out = ucs2lib_split(
9518 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9519 break;
9520 case PyUnicode_4BYTE_KIND:
9521 out = ucs4lib_split(
9522 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9523 break;
9524 default:
9525 out = NULL;
9526 }
9527 if (kind1 != kind)
9528 PyMem_Free(buf1);
9529 if (kind2 != kind)
9530 PyMem_Free(buf2);
9531 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009532}
9533
Alexander Belopolsky40018472011-02-26 01:02:56 +00009534static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009535rsplit(PyObject *self,
9536 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009537 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009538{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009539 int kind1, kind2, kind;
9540 void *buf1, *buf2;
9541 Py_ssize_t len1, len2;
9542 PyObject* out;
9543
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009544 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009545 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009546
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009547 if (PyUnicode_READY(self) == -1)
9548 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009549
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009550 if (substring == NULL)
9551 switch(PyUnicode_KIND(self)) {
9552 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009553 if (PyUnicode_IS_ASCII(self))
9554 return asciilib_rsplit_whitespace(
9555 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9556 PyUnicode_GET_LENGTH(self), maxcount
9557 );
9558 else
9559 return ucs1lib_rsplit_whitespace(
9560 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9561 PyUnicode_GET_LENGTH(self), maxcount
9562 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009563 case PyUnicode_2BYTE_KIND:
9564 return ucs2lib_rsplit_whitespace(
9565 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
9566 PyUnicode_GET_LENGTH(self), maxcount
9567 );
9568 case PyUnicode_4BYTE_KIND:
9569 return ucs4lib_rsplit_whitespace(
9570 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
9571 PyUnicode_GET_LENGTH(self), maxcount
9572 );
9573 default:
9574 assert(0);
9575 return NULL;
9576 }
9577
9578 if (PyUnicode_READY(substring) == -1)
9579 return NULL;
9580
9581 kind1 = PyUnicode_KIND(self);
9582 kind2 = PyUnicode_KIND(substring);
9583 kind = kind1 > kind2 ? kind1 : kind2;
9584 buf1 = PyUnicode_DATA(self);
9585 buf2 = PyUnicode_DATA(substring);
9586 if (kind1 != kind)
9587 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9588 if (!buf1)
9589 return NULL;
9590 if (kind2 != kind)
9591 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9592 if (!buf2) {
9593 if (kind1 != kind) PyMem_Free(buf1);
9594 return NULL;
9595 }
9596 len1 = PyUnicode_GET_LENGTH(self);
9597 len2 = PyUnicode_GET_LENGTH(substring);
9598
9599 switch(kind) {
9600 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009601 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9602 out = asciilib_rsplit(
9603 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9604 else
9605 out = ucs1lib_rsplit(
9606 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009607 break;
9608 case PyUnicode_2BYTE_KIND:
9609 out = ucs2lib_rsplit(
9610 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9611 break;
9612 case PyUnicode_4BYTE_KIND:
9613 out = ucs4lib_rsplit(
9614 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9615 break;
9616 default:
9617 out = NULL;
9618 }
9619 if (kind1 != kind)
9620 PyMem_Free(buf1);
9621 if (kind2 != kind)
9622 PyMem_Free(buf2);
9623 return out;
9624}
9625
9626static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009627anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
9628 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009629{
9630 switch(kind) {
9631 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009632 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
9633 return asciilib_find(buf1, len1, buf2, len2, offset);
9634 else
9635 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009636 case PyUnicode_2BYTE_KIND:
9637 return ucs2lib_find(buf1, len1, buf2, len2, offset);
9638 case PyUnicode_4BYTE_KIND:
9639 return ucs4lib_find(buf1, len1, buf2, len2, offset);
9640 }
9641 assert(0);
9642 return -1;
9643}
9644
9645static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009646anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
9647 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009648{
9649 switch(kind) {
9650 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009651 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
9652 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
9653 else
9654 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009655 case PyUnicode_2BYTE_KIND:
9656 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
9657 case PyUnicode_4BYTE_KIND:
9658 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
9659 }
9660 assert(0);
9661 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009662}
9663
Alexander Belopolsky40018472011-02-26 01:02:56 +00009664static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009665replace(PyObject *self, PyObject *str1,
9666 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009667{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009668 PyObject *u;
9669 char *sbuf = PyUnicode_DATA(self);
9670 char *buf1 = PyUnicode_DATA(str1);
9671 char *buf2 = PyUnicode_DATA(str2);
9672 int srelease = 0, release1 = 0, release2 = 0;
9673 int skind = PyUnicode_KIND(self);
9674 int kind1 = PyUnicode_KIND(str1);
9675 int kind2 = PyUnicode_KIND(str2);
9676 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
9677 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
9678 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +02009679 int mayshrink;
9680 Py_UCS4 maxchar, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009681
9682 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009683 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009684 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009685 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009686
Victor Stinner59de0ee2011-10-07 10:01:28 +02009687 if (str1 == str2)
9688 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009689 if (skind < kind1)
9690 /* substring too wide to be present */
9691 goto nothing;
9692
Victor Stinner49a0a212011-10-12 23:46:10 +02009693 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9694 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
9695 /* Replacing str1 with str2 may cause a maxchar reduction in the
9696 result string. */
9697 mayshrink = (maxchar_str2 < maxchar);
9698 maxchar = Py_MAX(maxchar, maxchar_str2);
9699
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009700 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00009701 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009702 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009703 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009704 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009705 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009706 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +02009707 Py_UCS4 u1, u2;
9708 int rkind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009709 u1 = PyUnicode_READ_CHAR(str1, 0);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009710 if (findchar(sbuf, PyUnicode_KIND(self),
9711 slen, u1, 1) < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009712 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009713 u2 = PyUnicode_READ_CHAR(str2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009714 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009715 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009716 goto error;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009717 copy_characters(u, 0, self, 0, slen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009718 rkind = PyUnicode_KIND(u);
9719 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
9720 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009721 if (--maxcount < 0)
9722 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009723 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009724 }
Victor Stinner49a0a212011-10-12 23:46:10 +02009725 }
9726 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009727 int rkind = skind;
9728 char *res;
Victor Stinner25a4b292011-10-06 12:31:55 +02009729
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009730 if (kind1 < rkind) {
9731 /* widen substring */
9732 buf1 = _PyUnicode_AsKind(str1, rkind);
9733 if (!buf1) goto error;
9734 release1 = 1;
9735 }
Victor Stinnerc3cec782011-10-05 21:24:08 +02009736 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009737 if (i < 0)
9738 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009739 if (rkind > kind2) {
9740 /* widen replacement */
9741 buf2 = _PyUnicode_AsKind(str2, rkind);
9742 if (!buf2) goto error;
9743 release2 = 1;
9744 }
9745 else if (rkind < kind2) {
9746 /* widen self and buf1 */
9747 rkind = kind2;
9748 if (release1) PyMem_Free(buf1);
9749 sbuf = _PyUnicode_AsKind(self, rkind);
9750 if (!sbuf) goto error;
9751 srelease = 1;
9752 buf1 = _PyUnicode_AsKind(str1, rkind);
9753 if (!buf1) goto error;
9754 release1 = 1;
9755 }
Victor Stinner49a0a212011-10-12 23:46:10 +02009756 u = PyUnicode_New(slen, maxchar);
9757 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009758 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +02009759 assert(PyUnicode_KIND(u) == rkind);
9760 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +02009761
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009762 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009763 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009764 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009765 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009766 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009767 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009768
9769 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +02009770 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009771 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +02009772 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009773 if (i == -1)
9774 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009775 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009776 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009777 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009778 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009779 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009780 }
Victor Stinner49a0a212011-10-12 23:46:10 +02009781 }
9782 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009783 Py_ssize_t n, i, j, ires;
9784 Py_ssize_t product, new_size;
9785 int rkind = skind;
9786 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009787
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009788 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +02009789 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009790 buf1 = _PyUnicode_AsKind(str1, rkind);
9791 if (!buf1) goto error;
9792 release1 = 1;
9793 }
Victor Stinnerc3cec782011-10-05 21:24:08 +02009794 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009795 if (n == 0)
9796 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009797 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +02009798 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009799 buf2 = _PyUnicode_AsKind(str2, rkind);
9800 if (!buf2) goto error;
9801 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009802 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009803 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +02009804 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009805 rkind = kind2;
9806 sbuf = _PyUnicode_AsKind(self, rkind);
9807 if (!sbuf) goto error;
9808 srelease = 1;
9809 if (release1) PyMem_Free(buf1);
9810 buf1 = _PyUnicode_AsKind(str1, rkind);
9811 if (!buf1) goto error;
9812 release1 = 1;
9813 }
9814 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
9815 PyUnicode_GET_LENGTH(str1))); */
9816 product = n * (len2-len1);
9817 if ((product / (len2-len1)) != n) {
9818 PyErr_SetString(PyExc_OverflowError,
9819 "replace string is too long");
9820 goto error;
9821 }
9822 new_size = slen + product;
Victor Stinner49a0a212011-10-12 23:46:10 +02009823 if (new_size == 0) {
9824 Py_INCREF(unicode_empty);
9825 u = unicode_empty;
9826 goto done;
9827 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009828 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
9829 PyErr_SetString(PyExc_OverflowError,
9830 "replace string is too long");
9831 goto error;
9832 }
Victor Stinner49a0a212011-10-12 23:46:10 +02009833 u = PyUnicode_New(new_size, maxchar);
9834 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009835 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +02009836 assert(PyUnicode_KIND(u) == rkind);
9837 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009838 ires = i = 0;
9839 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009840 while (n-- > 0) {
9841 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +02009842 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009843 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +02009844 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009845 if (j == -1)
9846 break;
9847 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009848 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009849 memcpy(res + rkind * ires,
9850 sbuf + rkind * i,
9851 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009852 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009853 }
9854 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009855 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009856 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009857 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009858 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009859 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009860 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009861 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009862 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009863 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009864 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009865 memcpy(res + rkind * ires,
9866 sbuf + rkind * i,
9867 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +02009868 }
9869 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009870 /* interleave */
9871 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009872 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009873 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009874 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009875 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009876 if (--n <= 0)
9877 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009878 memcpy(res + rkind * ires,
9879 sbuf + rkind * i,
9880 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009881 ires++;
9882 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009883 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009884 memcpy(res + rkind * ires,
9885 sbuf + rkind * i,
9886 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009887 }
Victor Stinner49a0a212011-10-12 23:46:10 +02009888 }
9889
9890 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +02009891 unicode_adjust_maxchar(&u);
9892 if (u == NULL)
9893 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009894 }
Victor Stinner49a0a212011-10-12 23:46:10 +02009895
9896 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009897 if (srelease)
9898 PyMem_FREE(sbuf);
9899 if (release1)
9900 PyMem_FREE(buf1);
9901 if (release2)
9902 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009903 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009904 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009905
Benjamin Peterson29060642009-01-31 22:14:21 +00009906 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00009907 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009908 if (srelease)
9909 PyMem_FREE(sbuf);
9910 if (release1)
9911 PyMem_FREE(buf1);
9912 if (release2)
9913 PyMem_FREE(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009914 if (PyUnicode_CheckExact(self)) {
9915 Py_INCREF(self);
9916 return (PyObject *) self;
9917 }
Victor Stinner034f6cf2011-09-30 02:26:44 +02009918 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009919 error:
9920 if (srelease && sbuf)
9921 PyMem_FREE(sbuf);
9922 if (release1 && buf1)
9923 PyMem_FREE(buf1);
9924 if (release2 && buf2)
9925 PyMem_FREE(buf2);
9926 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009927}
9928
9929/* --- Unicode Object Methods --------------------------------------------- */
9930
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009931PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009932 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009933\n\
9934Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009935characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009936
9937static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +02009938unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009939{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009940 return fixup(self, fixtitle);
9941}
9942
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009943PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009944 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009945\n\
9946Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00009947have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009948
9949static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +02009950unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009951{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009952 return fixup(self, fixcapitalize);
9953}
9954
#if 0
/* Disabled code (compiled out by the surrounding #if 0). */
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Split self with split(self, NULL, -1), replace every list item by the
   result of fixup(item, fixcapitalize), then join the list with
   PyUnicode_Join(NULL, list).  Returns NULL if the split or any fixup call
   fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result;
    Py_ssize_t idx = 0;

    /* Split into words */
    words = split(self, NULL, -1);
    if (!words)
        return NULL;

    /* Capitalize each word */
    while (idx < PyList_GET_SIZE(words)) {
        result = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                       fixcapitalize);
        if (result == NULL) {
            Py_DECREF(words);
            return (PyObject *)result;
        }
        /* release the old item before storing its replacement */
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, result);
        idx++;
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
9992
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009993/* Argument converter. Coerces to a single unicode character */
9994
9995static int
9996convert_uc(PyObject *obj, void *addr)
9997{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009998 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009999 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010000
Benjamin Peterson14339b62009-01-31 16:36:08 +000010001 uniobj = PyUnicode_FromObject(obj);
10002 if (uniobj == NULL) {
10003 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010004 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010005 return 0;
10006 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010007 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010008 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010009 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010010 Py_DECREF(uniobj);
10011 return 0;
10012 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010013 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010014 Py_DECREF(uniobj);
10015 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010016}
10017
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010018PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010019 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010020\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010021Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010022done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010023
10024static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010025unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010026{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010027 Py_ssize_t marg, left;
10028 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010029 Py_UCS4 fillchar = ' ';
10030
Victor Stinnere9a29352011-10-01 02:14:59 +020010031 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010032 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010033
Victor Stinnere9a29352011-10-01 02:14:59 +020010034 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010035 return NULL;
10036
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010037 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010038 Py_INCREF(self);
10039 return (PyObject*) self;
10040 }
10041
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010042 marg = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010043 left = marg / 2 + (marg & width & 1);
10044
Victor Stinner9310abb2011-10-05 00:59:23 +020010045 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010046}
10047
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010048/* This function assumes that str1 and str2 are readied by the caller. */
10049
Marc-André Lemburge5034372000-08-08 08:04:29 +000010050static int
10051unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
10052{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010053 int kind1, kind2;
10054 void *data1, *data2;
10055 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010056
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010057 kind1 = PyUnicode_KIND(str1);
10058 kind2 = PyUnicode_KIND(str2);
10059 data1 = PyUnicode_DATA(str1);
10060 data2 = PyUnicode_DATA(str2);
10061 len1 = PyUnicode_GET_LENGTH(str1);
10062 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010063
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010064 for (i = 0; i < len1 && i < len2; ++i) {
10065 Py_UCS4 c1, c2;
10066 c1 = PyUnicode_READ(kind1, data1, i);
10067 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +000010068
10069 if (c1 != c2)
10070 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010071 }
10072
10073 return (len1 < len2) ? -1 : (len1 != len2);
10074}
10075
Alexander Belopolsky40018472011-02-26 01:02:56 +000010076int
10077PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010078{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010079 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10080 if (PyUnicode_READY(left) == -1 ||
10081 PyUnicode_READY(right) == -1)
10082 return -1;
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010083 return unicode_compare((PyUnicodeObject *)left,
10084 (PyUnicodeObject *)right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010085 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010086 PyErr_Format(PyExc_TypeError,
10087 "Can't compare %.100s and %.100s",
10088 left->ob_type->tp_name,
10089 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010090 return -1;
10091}
10092
Martin v. Löwis5b222132007-06-10 09:51:05 +000010093int
10094PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10095{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010096 Py_ssize_t i;
10097 int kind;
10098 void *data;
10099 Py_UCS4 chr;
10100
Victor Stinner910337b2011-10-03 03:20:16 +020010101 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010102 if (PyUnicode_READY(uni) == -1)
10103 return -1;
10104 kind = PyUnicode_KIND(uni);
10105 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +000010106 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010107 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10108 if (chr != str[i])
10109 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +000010110 /* This check keeps Python strings that end in '\0' from comparing equal
10111 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010112 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +000010113 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010114 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +000010115 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010116 return 0;
10117}
10118
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010119
Benjamin Peterson29060642009-01-31 22:14:21 +000010120#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010121 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010122
Alexander Belopolsky40018472011-02-26 01:02:56 +000010123PyObject *
10124PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010125{
10126 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010127
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010128 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10129 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010130 if (PyUnicode_READY(left) == -1 ||
10131 PyUnicode_READY(right) == -1)
10132 return NULL;
10133 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
10134 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010135 if (op == Py_EQ) {
10136 Py_INCREF(Py_False);
10137 return Py_False;
10138 }
10139 if (op == Py_NE) {
10140 Py_INCREF(Py_True);
10141 return Py_True;
10142 }
10143 }
10144 if (left == right)
10145 result = 0;
10146 else
10147 result = unicode_compare((PyUnicodeObject *)left,
10148 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010149
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010150 /* Convert the return value to a Boolean */
10151 switch (op) {
10152 case Py_EQ:
10153 v = TEST_COND(result == 0);
10154 break;
10155 case Py_NE:
10156 v = TEST_COND(result != 0);
10157 break;
10158 case Py_LE:
10159 v = TEST_COND(result <= 0);
10160 break;
10161 case Py_GE:
10162 v = TEST_COND(result >= 0);
10163 break;
10164 case Py_LT:
10165 v = TEST_COND(result == -1);
10166 break;
10167 case Py_GT:
10168 v = TEST_COND(result == 1);
10169 break;
10170 default:
10171 PyErr_BadArgument();
10172 return NULL;
10173 }
10174 Py_INCREF(v);
10175 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010176 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010177
Brian Curtindfc80e32011-08-10 20:28:54 -050010178 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010179}
10180
Alexander Belopolsky40018472011-02-26 01:02:56 +000010181int
10182PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010183{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010184 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010185 int kind1, kind2, kind;
10186 void *buf1, *buf2;
10187 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010188 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010189
10190 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010191 sub = PyUnicode_FromObject(element);
10192 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010193 PyErr_Format(PyExc_TypeError,
10194 "'in <string>' requires string as left operand, not %s",
10195 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010196 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010197 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010198 if (PyUnicode_READY(sub) == -1)
10199 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010200
Thomas Wouters477c8d52006-05-27 19:21:47 +000010201 str = PyUnicode_FromObject(container);
Victor Stinnere9a29352011-10-01 02:14:59 +020010202 if (!str || PyUnicode_READY(str) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010203 Py_DECREF(sub);
10204 return -1;
10205 }
10206
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010207 kind1 = PyUnicode_KIND(str);
10208 kind2 = PyUnicode_KIND(sub);
10209 kind = kind1 > kind2 ? kind1 : kind2;
10210 buf1 = PyUnicode_DATA(str);
10211 buf2 = PyUnicode_DATA(sub);
10212 if (kind1 != kind)
10213 buf1 = _PyUnicode_AsKind((PyObject*)str, kind);
10214 if (!buf1) {
10215 Py_DECREF(sub);
10216 return -1;
10217 }
10218 if (kind2 != kind)
10219 buf2 = _PyUnicode_AsKind((PyObject*)sub, kind);
10220 if (!buf2) {
10221 Py_DECREF(sub);
10222 if (kind1 != kind) PyMem_Free(buf1);
10223 return -1;
10224 }
10225 len1 = PyUnicode_GET_LENGTH(str);
10226 len2 = PyUnicode_GET_LENGTH(sub);
10227
10228 switch(kind) {
10229 case PyUnicode_1BYTE_KIND:
10230 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10231 break;
10232 case PyUnicode_2BYTE_KIND:
10233 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10234 break;
10235 case PyUnicode_4BYTE_KIND:
10236 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10237 break;
10238 default:
10239 result = -1;
10240 assert(0);
10241 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010242
10243 Py_DECREF(str);
10244 Py_DECREF(sub);
10245
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010246 if (kind1 != kind)
10247 PyMem_Free(buf1);
10248 if (kind2 != kind)
10249 PyMem_Free(buf2);
10250
Guido van Rossum403d68b2000-03-13 15:55:09 +000010251 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010252}
10253
Guido van Rossumd57fd912000-03-10 22:53:23 +000010254/* Concat to string or Unicode object giving a new Unicode object. */
10255
Alexander Belopolsky40018472011-02-26 01:02:56 +000010256PyObject *
10257PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010258{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010259 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010260 Py_UCS4 maxchar, maxchar2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010261
10262 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010263 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010264 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010265 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010266 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010267 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010268 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010269
10270 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010271 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010272 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010273 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010274 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010275 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010276 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010277 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010278 }
10279
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010280 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020010281 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
10282 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010283
Guido van Rossumd57fd912000-03-10 22:53:23 +000010284 /* Concat the two Unicode strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010285 w = PyUnicode_New(
10286 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
10287 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010288 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010289 goto onError;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010290 copy_characters(w, 0, u, 0, PyUnicode_GET_LENGTH(u));
10291 copy_characters(w, PyUnicode_GET_LENGTH(u), v, 0, PyUnicode_GET_LENGTH(v));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010292 Py_DECREF(u);
10293 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010294 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010295 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010296
Benjamin Peterson29060642009-01-31 22:14:21 +000010297 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010298 Py_XDECREF(u);
10299 Py_XDECREF(v);
10300 return NULL;
10301}
10302
Victor Stinnerb0923652011-10-04 01:17:31 +020010303static void
10304unicode_append_inplace(PyObject **p_left, PyObject *right)
10305{
10306 Py_ssize_t left_len, right_len, new_len;
Victor Stinnerb0923652011-10-04 01:17:31 +020010307
10308 assert(PyUnicode_IS_READY(*p_left));
10309 assert(PyUnicode_IS_READY(right));
10310
10311 left_len = PyUnicode_GET_LENGTH(*p_left);
10312 right_len = PyUnicode_GET_LENGTH(right);
10313 if (left_len > PY_SSIZE_T_MAX - right_len) {
10314 PyErr_SetString(PyExc_OverflowError,
10315 "strings are too large to concat");
10316 goto error;
10317 }
10318 new_len = left_len + right_len;
10319
10320 /* Now we own the last reference to 'left', so we can resize it
10321 * in-place.
10322 */
10323 if (unicode_resize(p_left, new_len) != 0) {
10324 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10325 * deallocated so it cannot be put back into
10326 * 'variable'. The MemoryError is raised when there
10327 * is no value in 'variable', which might (very
10328 * remotely) be a cause of incompatibilities.
10329 */
10330 goto error;
10331 }
10332 /* copy 'right' into the newly allocated area of 'left' */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010333 copy_characters(*p_left, left_len, right, 0, right_len);
10334 _PyUnicode_DIRTY(*p_left);
Victor Stinnerb0923652011-10-04 01:17:31 +020010335 return;
10336
10337error:
10338 Py_DECREF(*p_left);
10339 *p_left = NULL;
10340}
10341
Walter Dörwald1ab83302007-05-18 17:15:44 +000010342void
Victor Stinner23e56682011-10-03 03:54:37 +020010343PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010344{
Victor Stinner23e56682011-10-03 03:54:37 +020010345 PyObject *left, *res;
10346
10347 if (p_left == NULL) {
10348 if (!PyErr_Occurred())
10349 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010350 return;
10351 }
Victor Stinner23e56682011-10-03 03:54:37 +020010352 left = *p_left;
10353 if (right == NULL || !PyUnicode_Check(left)) {
10354 if (!PyErr_Occurred())
10355 PyErr_BadInternalCall();
10356 goto error;
10357 }
10358
Victor Stinnere1335c72011-10-04 20:53:03 +020010359 if (PyUnicode_READY(left))
10360 goto error;
10361 if (PyUnicode_READY(right))
10362 goto error;
10363
Victor Stinner23e56682011-10-03 03:54:37 +020010364 if (PyUnicode_CheckExact(left) && left != unicode_empty
10365 && PyUnicode_CheckExact(right) && right != unicode_empty
10366 && unicode_resizable(left)
10367 && (_PyUnicode_KIND(right) <= _PyUnicode_KIND(left)
10368 || _PyUnicode_WSTR(left) != NULL))
10369 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010370 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10371 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020010372 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020010373 not so different than duplicating the string. */
10374 if (!(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
Victor Stinner23e56682011-10-03 03:54:37 +020010375 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010376 unicode_append_inplace(p_left, right);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010377 if (p_left != NULL)
10378 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020010379 return;
10380 }
10381 }
10382
10383 res = PyUnicode_Concat(left, right);
10384 if (res == NULL)
10385 goto error;
10386 Py_DECREF(left);
10387 *p_left = res;
10388 return;
10389
10390error:
10391 Py_DECREF(*p_left);
10392 *p_left = NULL;
Walter Dörwald1ab83302007-05-18 17:15:44 +000010393}
10394
10395void
10396PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10397{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010398 PyUnicode_Append(pleft, right);
10399 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010400}
10401
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010402PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010403 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010404\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010405Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010406string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010407interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010408
10409static PyObject *
10410unicode_count(PyUnicodeObject *self, PyObject *args)
10411{
10412 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010413 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010414 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010415 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010416 int kind1, kind2, kind;
10417 void *buf1, *buf2;
10418 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010419
Jesus Ceaac451502011-04-20 17:09:23 +020010420 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10421 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010422 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010423
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010424 kind1 = PyUnicode_KIND(self);
10425 kind2 = PyUnicode_KIND(substring);
10426 kind = kind1 > kind2 ? kind1 : kind2;
10427 buf1 = PyUnicode_DATA(self);
10428 buf2 = PyUnicode_DATA(substring);
10429 if (kind1 != kind)
10430 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
10431 if (!buf1) {
10432 Py_DECREF(substring);
10433 return NULL;
10434 }
10435 if (kind2 != kind)
10436 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
10437 if (!buf2) {
10438 Py_DECREF(substring);
10439 if (kind1 != kind) PyMem_Free(buf1);
10440 return NULL;
10441 }
10442 len1 = PyUnicode_GET_LENGTH(self);
10443 len2 = PyUnicode_GET_LENGTH(substring);
10444
10445 ADJUST_INDICES(start, end, len1);
10446 switch(kind) {
10447 case PyUnicode_1BYTE_KIND:
10448 iresult = ucs1lib_count(
10449 ((Py_UCS1*)buf1) + start, end - start,
10450 buf2, len2, PY_SSIZE_T_MAX
10451 );
10452 break;
10453 case PyUnicode_2BYTE_KIND:
10454 iresult = ucs2lib_count(
10455 ((Py_UCS2*)buf1) + start, end - start,
10456 buf2, len2, PY_SSIZE_T_MAX
10457 );
10458 break;
10459 case PyUnicode_4BYTE_KIND:
10460 iresult = ucs4lib_count(
10461 ((Py_UCS4*)buf1) + start, end - start,
10462 buf2, len2, PY_SSIZE_T_MAX
10463 );
10464 break;
10465 default:
10466 assert(0); iresult = 0;
10467 }
10468
10469 result = PyLong_FromSsize_t(iresult);
10470
10471 if (kind1 != kind)
10472 PyMem_Free(buf1);
10473 if (kind2 != kind)
10474 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010475
10476 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010477
Guido van Rossumd57fd912000-03-10 22:53:23 +000010478 return result;
10479}
10480
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010481PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010482 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010483\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010484Encode S using the codec registered for encoding. Default encoding\n\
10485is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010486handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010487a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10488'xmlcharrefreplace' as well as any other name registered with\n\
10489codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010490
10491static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +000010492unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010493{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010494 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010495 char *encoding = NULL;
10496 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010497
Benjamin Peterson308d6372009-09-18 21:42:35 +000010498 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10499 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010500 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +000010501 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010502}
10503
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010504PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010505 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010506\n\
10507Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010508If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010509
10510static PyObject*
10511unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
10512{
Antoine Pitroue71d5742011-10-04 15:55:09 +020010513 Py_ssize_t i, j, line_pos, src_len, incr;
10514 Py_UCS4 ch;
10515 PyObject *u;
10516 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010517 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010518 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010519 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010520
10521 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010522 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010523
Antoine Pitrou22425222011-10-04 19:10:51 +020010524 if (PyUnicode_READY(self) == -1)
10525 return NULL;
10526
Thomas Wouters7e474022000-07-16 12:04:32 +000010527 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010528 src_len = PyUnicode_GET_LENGTH(self);
10529 i = j = line_pos = 0;
10530 kind = PyUnicode_KIND(self);
10531 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020010532 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010533 for (; i < src_len; i++) {
10534 ch = PyUnicode_READ(kind, src_data, i);
10535 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020010536 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000010537 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010538 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000010539 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010540 goto overflow;
10541 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010542 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010543 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010544 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010545 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000010546 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010547 goto overflow;
10548 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010549 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010550 if (ch == '\n' || ch == '\r')
10551 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010552 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010553 }
Antoine Pitroue19aa382011-10-04 16:04:01 +020010554 if (!found && PyUnicode_CheckExact(self)) {
10555 Py_INCREF((PyObject *) self);
10556 return (PyObject *) self;
10557 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +000010558
Guido van Rossumd57fd912000-03-10 22:53:23 +000010559 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010560 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010561 if (!u)
10562 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010563 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010564
Antoine Pitroue71d5742011-10-04 15:55:09 +020010565 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010566
Antoine Pitroue71d5742011-10-04 15:55:09 +020010567 for (; i < src_len; i++) {
10568 ch = PyUnicode_READ(kind, src_data, i);
10569 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010570 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010571 incr = tabsize - (line_pos % tabsize);
10572 line_pos += incr;
10573 while (incr--) {
10574 PyUnicode_WRITE(kind, dest_data, j, ' ');
10575 j++;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010576 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010577 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010578 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010579 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010580 line_pos++;
10581 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010582 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010583 if (ch == '\n' || ch == '\r')
10584 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010585 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010586 }
10587 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinner17efeed2011-10-04 20:05:46 +020010588#ifndef DONT_MAKE_RESULT_READY
10589 if (_PyUnicode_READY_REPLACE(&u)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010590 Py_DECREF(u);
10591 return NULL;
10592 }
Victor Stinner17efeed2011-10-04 20:05:46 +020010593#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010594 assert(_PyUnicode_CheckConsistency(u, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010595 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010596
Antoine Pitroue71d5742011-10-04 15:55:09 +020010597 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010598 PyErr_SetString(PyExc_OverflowError, "new string is too long");
10599 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010600}
10601
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010602PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010603 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010604\n\
10605Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010606such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010607arguments start and end are interpreted as in slice notation.\n\
10608\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010609Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010610
10611static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010612unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010613{
Jesus Ceaac451502011-04-20 17:09:23 +020010614 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010615 Py_ssize_t start;
10616 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010617 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010618
Jesus Ceaac451502011-04-20 17:09:23 +020010619 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
10620 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010621 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010622
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010623 if (PyUnicode_READY(self) == -1)
10624 return NULL;
10625 if (PyUnicode_READY(substring) == -1)
10626 return NULL;
10627
Victor Stinner794d5672011-10-10 03:21:36 +020010628 result = any_find_slice(1,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010629 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010630 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010631
10632 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010633
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010634 if (result == -2)
10635 return NULL;
10636
Christian Heimes217cfd12007-12-02 14:31:20 +000010637 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010638}
10639
10640static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010641unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010642{
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010643 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
10644 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010645 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010646 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010647}
10648
Guido van Rossumc2504932007-09-18 19:42:40 +000010649/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010010650 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000010651static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +000010652unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010653{
Guido van Rossumc2504932007-09-18 19:42:40 +000010654 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010010655 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010656
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010657 if (_PyUnicode_HASH(self) != -1)
10658 return _PyUnicode_HASH(self);
10659 if (PyUnicode_READY(self) == -1)
10660 return -1;
10661 len = PyUnicode_GET_LENGTH(self);
10662
10663 /* The hash function as a macro, gets expanded three times below. */
10664#define HASH(P) \
10665 x = (Py_uhash_t)*P << 7; \
10666 while (--len >= 0) \
10667 x = (1000003*x) ^ (Py_uhash_t)*P++;
10668
10669 switch (PyUnicode_KIND(self)) {
10670 case PyUnicode_1BYTE_KIND: {
10671 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
10672 HASH(c);
10673 break;
10674 }
10675 case PyUnicode_2BYTE_KIND: {
10676 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
10677 HASH(s);
10678 break;
10679 }
10680 default: {
10681 Py_UCS4 *l;
10682 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
10683 "Impossible switch case in unicode_hash");
10684 l = PyUnicode_4BYTE_DATA(self);
10685 HASH(l);
10686 break;
10687 }
10688 }
10689 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
10690
Guido van Rossumc2504932007-09-18 19:42:40 +000010691 if (x == -1)
10692 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010693 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010694 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010695}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010696#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000010697
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010698PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010699 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010700\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010701Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010702
10703static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010704unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010705{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010706 Py_ssize_t result;
Jesus Ceaac451502011-04-20 17:09:23 +020010707 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010708 Py_ssize_t start;
10709 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010710
Jesus Ceaac451502011-04-20 17:09:23 +020010711 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
10712 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010713 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010714
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010715 if (PyUnicode_READY(self) == -1)
10716 return NULL;
10717 if (PyUnicode_READY(substring) == -1)
10718 return NULL;
10719
Victor Stinner794d5672011-10-10 03:21:36 +020010720 result = any_find_slice(1,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010721 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010722 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010723
10724 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010725
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010726 if (result == -2)
10727 return NULL;
10728
Guido van Rossumd57fd912000-03-10 22:53:23 +000010729 if (result < 0) {
10730 PyErr_SetString(PyExc_ValueError, "substring not found");
10731 return NULL;
10732 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010733
Christian Heimes217cfd12007-12-02 14:31:20 +000010734 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010735}
10736
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010737PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010738 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010739\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010740Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010741at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010742
10743static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010744unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010745{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010746 Py_ssize_t i, length;
10747 int kind;
10748 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010749 int cased;
10750
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010751 if (PyUnicode_READY(self) == -1)
10752 return NULL;
10753 length = PyUnicode_GET_LENGTH(self);
10754 kind = PyUnicode_KIND(self);
10755 data = PyUnicode_DATA(self);
10756
Guido van Rossumd57fd912000-03-10 22:53:23 +000010757 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010758 if (length == 1)
10759 return PyBool_FromLong(
10760 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010761
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010762 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010763 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010764 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010765
Guido van Rossumd57fd912000-03-10 22:53:23 +000010766 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010767 for (i = 0; i < length; i++) {
10768 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010769
Benjamin Peterson29060642009-01-31 22:14:21 +000010770 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
10771 return PyBool_FromLong(0);
10772 else if (!cased && Py_UNICODE_ISLOWER(ch))
10773 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010774 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010775 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010776}
10777
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010778PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010779 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010780\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010781Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010782at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010783
10784static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010785unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010786{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010787 Py_ssize_t i, length;
10788 int kind;
10789 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010790 int cased;
10791
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010792 if (PyUnicode_READY(self) == -1)
10793 return NULL;
10794 length = PyUnicode_GET_LENGTH(self);
10795 kind = PyUnicode_KIND(self);
10796 data = PyUnicode_DATA(self);
10797
Guido van Rossumd57fd912000-03-10 22:53:23 +000010798 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010799 if (length == 1)
10800 return PyBool_FromLong(
10801 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010802
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010803 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010804 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010805 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010806
Guido van Rossumd57fd912000-03-10 22:53:23 +000010807 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010808 for (i = 0; i < length; i++) {
10809 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010810
Benjamin Peterson29060642009-01-31 22:14:21 +000010811 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
10812 return PyBool_FromLong(0);
10813 else if (!cased && Py_UNICODE_ISUPPER(ch))
10814 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010815 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010816 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010817}
10818
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010819PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010820 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010821\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010822Return True if S is a titlecased string and there is at least one\n\
10823character in S, i.e. upper- and titlecase characters may only\n\
10824follow uncased characters and lowercase characters only cased ones.\n\
10825Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010826
10827static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010828unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010829{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010830 Py_ssize_t i, length;
10831 int kind;
10832 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010833 int cased, previous_is_cased;
10834
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010835 if (PyUnicode_READY(self) == -1)
10836 return NULL;
10837 length = PyUnicode_GET_LENGTH(self);
10838 kind = PyUnicode_KIND(self);
10839 data = PyUnicode_DATA(self);
10840
Guido van Rossumd57fd912000-03-10 22:53:23 +000010841 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010842 if (length == 1) {
10843 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10844 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
10845 (Py_UNICODE_ISUPPER(ch) != 0));
10846 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010847
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010848 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010849 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010850 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010851
Guido van Rossumd57fd912000-03-10 22:53:23 +000010852 cased = 0;
10853 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010854 for (i = 0; i < length; i++) {
10855 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010856
Benjamin Peterson29060642009-01-31 22:14:21 +000010857 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
10858 if (previous_is_cased)
10859 return PyBool_FromLong(0);
10860 previous_is_cased = 1;
10861 cased = 1;
10862 }
10863 else if (Py_UNICODE_ISLOWER(ch)) {
10864 if (!previous_is_cased)
10865 return PyBool_FromLong(0);
10866 previous_is_cased = 1;
10867 cased = 1;
10868 }
10869 else
10870 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010871 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010872 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010873}
10874
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010875PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010876 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010877\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010878Return True if all characters in S are whitespace\n\
10879and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010880
10881static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010882unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010883{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010884 Py_ssize_t i, length;
10885 int kind;
10886 void *data;
10887
10888 if (PyUnicode_READY(self) == -1)
10889 return NULL;
10890 length = PyUnicode_GET_LENGTH(self);
10891 kind = PyUnicode_KIND(self);
10892 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010893
Guido van Rossumd57fd912000-03-10 22:53:23 +000010894 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010895 if (length == 1)
10896 return PyBool_FromLong(
10897 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010898
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010899 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010900 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010901 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010902
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010903 for (i = 0; i < length; i++) {
10904 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010905 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010906 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010907 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010908 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010909}
10910
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010911PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010912 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010913\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010914Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010915and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010916
10917static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010918unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010919{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010920 Py_ssize_t i, length;
10921 int kind;
10922 void *data;
10923
10924 if (PyUnicode_READY(self) == -1)
10925 return NULL;
10926 length = PyUnicode_GET_LENGTH(self);
10927 kind = PyUnicode_KIND(self);
10928 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010929
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010930 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010931 if (length == 1)
10932 return PyBool_FromLong(
10933 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010934
10935 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010936 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010937 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010938
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010939 for (i = 0; i < length; i++) {
10940 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010941 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010942 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010943 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010944}
10945
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010946PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010947 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010948\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010949Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010950and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010951
10952static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010953unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010954{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010955 int kind;
10956 void *data;
10957 Py_ssize_t len, i;
10958
10959 if (PyUnicode_READY(self) == -1)
10960 return NULL;
10961
10962 kind = PyUnicode_KIND(self);
10963 data = PyUnicode_DATA(self);
10964 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010965
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010966 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010967 if (len == 1) {
10968 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10969 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
10970 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010971
10972 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010973 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010974 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010975
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010976 for (i = 0; i < len; i++) {
10977 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010978 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010979 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010980 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010981 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010982}
10983
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010984PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010985 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010986\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010987Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010988False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010989
10990static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010991unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010992{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010993 Py_ssize_t i, length;
10994 int kind;
10995 void *data;
10996
10997 if (PyUnicode_READY(self) == -1)
10998 return NULL;
10999 length = PyUnicode_GET_LENGTH(self);
11000 kind = PyUnicode_KIND(self);
11001 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011002
Guido van Rossumd57fd912000-03-10 22:53:23 +000011003 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011004 if (length == 1)
11005 return PyBool_FromLong(
11006 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011007
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011008 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011009 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011010 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011011
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011012 for (i = 0; i < length; i++) {
11013 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011014 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011015 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011016 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011017}
11018
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011019PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011020 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011021\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011022Return True if all characters in S are digits\n\
11023and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011024
11025static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011026unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011027{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011028 Py_ssize_t i, length;
11029 int kind;
11030 void *data;
11031
11032 if (PyUnicode_READY(self) == -1)
11033 return NULL;
11034 length = PyUnicode_GET_LENGTH(self);
11035 kind = PyUnicode_KIND(self);
11036 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011037
Guido van Rossumd57fd912000-03-10 22:53:23 +000011038 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011039 if (length == 1) {
11040 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11041 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11042 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011043
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011044 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011045 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011046 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011047
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011048 for (i = 0; i < length; i++) {
11049 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011050 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011051 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011052 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011053}
11054
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011055PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011056 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011057\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011058Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011059False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011060
11061static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011062unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011063{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011064 Py_ssize_t i, length;
11065 int kind;
11066 void *data;
11067
11068 if (PyUnicode_READY(self) == -1)
11069 return NULL;
11070 length = PyUnicode_GET_LENGTH(self);
11071 kind = PyUnicode_KIND(self);
11072 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011073
Guido van Rossumd57fd912000-03-10 22:53:23 +000011074 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011075 if (length == 1)
11076 return PyBool_FromLong(
11077 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011078
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011079 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011080 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011081 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011082
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011083 for (i = 0; i < length; i++) {
11084 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011085 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011086 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011087 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011088}
11089
Martin v. Löwis47383402007-08-15 07:32:56 +000011090int
11091PyUnicode_IsIdentifier(PyObject *self)
11092{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011093 int kind;
11094 void *data;
11095 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011096 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011097
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011098 if (PyUnicode_READY(self) == -1) {
11099 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011100 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011101 }
11102
11103 /* Special case for empty strings */
11104 if (PyUnicode_GET_LENGTH(self) == 0)
11105 return 0;
11106 kind = PyUnicode_KIND(self);
11107 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011108
11109 /* PEP 3131 says that the first character must be in
11110 XID_Start and subsequent characters in XID_Continue,
11111 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011112 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011113 letters, digits, underscore). However, given the current
11114 definition of XID_Start and XID_Continue, it is sufficient
11115 to check just for these, except that _ must be allowed
11116 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011117 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011118 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011119 return 0;
11120
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011121 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011122 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011123 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011124 return 1;
11125}
11126
11127PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011128 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011129\n\
11130Return True if S is a valid identifier according\n\
11131to the language definition.");
11132
11133static PyObject*
11134unicode_isidentifier(PyObject *self)
11135{
11136 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11137}
11138
Georg Brandl559e5d72008-06-11 18:37:52 +000011139PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011140 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011141\n\
11142Return True if all characters in S are considered\n\
11143printable in repr() or S is empty, False otherwise.");
11144
11145static PyObject*
11146unicode_isprintable(PyObject *self)
11147{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011148 Py_ssize_t i, length;
11149 int kind;
11150 void *data;
11151
11152 if (PyUnicode_READY(self) == -1)
11153 return NULL;
11154 length = PyUnicode_GET_LENGTH(self);
11155 kind = PyUnicode_KIND(self);
11156 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011157
11158 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011159 if (length == 1)
11160 return PyBool_FromLong(
11161 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011162
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011163 for (i = 0; i < length; i++) {
11164 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011165 Py_RETURN_FALSE;
11166 }
11167 }
11168 Py_RETURN_TRUE;
11169}
11170
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011171PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011172 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011173\n\
11174Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011175iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011176
11177static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011178unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011179{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011180 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011181}
11182
Martin v. Löwis18e16552006-02-15 17:27:45 +000011183static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +000011184unicode_length(PyUnicodeObject *self)
11185{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011186 if (PyUnicode_READY(self) == -1)
11187 return -1;
11188 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011189}
11190
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011191PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011192 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011193\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011194Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011195done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011196
11197static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011198unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011199{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011200 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011201 Py_UCS4 fillchar = ' ';
11202
11203 if (PyUnicode_READY(self) == -1)
11204 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011205
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011206 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011207 return NULL;
11208
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011209 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011210 Py_INCREF(self);
11211 return (PyObject*) self;
11212 }
11213
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011214 return (PyObject*) pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011215}
11216
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011217PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011218 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011219\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011220Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011221
11222static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011223unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011224{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011225 return fixup(self, fixlower);
11226}
11227
/* Selectors telling the strip helpers which end(s) of the string to trim. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by the selectors above. */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for a selector: its format string minus the "|O:" prefix. */
#define STRIPNAME(i) (stripformat[i]+3)
11236
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011237/* externally visible for str.strip(unicode) */
11238PyObject *
11239_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
11240{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011241 void *data;
11242 int kind;
11243 Py_ssize_t i, j, len;
11244 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011245
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011246 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11247 return NULL;
11248
11249 kind = PyUnicode_KIND(self);
11250 data = PyUnicode_DATA(self);
11251 len = PyUnicode_GET_LENGTH(self);
11252 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11253 PyUnicode_DATA(sepobj),
11254 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011255
Benjamin Peterson14339b62009-01-31 16:36:08 +000011256 i = 0;
11257 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011258 while (i < len &&
11259 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011260 i++;
11261 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011262 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011263
Benjamin Peterson14339b62009-01-31 16:36:08 +000011264 j = len;
11265 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011266 do {
11267 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011268 } while (j >= i &&
11269 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000011270 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011271 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011272
Victor Stinner12bab6d2011-10-01 01:53:49 +020011273 return PyUnicode_Substring((PyObject*)self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011274}
11275
11276PyObject*
11277PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11278{
11279 unsigned char *data;
11280 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011281 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011282
Victor Stinnerde636f32011-10-01 03:55:54 +020011283 if (PyUnicode_READY(self) == -1)
11284 return NULL;
11285
11286 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
11287
Victor Stinner12bab6d2011-10-01 01:53:49 +020011288 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011289 {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011290 if (PyUnicode_CheckExact(self)) {
11291 Py_INCREF(self);
11292 return self;
11293 }
11294 else
11295 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011296 }
11297
Victor Stinner12bab6d2011-10-01 01:53:49 +020011298 length = end - start;
11299 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011300 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011301
Victor Stinnerde636f32011-10-01 03:55:54 +020011302 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011303 PyErr_SetString(PyExc_IndexError, "string index out of range");
11304 return NULL;
11305 }
11306
Victor Stinnerb9275c12011-10-05 14:01:42 +020011307 if (PyUnicode_IS_ASCII(self)) {
11308 kind = PyUnicode_KIND(self);
11309 data = PyUnicode_1BYTE_DATA(self);
11310 return unicode_fromascii(data + start, length);
11311 }
11312 else {
11313 kind = PyUnicode_KIND(self);
11314 data = PyUnicode_1BYTE_DATA(self);
11315 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011316 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020011317 length);
11318 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011319}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011320
11321static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011322do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011323{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011324 int kind;
11325 void *data;
11326 Py_ssize_t len, i, j;
11327
11328 if (PyUnicode_READY(self) == -1)
11329 return NULL;
11330
11331 kind = PyUnicode_KIND(self);
11332 data = PyUnicode_DATA(self);
11333 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011334
Benjamin Peterson14339b62009-01-31 16:36:08 +000011335 i = 0;
11336 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011337 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011338 i++;
11339 }
11340 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011341
Benjamin Peterson14339b62009-01-31 16:36:08 +000011342 j = len;
11343 if (striptype != LEFTSTRIP) {
11344 do {
11345 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011346 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011347 j++;
11348 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011349
Victor Stinner12bab6d2011-10-01 01:53:49 +020011350 return PyUnicode_Substring((PyObject*)self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011351}
11352
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011353
11354static PyObject *
11355do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
11356{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011357 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011358
Benjamin Peterson14339b62009-01-31 16:36:08 +000011359 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11360 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011361
Benjamin Peterson14339b62009-01-31 16:36:08 +000011362 if (sep != NULL && sep != Py_None) {
11363 if (PyUnicode_Check(sep))
11364 return _PyUnicode_XStrip(self, striptype, sep);
11365 else {
11366 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011367 "%s arg must be None or str",
11368 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011369 return NULL;
11370 }
11371 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011372
Benjamin Peterson14339b62009-01-31 16:36:08 +000011373 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011374}
11375
11376
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011377PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011378 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011379\n\
11380Return a copy of the string S with leading and trailing\n\
11381whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011382If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011383
11384static PyObject *
11385unicode_strip(PyUnicodeObject *self, PyObject *args)
11386{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011387 if (PyTuple_GET_SIZE(args) == 0)
11388 return do_strip(self, BOTHSTRIP); /* Common case */
11389 else
11390 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011391}
11392
11393
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011394PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011395 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011396\n\
11397Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011398If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011399
11400static PyObject *
11401unicode_lstrip(PyUnicodeObject *self, PyObject *args)
11402{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011403 if (PyTuple_GET_SIZE(args) == 0)
11404 return do_strip(self, LEFTSTRIP); /* Common case */
11405 else
11406 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011407}
11408
11409
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011410PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011411 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011412\n\
11413Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011414If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011415
11416static PyObject *
11417unicode_rstrip(PyUnicodeObject *self, PyObject *args)
11418{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011419 if (PyTuple_GET_SIZE(args) == 0)
11420 return do_strip(self, RIGHTSTRIP); /* Common case */
11421 else
11422 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011423}
11424
11425
Guido van Rossumd57fd912000-03-10 22:53:23 +000011426static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +000011427unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011428{
11429 PyUnicodeObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011430 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011431
Georg Brandl222de0f2009-04-12 12:01:50 +000011432 if (len < 1) {
11433 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020011434 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000011435 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011436
Tim Peters7a29bd52001-09-12 03:03:31 +000011437 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011438 /* no repeat, return original string */
11439 Py_INCREF(str);
11440 return (PyObject*) str;
11441 }
Tim Peters8f422462000-09-09 06:13:41 +000011442
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011443 if (PyUnicode_READY(str) == -1)
11444 return NULL;
11445
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011446 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011447 PyErr_SetString(PyExc_OverflowError,
11448 "repeated string is too long");
11449 return NULL;
11450 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011451 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011452
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011453 u = (PyUnicodeObject *)PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011454 if (!u)
11455 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011456 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011457
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011458 if (PyUnicode_GET_LENGTH(str) == 1) {
11459 const int kind = PyUnicode_KIND(str);
11460 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
11461 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011462 if (kind == PyUnicode_1BYTE_KIND)
11463 memset(to, (unsigned char)fill_char, len);
11464 else {
11465 for (n = 0; n < len; ++n)
11466 PyUnicode_WRITE(kind, to, n, fill_char);
11467 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011468 }
11469 else {
11470 /* number of characters copied this far */
11471 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011472 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011473 char *to = (char *) PyUnicode_DATA(u);
11474 Py_MEMCPY(to, PyUnicode_DATA(str),
11475 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011476 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011477 n = (done <= nchars-done) ? done : nchars-done;
11478 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011479 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011480 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011481 }
11482
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011483 assert(_PyUnicode_CheckConsistency(u, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011484 return (PyObject*) u;
11485}
11486
Alexander Belopolsky40018472011-02-26 01:02:56 +000011487PyObject *
11488PyUnicode_Replace(PyObject *obj,
11489 PyObject *subobj,
11490 PyObject *replobj,
11491 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011492{
11493 PyObject *self;
11494 PyObject *str1;
11495 PyObject *str2;
11496 PyObject *result;
11497
11498 self = PyUnicode_FromObject(obj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011499 if (self == NULL || PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011500 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011501 str1 = PyUnicode_FromObject(subobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011502 if (str1 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011503 Py_DECREF(self);
11504 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011505 }
11506 str2 = PyUnicode_FromObject(replobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011507 if (str2 == NULL || PyUnicode_READY(str2)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011508 Py_DECREF(self);
11509 Py_DECREF(str1);
11510 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011511 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011512 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011513 Py_DECREF(self);
11514 Py_DECREF(str1);
11515 Py_DECREF(str2);
11516 return result;
11517}
11518
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011519PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011520 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011521\n\
11522Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011523old replaced by new. If the optional argument count is\n\
11524given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011525
11526static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011527unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011528{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011529 PyObject *str1;
11530 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011531 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011532 PyObject *result;
11533
Martin v. Löwis18e16552006-02-15 17:27:45 +000011534 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011535 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011536 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011537 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011538 str1 = PyUnicode_FromObject(str1);
11539 if (str1 == NULL || PyUnicode_READY(str1) == -1)
11540 return NULL;
11541 str2 = PyUnicode_FromObject(str2);
Victor Stinnere9a29352011-10-01 02:14:59 +020011542 if (str2 == NULL || PyUnicode_READY(str2) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011543 Py_DECREF(str1);
11544 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011545 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011546
11547 result = replace(self, str1, str2, maxcount);
11548
11549 Py_DECREF(str1);
11550 Py_DECREF(str2);
11551 return result;
11552}
11553
Alexander Belopolsky40018472011-02-26 01:02:56 +000011554static PyObject *
11555unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011556{
Walter Dörwald79e913e2007-05-12 11:08:06 +000011557 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011558 Py_ssize_t isize;
11559 Py_ssize_t osize, squote, dquote, i, o;
11560 Py_UCS4 max, quote;
11561 int ikind, okind;
11562 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000011563
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011564 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000011565 return NULL;
11566
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011567 isize = PyUnicode_GET_LENGTH(unicode);
11568 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011569
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011570 /* Compute length of output, quote characters, and
11571 maximum character */
11572 osize = 2; /* quotes */
11573 max = 127;
11574 squote = dquote = 0;
11575 ikind = PyUnicode_KIND(unicode);
11576 for (i = 0; i < isize; i++) {
11577 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11578 switch (ch) {
11579 case '\'': squote++; osize++; break;
11580 case '"': dquote++; osize++; break;
11581 case '\\': case '\t': case '\r': case '\n':
11582 osize += 2; break;
11583 default:
11584 /* Fast-path ASCII */
11585 if (ch < ' ' || ch == 0x7f)
11586 osize += 4; /* \xHH */
11587 else if (ch < 0x7f)
11588 osize++;
11589 else if (Py_UNICODE_ISPRINTABLE(ch)) {
11590 osize++;
11591 max = ch > max ? ch : max;
11592 }
11593 else if (ch < 0x100)
11594 osize += 4; /* \xHH */
11595 else if (ch < 0x10000)
11596 osize += 6; /* \uHHHH */
11597 else
11598 osize += 10; /* \uHHHHHHHH */
11599 }
11600 }
11601
11602 quote = '\'';
11603 if (squote) {
11604 if (dquote)
11605 /* Both squote and dquote present. Use squote,
11606 and escape them */
11607 osize += squote;
11608 else
11609 quote = '"';
11610 }
11611
11612 repr = PyUnicode_New(osize, max);
11613 if (repr == NULL)
11614 return NULL;
11615 okind = PyUnicode_KIND(repr);
11616 odata = PyUnicode_DATA(repr);
11617
11618 PyUnicode_WRITE(okind, odata, 0, quote);
11619 PyUnicode_WRITE(okind, odata, osize-1, quote);
11620
11621 for (i = 0, o = 1; i < isize; i++) {
11622 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011623
11624 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011625 if ((ch == quote) || (ch == '\\')) {
11626 PyUnicode_WRITE(okind, odata, o++, '\\');
11627 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011628 continue;
11629 }
11630
Benjamin Peterson29060642009-01-31 22:14:21 +000011631 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011632 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011633 PyUnicode_WRITE(okind, odata, o++, '\\');
11634 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011635 }
11636 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011637 PyUnicode_WRITE(okind, odata, o++, '\\');
11638 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011639 }
11640 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011641 PyUnicode_WRITE(okind, odata, o++, '\\');
11642 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011643 }
11644
11645 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011646 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011647 PyUnicode_WRITE(okind, odata, o++, '\\');
11648 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020011649 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
11650 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011651 }
11652
Georg Brandl559e5d72008-06-11 18:37:52 +000011653 /* Copy ASCII characters as-is */
11654 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011655 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011656 }
11657
Benjamin Peterson29060642009-01-31 22:14:21 +000011658 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000011659 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011660 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000011661 (categories Z* and C* except ASCII space)
11662 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011663 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011664 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011665 if (ch <= 0xff) {
11666 PyUnicode_WRITE(okind, odata, o++, '\\');
11667 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020011668 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
11669 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011670 }
11671 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011672 else if (ch >= 0x10000) {
11673 PyUnicode_WRITE(okind, odata, o++, '\\');
11674 PyUnicode_WRITE(okind, odata, o++, 'U');
Victor Stinnerf5cff562011-10-14 02:13:11 +020011675 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
11676 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
11677 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
11678 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
11679 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
11680 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
11681 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
11682 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011683 }
11684 /* Map 16-bit characters to '\uxxxx' */
11685 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011686 PyUnicode_WRITE(okind, odata, o++, '\\');
11687 PyUnicode_WRITE(okind, odata, o++, 'u');
Victor Stinnerf5cff562011-10-14 02:13:11 +020011688 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
11689 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
11690 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
11691 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011692 }
11693 }
11694 /* Copy characters as-is */
11695 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011696 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011697 }
11698 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000011699 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011700 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020011701 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000011702 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011703}
11704
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011705PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011706 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011707\n\
11708Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011709such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011710arguments start and end are interpreted as in slice notation.\n\
11711\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011712Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011713
11714static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011715unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011716{
Jesus Ceaac451502011-04-20 17:09:23 +020011717 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011718 Py_ssize_t start;
11719 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011720 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011721
Jesus Ceaac451502011-04-20 17:09:23 +020011722 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
11723 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011724 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011725
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011726 if (PyUnicode_READY(self) == -1)
11727 return NULL;
11728 if (PyUnicode_READY(substring) == -1)
11729 return NULL;
11730
Victor Stinner794d5672011-10-10 03:21:36 +020011731 result = any_find_slice(-1,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011732 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011733 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011734
11735 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011736
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011737 if (result == -2)
11738 return NULL;
11739
Christian Heimes217cfd12007-12-02 14:31:20 +000011740 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011741}
11742
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011743PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011744 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011745\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011746Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011747
11748static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011749unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011750{
Jesus Ceaac451502011-04-20 17:09:23 +020011751 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011752 Py_ssize_t start;
11753 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011754 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011755
Jesus Ceaac451502011-04-20 17:09:23 +020011756 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
11757 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011758 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011759
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011760 if (PyUnicode_READY(self) == -1)
11761 return NULL;
11762 if (PyUnicode_READY(substring) == -1)
11763 return NULL;
11764
Victor Stinner794d5672011-10-10 03:21:36 +020011765 result = any_find_slice(-1,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011766 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011767 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011768
11769 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011770
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011771 if (result == -2)
11772 return NULL;
11773
Guido van Rossumd57fd912000-03-10 22:53:23 +000011774 if (result < 0) {
11775 PyErr_SetString(PyExc_ValueError, "substring not found");
11776 return NULL;
11777 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011778
Christian Heimes217cfd12007-12-02 14:31:20 +000011779 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011780}
11781
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011782PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011783 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011784\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011785Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011786done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011787
11788static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011789unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011790{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011791 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011792 Py_UCS4 fillchar = ' ';
11793
Victor Stinnere9a29352011-10-01 02:14:59 +020011794 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011795 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011796
Victor Stinnere9a29352011-10-01 02:14:59 +020011797 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011798 return NULL;
11799
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011800 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011801 Py_INCREF(self);
11802 return (PyObject*) self;
11803 }
11804
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011805 return (PyObject*) pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011806}
11807
Alexander Belopolsky40018472011-02-26 01:02:56 +000011808PyObject *
11809PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011810{
11811 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000011812
Guido van Rossumd57fd912000-03-10 22:53:23 +000011813 s = PyUnicode_FromObject(s);
11814 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011815 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011816 if (sep != NULL) {
11817 sep = PyUnicode_FromObject(sep);
11818 if (sep == NULL) {
11819 Py_DECREF(s);
11820 return NULL;
11821 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011822 }
11823
Victor Stinner9310abb2011-10-05 00:59:23 +020011824 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011825
11826 Py_DECREF(s);
11827 Py_XDECREF(sep);
11828 return result;
11829}
11830
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011831PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011832 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011833\n\
11834Return a list of the words in S, using sep as the\n\
11835delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000011836splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000011837whitespace string is a separator and empty strings are\n\
11838removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011839
11840static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011841unicode_split(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011842{
11843 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011844 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011845
Martin v. Löwis18e16552006-02-15 17:27:45 +000011846 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011847 return NULL;
11848
11849 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011850 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011851 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020011852 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011853 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011854 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011855}
11856
Thomas Wouters477c8d52006-05-27 19:21:47 +000011857PyObject *
11858PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
11859{
11860 PyObject* str_obj;
11861 PyObject* sep_obj;
11862 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011863 int kind1, kind2, kind;
11864 void *buf1 = NULL, *buf2 = NULL;
11865 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011866
11867 str_obj = PyUnicode_FromObject(str_in);
Victor Stinnere9a29352011-10-01 02:14:59 +020011868 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011869 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011870 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011871 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000011872 Py_DECREF(str_obj);
11873 return NULL;
11874 }
11875
Victor Stinner14f8f022011-10-05 20:58:25 +020011876 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011877 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020011878 kind = Py_MAX(kind1, kind2);
11879 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011880 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020011881 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011882 if (!buf1)
11883 goto onError;
11884 buf2 = PyUnicode_DATA(sep_obj);
11885 if (kind2 != kind)
11886 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11887 if (!buf2)
11888 goto onError;
11889 len1 = PyUnicode_GET_LENGTH(str_obj);
11890 len2 = PyUnicode_GET_LENGTH(sep_obj);
11891
Victor Stinner14f8f022011-10-05 20:58:25 +020011892 switch(PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011893 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020011894 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
11895 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11896 else
11897 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011898 break;
11899 case PyUnicode_2BYTE_KIND:
11900 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11901 break;
11902 case PyUnicode_4BYTE_KIND:
11903 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11904 break;
11905 default:
11906 assert(0);
11907 out = 0;
11908 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011909
11910 Py_DECREF(sep_obj);
11911 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011912 if (kind1 != kind)
11913 PyMem_Free(buf1);
11914 if (kind2 != kind)
11915 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011916
11917 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011918 onError:
11919 Py_DECREF(sep_obj);
11920 Py_DECREF(str_obj);
11921 if (kind1 != kind && buf1)
11922 PyMem_Free(buf1);
11923 if (kind2 != kind && buf2)
11924 PyMem_Free(buf2);
11925 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011926}
11927
11928
11929PyObject *
11930PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
11931{
11932 PyObject* str_obj;
11933 PyObject* sep_obj;
11934 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011935 int kind1, kind2, kind;
11936 void *buf1 = NULL, *buf2 = NULL;
11937 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011938
11939 str_obj = PyUnicode_FromObject(str_in);
11940 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000011941 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011942 sep_obj = PyUnicode_FromObject(sep_in);
11943 if (!sep_obj) {
11944 Py_DECREF(str_obj);
11945 return NULL;
11946 }
11947
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011948 kind1 = PyUnicode_KIND(str_in);
11949 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020011950 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011951 buf1 = PyUnicode_DATA(str_in);
11952 if (kind1 != kind)
11953 buf1 = _PyUnicode_AsKind(str_in, kind);
11954 if (!buf1)
11955 goto onError;
11956 buf2 = PyUnicode_DATA(sep_obj);
11957 if (kind2 != kind)
11958 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11959 if (!buf2)
11960 goto onError;
11961 len1 = PyUnicode_GET_LENGTH(str_obj);
11962 len2 = PyUnicode_GET_LENGTH(sep_obj);
11963
11964 switch(PyUnicode_KIND(str_in)) {
11965 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020011966 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
11967 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11968 else
11969 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011970 break;
11971 case PyUnicode_2BYTE_KIND:
11972 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11973 break;
11974 case PyUnicode_4BYTE_KIND:
11975 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11976 break;
11977 default:
11978 assert(0);
11979 out = 0;
11980 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011981
11982 Py_DECREF(sep_obj);
11983 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011984 if (kind1 != kind)
11985 PyMem_Free(buf1);
11986 if (kind2 != kind)
11987 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011988
11989 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011990 onError:
11991 Py_DECREF(sep_obj);
11992 Py_DECREF(str_obj);
11993 if (kind1 != kind && buf1)
11994 PyMem_Free(buf1);
11995 if (kind2 != kind && buf2)
11996 PyMem_Free(buf2);
11997 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011998}
11999
12000PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012001 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012002\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012003Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012004the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012005found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012006
12007static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012008unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012009{
Victor Stinner9310abb2011-10-05 00:59:23 +020012010 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012011}
12012
12013PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012014 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012015\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012016Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012017the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012018separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012019
12020static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012021unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012022{
Victor Stinner9310abb2011-10-05 00:59:23 +020012023 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012024}
12025
Alexander Belopolsky40018472011-02-26 01:02:56 +000012026PyObject *
12027PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012028{
12029 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012030
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012031 s = PyUnicode_FromObject(s);
12032 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012033 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012034 if (sep != NULL) {
12035 sep = PyUnicode_FromObject(sep);
12036 if (sep == NULL) {
12037 Py_DECREF(s);
12038 return NULL;
12039 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012040 }
12041
Victor Stinner9310abb2011-10-05 00:59:23 +020012042 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012043
12044 Py_DECREF(s);
12045 Py_XDECREF(sep);
12046 return result;
12047}
12048
12049PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012050 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012051\n\
12052Return a list of the words in S, using sep as the\n\
12053delimiter string, starting at the end of the string and\n\
12054working to the front. If maxsplit is given, at most maxsplit\n\
12055splits are done. If sep is not specified, any whitespace string\n\
12056is a separator.");
12057
12058static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012059unicode_rsplit(PyObject *self, PyObject *args)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012060{
12061 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012062 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012063
Martin v. Löwis18e16552006-02-15 17:27:45 +000012064 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012065 return NULL;
12066
12067 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012068 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012069 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012070 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012071 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012072 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012073}
12074
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012075PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012076 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012077\n\
12078Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012079Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012080is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012081
12082static PyObject*
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012083unicode_splitlines(PyUnicodeObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012084{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012085 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012086 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012087
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012088 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12089 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012090 return NULL;
12091
Guido van Rossum86662912000-04-11 15:38:46 +000012092 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012093}
12094
12095static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012096PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012097{
Walter Dörwald346737f2007-05-31 10:44:43 +000012098 if (PyUnicode_CheckExact(self)) {
12099 Py_INCREF(self);
12100 return self;
12101 } else
12102 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinner034f6cf2011-09-30 02:26:44 +020012103 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012104}
12105
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012106PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012107 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012108\n\
12109Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012110and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012111
12112static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012113unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012114{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012115 return fixup(self, fixswapcase);
12116}
12117
Georg Brandlceee0772007-11-27 23:48:05 +000012118PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012119 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012120\n\
12121Return a translation table usable for str.translate().\n\
12122If there is only one argument, it must be a dictionary mapping Unicode\n\
12123ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012124Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012125If there are two arguments, they must be strings of equal length, and\n\
12126in the resulting dictionary, each character in x will be mapped to the\n\
12127character at the same position in y. If there is a third argument, it\n\
12128must be a string, whose characters will be mapped to None in the result.");
12129
12130static PyObject*
12131unicode_maketrans(PyUnicodeObject *null, PyObject *args)
12132{
12133 PyObject *x, *y = NULL, *z = NULL;
12134 PyObject *new = NULL, *key, *value;
12135 Py_ssize_t i = 0;
12136 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012137
Georg Brandlceee0772007-11-27 23:48:05 +000012138 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12139 return NULL;
12140 new = PyDict_New();
12141 if (!new)
12142 return NULL;
12143 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012144 int x_kind, y_kind, z_kind;
12145 void *x_data, *y_data, *z_data;
12146
Georg Brandlceee0772007-11-27 23:48:05 +000012147 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012148 if (!PyUnicode_Check(x)) {
12149 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12150 "be a string if there is a second argument");
12151 goto err;
12152 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012153 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012154 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12155 "arguments must have equal length");
12156 goto err;
12157 }
12158 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012159 x_kind = PyUnicode_KIND(x);
12160 y_kind = PyUnicode_KIND(y);
12161 x_data = PyUnicode_DATA(x);
12162 y_data = PyUnicode_DATA(y);
12163 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12164 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
12165 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012166 if (!key || !value)
12167 goto err;
12168 res = PyDict_SetItem(new, key, value);
12169 Py_DECREF(key);
12170 Py_DECREF(value);
12171 if (res < 0)
12172 goto err;
12173 }
12174 /* create entries for deleting chars in z */
12175 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012176 z_kind = PyUnicode_KIND(z);
12177 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012178 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012179 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012180 if (!key)
12181 goto err;
12182 res = PyDict_SetItem(new, key, Py_None);
12183 Py_DECREF(key);
12184 if (res < 0)
12185 goto err;
12186 }
12187 }
12188 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012189 int kind;
12190 void *data;
12191
Georg Brandlceee0772007-11-27 23:48:05 +000012192 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012193 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012194 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12195 "to maketrans it must be a dict");
12196 goto err;
12197 }
12198 /* copy entries into the new dict, converting string keys to int keys */
12199 while (PyDict_Next(x, &i, &key, &value)) {
12200 if (PyUnicode_Check(key)) {
12201 /* convert string keys to integer keys */
12202 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012203 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000012204 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12205 "table must be of length 1");
12206 goto err;
12207 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012208 kind = PyUnicode_KIND(key);
12209 data = PyUnicode_DATA(key);
12210 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012211 if (!newkey)
12212 goto err;
12213 res = PyDict_SetItem(new, newkey, value);
12214 Py_DECREF(newkey);
12215 if (res < 0)
12216 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012217 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012218 /* just keep integer keys */
12219 if (PyDict_SetItem(new, key, value) < 0)
12220 goto err;
12221 } else {
12222 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12223 "be strings or integers");
12224 goto err;
12225 }
12226 }
12227 }
12228 return new;
12229 err:
12230 Py_DECREF(new);
12231 return NULL;
12232}
12233
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012234PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012235 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012236\n\
12237Return a copy of the string S, where all characters have been mapped\n\
12238through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012239Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012240Unmapped characters are left untouched. Characters mapped to None\n\
12241are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012242
12243static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012244unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012245{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012246 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012247}
12248
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012249PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012250 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012251\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012252Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012253
12254static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012255unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012256{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012257 return fixup(self, fixupper);
12258}
12259
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012260PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012261 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012262\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012263Pad a numeric string S with zeros on the left, to fill a field\n\
12264of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012265
12266static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012267unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012268{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012269 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020012270 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012271 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012272 int kind;
12273 void *data;
12274 Py_UCS4 chr;
12275
12276 if (PyUnicode_READY(self) == -1)
12277 return NULL;
12278
Martin v. Löwis18e16552006-02-15 17:27:45 +000012279 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012280 return NULL;
12281
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012282 if (PyUnicode_GET_LENGTH(self) >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +000012283 if (PyUnicode_CheckExact(self)) {
12284 Py_INCREF(self);
12285 return (PyObject*) self;
12286 }
12287 else
Victor Stinner2219e0a2011-10-01 01:16:59 +020012288 return PyUnicode_Copy((PyObject*)self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012289 }
12290
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012291 fill = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012292
12293 u = pad(self, fill, 0, '0');
12294
Walter Dörwald068325e2002-04-15 13:36:47 +000012295 if (u == NULL)
12296 return NULL;
12297
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012298 kind = PyUnicode_KIND(u);
12299 data = PyUnicode_DATA(u);
12300 chr = PyUnicode_READ(kind, data, fill);
12301
12302 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012303 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012304 PyUnicode_WRITE(kind, data, 0, chr);
12305 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012306 }
12307
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012308 assert(_PyUnicode_CheckConsistency(u, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012309 return (PyObject*) u;
12310}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012311
#if 0
/* Compiled out.  Thin wrapper that returns the result of
   PyUnicode_TransformDecimalAndSpaceToASCII(self). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
12319
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012320PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012321 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012322\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012323Return True if S starts with the specified prefix, False otherwise.\n\
12324With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012325With optional end, stop comparing S at that position.\n\
12326prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012327
12328static PyObject *
12329unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012330 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012331{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012332 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012333 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012334 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012335 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012336 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012337
Jesus Ceaac451502011-04-20 17:09:23 +020012338 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012339 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012340 if (PyTuple_Check(subobj)) {
12341 Py_ssize_t i;
12342 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12343 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012344 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012345 if (substring == NULL)
12346 return NULL;
12347 result = tailmatch(self, substring, start, end, -1);
12348 Py_DECREF(substring);
12349 if (result) {
12350 Py_RETURN_TRUE;
12351 }
12352 }
12353 /* nothing matched */
12354 Py_RETURN_FALSE;
12355 }
12356 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012357 if (substring == NULL) {
12358 if (PyErr_ExceptionMatches(PyExc_TypeError))
12359 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12360 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012361 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012362 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012363 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012364 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012365 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012366}
12367
12368
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012369PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012370 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012371\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012372Return True if S ends with the specified suffix, False otherwise.\n\
12373With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012374With optional end, stop comparing S at that position.\n\
12375suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012376
12377static PyObject *
12378unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012379 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012380{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012381 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012382 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012383 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012384 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012385 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012386
Jesus Ceaac451502011-04-20 17:09:23 +020012387 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012388 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012389 if (PyTuple_Check(subobj)) {
12390 Py_ssize_t i;
12391 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12392 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012393 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012394 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012395 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012396 result = tailmatch(self, substring, start, end, +1);
12397 Py_DECREF(substring);
12398 if (result) {
12399 Py_RETURN_TRUE;
12400 }
12401 }
12402 Py_RETURN_FALSE;
12403 }
12404 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012405 if (substring == NULL) {
12406 if (PyErr_ExceptionMatches(PyExc_TypeError))
12407 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12408 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012409 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012410 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012411 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012412 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012413 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012414}
12415
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012416#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000012417
12418PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012419 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012420\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012421Return a formatted version of S, using substitutions from args and kwargs.\n\
12422The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000012423
Eric Smith27bbca62010-11-04 17:06:58 +000012424PyDoc_STRVAR(format_map__doc__,
12425 "S.format_map(mapping) -> str\n\
12426\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012427Return a formatted version of S, using substitutions from mapping.\n\
12428The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000012429
Eric Smith4a7d76d2008-05-30 18:10:19 +000012430static PyObject *
12431unicode__format__(PyObject* self, PyObject* args)
12432{
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012433 PyObject *format_spec, *out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012434
12435 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12436 return NULL;
12437
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012438 out = _PyUnicode_FormatAdvanced(self, format_spec, 0,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012439 PyUnicode_GET_LENGTH(format_spec));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012440 return out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012441}
12442
Eric Smith8c663262007-08-25 02:26:07 +000012443PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012444 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012445\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012446Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000012447
12448static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012449unicode__sizeof__(PyUnicodeObject *v)
12450{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012451 Py_ssize_t size;
12452
12453 /* If it's a compact object, account for base structure +
12454 character data. */
12455 if (PyUnicode_IS_COMPACT_ASCII(v))
12456 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12457 else if (PyUnicode_IS_COMPACT(v))
12458 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012459 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012460 else {
12461 /* If it is a two-block object, account for base object, and
12462 for character block if present. */
12463 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020012464 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012465 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012466 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012467 }
12468 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020012469 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020012470 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012471 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020012472 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020012473 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012474
12475 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012476}
12477
12478PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012479 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012480
12481static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020012482unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012483{
Victor Stinner034f6cf2011-09-30 02:26:44 +020012484 PyObject *copy = PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012485 if (!copy)
12486 return NULL;
12487 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012488}
12489
Guido van Rossumd57fd912000-03-10 22:53:23 +000012490static PyMethodDef unicode_methods[] = {
12491
12492 /* Order is according to common usage: often used methods should
12493 appear first, since lookup is done sequentially. */
12494
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000012495 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012496 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
12497 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012498 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012499 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
12500 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
12501 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
12502 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
12503 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
12504 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
12505 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012506 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012507 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
12508 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
12509 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012510 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012511 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
12512 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
12513 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012514 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012515 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012516 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012517 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012518 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
12519 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
12520 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
12521 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
12522 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
12523 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
12524 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
12525 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
12526 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
12527 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
12528 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
12529 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
12530 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
12531 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000012532 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000012533 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012534 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000012535 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000012536 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000012537 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000012538 {"maketrans", (PyCFunction) unicode_maketrans,
12539 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012540 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000012541#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012542 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012543#endif
12544
12545#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012546 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012547 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012548#endif
12549
Benjamin Peterson14339b62009-01-31 16:36:08 +000012550 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012551 {NULL, NULL}
12552};
12553
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012554static PyObject *
12555unicode_mod(PyObject *v, PyObject *w)
12556{
Brian Curtindfc80e32011-08-10 20:28:54 -050012557 if (!PyUnicode_Check(v))
12558 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000012559 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012560}
12561
12562static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012563 0, /*nb_add*/
12564 0, /*nb_subtract*/
12565 0, /*nb_multiply*/
12566 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012567};
12568
Guido van Rossumd57fd912000-03-10 22:53:23 +000012569static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012570 (lenfunc) unicode_length, /* sq_length */
12571 PyUnicode_Concat, /* sq_concat */
12572 (ssizeargfunc) unicode_repeat, /* sq_repeat */
12573 (ssizeargfunc) unicode_getitem, /* sq_item */
12574 0, /* sq_slice */
12575 0, /* sq_ass_item */
12576 0, /* sq_ass_slice */
12577 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012578};
12579
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012580static PyObject*
12581unicode_subscript(PyUnicodeObject* self, PyObject* item)
12582{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012583 if (PyUnicode_READY(self) == -1)
12584 return NULL;
12585
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000012586 if (PyIndex_Check(item)) {
12587 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012588 if (i == -1 && PyErr_Occurred())
12589 return NULL;
12590 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012591 i += PyUnicode_GET_LENGTH(self);
Victor Stinner2fe5ced2011-10-02 00:25:40 +020012592 return unicode_getitem((PyObject*)self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012593 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000012594 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012595 PyObject *result;
12596 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012597 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012598 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012599
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012600 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000012601 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012602 return NULL;
12603 }
12604
12605 if (slicelength <= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012606 return PyUnicode_New(0, 0);
12607 } else if (start == 0 && step == 1 &&
12608 slicelength == PyUnicode_GET_LENGTH(self) &&
Thomas Woutersed03b412007-08-28 21:37:11 +000012609 PyUnicode_CheckExact(self)) {
12610 Py_INCREF(self);
12611 return (PyObject *)self;
12612 } else if (step == 1) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012613 return PyUnicode_Substring((PyObject*)self,
12614 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012615 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012616 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012617 src_kind = PyUnicode_KIND(self);
12618 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020012619 if (!PyUnicode_IS_ASCII(self)) {
12620 kind_limit = kind_maxchar_limit(src_kind);
12621 max_char = 0;
12622 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
12623 ch = PyUnicode_READ(src_kind, src_data, cur);
12624 if (ch > max_char) {
12625 max_char = ch;
12626 if (max_char >= kind_limit)
12627 break;
12628 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012629 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012630 }
Victor Stinner55c99112011-10-13 01:17:06 +020012631 else
12632 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012633 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012634 if (result == NULL)
12635 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012636 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012637 dest_data = PyUnicode_DATA(result);
12638
12639 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012640 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
12641 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012642 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012643 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012644 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012645 } else {
12646 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
12647 return NULL;
12648 }
12649}
12650
12651static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012652 (lenfunc)unicode_length, /* mp_length */
12653 (binaryfunc)unicode_subscript, /* mp_subscript */
12654 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012655};
12656
Guido van Rossumd57fd912000-03-10 22:53:23 +000012657
Guido van Rossumd57fd912000-03-10 22:53:23 +000012658/* Helpers for PyUnicode_Format() */
12659
12660static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000012661getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012662{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012663 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012664 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012665 (*p_argidx)++;
12666 if (arglen < 0)
12667 return args;
12668 else
12669 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012670 }
12671 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012672 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012673 return NULL;
12674}
12675
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012676/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012677
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012678static PyObject *
12679formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012680{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012681 char *p;
12682 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012683 double x;
Tim Petersced69f82003-09-16 20:30:58 +000012684
Guido van Rossumd57fd912000-03-10 22:53:23 +000012685 x = PyFloat_AsDouble(v);
12686 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012687 return NULL;
12688
Guido van Rossumd57fd912000-03-10 22:53:23 +000012689 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012690 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000012691
Eric Smith0923d1d2009-04-16 20:16:10 +000012692 p = PyOS_double_to_string(x, type, prec,
12693 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012694 if (p == NULL)
12695 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012696 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000012697 PyMem_Free(p);
12698 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012699}
12700
Tim Peters38fd5b62000-09-21 05:43:11 +000012701static PyObject*
12702formatlong(PyObject *val, int flags, int prec, int type)
12703{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012704 char *buf;
12705 int len;
12706 PyObject *str; /* temporary string object. */
12707 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000012708
Benjamin Peterson14339b62009-01-31 16:36:08 +000012709 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
12710 if (!str)
12711 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012712 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012713 Py_DECREF(str);
12714 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000012715}
12716
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012717static Py_UCS4
12718formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012719{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000012720 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012721 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012722 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012723 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000012724 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012725 goto onError;
12726 }
12727 else {
12728 /* Integer input truncated to a character */
12729 long x;
12730 x = PyLong_AsLong(v);
12731 if (x == -1 && PyErr_Occurred())
12732 goto onError;
12733
12734 if (x < 0 || x > 0x10ffff) {
12735 PyErr_SetString(PyExc_OverflowError,
12736 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012737 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000012738 }
12739
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012740 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012741 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000012742
Benjamin Peterson29060642009-01-31 22:14:21 +000012743 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012744 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012745 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012746 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012747}
12748
Antoine Pitrou978b9d22011-10-07 12:35:48 +020012749static int
12750repeat_accumulate(_PyAccu *acc, PyObject *obj, Py_ssize_t count)
12751{
12752 int r;
12753 assert(count > 0);
12754 assert(PyUnicode_Check(obj));
12755 if (count > 5) {
12756 PyObject *repeated = unicode_repeat((PyUnicodeObject *) obj, count);
12757 if (repeated == NULL)
12758 return -1;
12759 r = _PyAccu_Accumulate(acc, repeated);
12760 Py_DECREF(repeated);
12761 return r;
12762 }
12763 else {
12764 do {
12765 if (_PyAccu_Accumulate(acc, obj))
12766 return -1;
12767 } while (--count);
12768 return 0;
12769 }
12770}
12771
Alexander Belopolsky40018472011-02-26 01:02:56 +000012772PyObject *
12773PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012774{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012775 void *fmt;
12776 int fmtkind;
12777 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012778 int kind;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012779 int r;
12780 Py_ssize_t fmtcnt, fmtpos, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012781 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012782 PyObject *dict = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012783 PyObject *temp = NULL;
12784 PyObject *second = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012785 PyUnicodeObject *uformat;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012786 _PyAccu acc;
12787 static PyObject *plus, *minus, *blank, *zero, *percent;
12788
12789 if (!plus && !(plus = get_latin1_char('+')))
12790 return NULL;
12791 if (!minus && !(minus = get_latin1_char('-')))
12792 return NULL;
12793 if (!blank && !(blank = get_latin1_char(' ')))
12794 return NULL;
12795 if (!zero && !(zero = get_latin1_char('0')))
12796 return NULL;
12797 if (!percent && !(percent = get_latin1_char('%')))
12798 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000012799
Guido van Rossumd57fd912000-03-10 22:53:23 +000012800 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012801 PyErr_BadInternalCall();
12802 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012803 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012804 uformat = (PyUnicodeObject*)PyUnicode_FromObject(format);
12805 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012806 return NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012807 if (_PyAccu_Init(&acc))
12808 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012809 fmt = PyUnicode_DATA(uformat);
12810 fmtkind = PyUnicode_KIND(uformat);
12811 fmtcnt = PyUnicode_GET_LENGTH(uformat);
12812 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012813
Guido van Rossumd57fd912000-03-10 22:53:23 +000012814 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012815 arglen = PyTuple_Size(args);
12816 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012817 }
12818 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012819 arglen = -1;
12820 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012821 }
Christian Heimes90aa7642007-12-19 02:45:37 +000012822 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000012823 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000012824 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012825
12826 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012827 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012828 PyObject *nonfmt;
12829 Py_ssize_t nonfmtpos;
12830 nonfmtpos = fmtpos++;
12831 while (fmtcnt >= 0 &&
12832 PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
12833 fmtpos++;
12834 fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012835 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012836 nonfmt = PyUnicode_Substring((PyObject *) uformat, nonfmtpos, fmtpos);
12837 if (nonfmt == NULL)
12838 goto onError;
12839 r = _PyAccu_Accumulate(&acc, nonfmt);
12840 Py_DECREF(nonfmt);
12841 if (r)
12842 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012843 }
12844 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012845 /* Got a format specifier */
12846 int flags = 0;
12847 Py_ssize_t width = -1;
12848 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012849 Py_UCS4 c = '\0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012850 Py_UCS4 fill, sign;
Benjamin Peterson29060642009-01-31 22:14:21 +000012851 int isnumok;
12852 PyObject *v = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012853 void *pbuf = NULL;
12854 Py_ssize_t pindex, len;
12855 PyObject *signobj = NULL, *fillobj = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012856
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012857 fmtpos++;
12858 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
12859 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000012860 Py_ssize_t keylen;
12861 PyObject *key;
12862 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000012863
Benjamin Peterson29060642009-01-31 22:14:21 +000012864 if (dict == NULL) {
12865 PyErr_SetString(PyExc_TypeError,
12866 "format requires a mapping");
12867 goto onError;
12868 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012869 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012870 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012871 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012872 /* Skip over balanced parentheses */
12873 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012874 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000012875 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012876 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000012877 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012878 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000012879 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012880 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000012881 if (fmtcnt < 0 || pcount > 0) {
12882 PyErr_SetString(PyExc_ValueError,
12883 "incomplete format key");
12884 goto onError;
12885 }
Victor Stinner12bab6d2011-10-01 01:53:49 +020012886 key = PyUnicode_Substring((PyObject*)uformat,
12887 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000012888 if (key == NULL)
12889 goto onError;
12890 if (args_owned) {
12891 Py_DECREF(args);
12892 args_owned = 0;
12893 }
12894 args = PyObject_GetItem(dict, key);
12895 Py_DECREF(key);
12896 if (args == NULL) {
12897 goto onError;
12898 }
12899 args_owned = 1;
12900 arglen = -1;
12901 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012902 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012903 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012904 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012905 case '-': flags |= F_LJUST; continue;
12906 case '+': flags |= F_SIGN; continue;
12907 case ' ': flags |= F_BLANK; continue;
12908 case '#': flags |= F_ALT; continue;
12909 case '0': flags |= F_ZERO; continue;
12910 }
12911 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012912 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012913 if (c == '*') {
12914 v = getnextarg(args, arglen, &argidx);
12915 if (v == NULL)
12916 goto onError;
12917 if (!PyLong_Check(v)) {
12918 PyErr_SetString(PyExc_TypeError,
12919 "* wants int");
12920 goto onError;
12921 }
12922 width = PyLong_AsLong(v);
12923 if (width == -1 && PyErr_Occurred())
12924 goto onError;
12925 if (width < 0) {
12926 flags |= F_LJUST;
12927 width = -width;
12928 }
12929 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012930 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012931 }
12932 else if (c >= '0' && c <= '9') {
12933 width = c - '0';
12934 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012935 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012936 if (c < '0' || c > '9')
12937 break;
12938 if ((width*10) / 10 != width) {
12939 PyErr_SetString(PyExc_ValueError,
12940 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000012941 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000012942 }
12943 width = width*10 + (c - '0');
12944 }
12945 }
12946 if (c == '.') {
12947 prec = 0;
12948 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012949 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012950 if (c == '*') {
12951 v = getnextarg(args, arglen, &argidx);
12952 if (v == NULL)
12953 goto onError;
12954 if (!PyLong_Check(v)) {
12955 PyErr_SetString(PyExc_TypeError,
12956 "* wants int");
12957 goto onError;
12958 }
12959 prec = PyLong_AsLong(v);
12960 if (prec == -1 && PyErr_Occurred())
12961 goto onError;
12962 if (prec < 0)
12963 prec = 0;
12964 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012965 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012966 }
12967 else if (c >= '0' && c <= '9') {
12968 prec = c - '0';
12969 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012970 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012971 if (c < '0' || c > '9')
12972 break;
12973 if ((prec*10) / 10 != prec) {
12974 PyErr_SetString(PyExc_ValueError,
12975 "prec too big");
12976 goto onError;
12977 }
12978 prec = prec*10 + (c - '0');
12979 }
12980 }
12981 } /* prec */
12982 if (fmtcnt >= 0) {
12983 if (c == 'h' || c == 'l' || c == 'L') {
12984 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012985 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012986 }
12987 }
12988 if (fmtcnt < 0) {
12989 PyErr_SetString(PyExc_ValueError,
12990 "incomplete format");
12991 goto onError;
12992 }
12993 if (c != '%') {
12994 v = getnextarg(args, arglen, &argidx);
12995 if (v == NULL)
12996 goto onError;
12997 }
12998 sign = 0;
12999 fill = ' ';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013000 fillobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013001 switch (c) {
13002
13003 case '%':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013004 _PyAccu_Accumulate(&acc, percent);
13005 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +000013006
13007 case 's':
13008 case 'r':
13009 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000013010 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000013011 temp = v;
13012 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013013 }
13014 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013015 if (c == 's')
13016 temp = PyObject_Str(v);
13017 else if (c == 'r')
13018 temp = PyObject_Repr(v);
13019 else
13020 temp = PyObject_ASCII(v);
13021 if (temp == NULL)
13022 goto onError;
13023 if (PyUnicode_Check(temp))
13024 /* nothing to do */;
13025 else {
13026 Py_DECREF(temp);
13027 PyErr_SetString(PyExc_TypeError,
13028 "%s argument has non-string str()");
13029 goto onError;
13030 }
13031 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013032 if (PyUnicode_READY(temp) == -1) {
13033 Py_CLEAR(temp);
13034 goto onError;
13035 }
13036 pbuf = PyUnicode_DATA(temp);
13037 kind = PyUnicode_KIND(temp);
13038 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013039 if (prec >= 0 && len > prec)
13040 len = prec;
13041 break;
13042
13043 case 'i':
13044 case 'd':
13045 case 'u':
13046 case 'o':
13047 case 'x':
13048 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000013049 isnumok = 0;
13050 if (PyNumber_Check(v)) {
13051 PyObject *iobj=NULL;
13052
13053 if (PyLong_Check(v)) {
13054 iobj = v;
13055 Py_INCREF(iobj);
13056 }
13057 else {
13058 iobj = PyNumber_Long(v);
13059 }
13060 if (iobj!=NULL) {
13061 if (PyLong_Check(iobj)) {
13062 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070013063 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000013064 Py_DECREF(iobj);
13065 if (!temp)
13066 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013067 if (PyUnicode_READY(temp) == -1) {
13068 Py_CLEAR(temp);
13069 goto onError;
13070 }
13071 pbuf = PyUnicode_DATA(temp);
13072 kind = PyUnicode_KIND(temp);
13073 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013074 sign = 1;
13075 }
13076 else {
13077 Py_DECREF(iobj);
13078 }
13079 }
13080 }
13081 if (!isnumok) {
13082 PyErr_Format(PyExc_TypeError,
13083 "%%%c format: a number is required, "
13084 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
13085 goto onError;
13086 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013087 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013088 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013089 fillobj = zero;
13090 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013091 break;
13092
13093 case 'e':
13094 case 'E':
13095 case 'f':
13096 case 'F':
13097 case 'g':
13098 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013099 temp = formatfloat(v, flags, prec, c);
13100 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000013101 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013102 if (PyUnicode_READY(temp) == -1) {
13103 Py_CLEAR(temp);
13104 goto onError;
13105 }
13106 pbuf = PyUnicode_DATA(temp);
13107 kind = PyUnicode_KIND(temp);
13108 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013109 sign = 1;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013110 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013111 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013112 fillobj = zero;
13113 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013114 break;
13115
13116 case 'c':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013117 {
13118 Py_UCS4 ch = formatchar(v);
13119 if (ch == (Py_UCS4) -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013120 goto onError;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013121 temp = _PyUnicode_FromUCS4(&ch, 1);
13122 if (temp == NULL)
13123 goto onError;
13124 pbuf = PyUnicode_DATA(temp);
13125 kind = PyUnicode_KIND(temp);
13126 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013127 break;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013128 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013129
13130 default:
13131 PyErr_Format(PyExc_ValueError,
13132 "unsupported format character '%c' (0x%x) "
13133 "at index %zd",
13134 (31<=c && c<=126) ? (char)c : '?',
13135 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013136 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000013137 goto onError;
13138 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013139 /* pbuf is initialized here. */
13140 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000013141 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013142 if (PyUnicode_READ(kind, pbuf, pindex) == '-') {
13143 signobj = minus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013144 len--;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013145 pindex++;
13146 }
13147 else if (PyUnicode_READ(kind, pbuf, pindex) == '+') {
13148 signobj = plus;
13149 len--;
13150 pindex++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013151 }
13152 else if (flags & F_SIGN)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013153 signobj = plus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013154 else if (flags & F_BLANK)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013155 signobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013156 else
13157 sign = 0;
13158 }
13159 if (width < len)
13160 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013161 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013162 if (fill != ' ') {
13163 assert(signobj != NULL);
13164 if (_PyAccu_Accumulate(&acc, signobj))
13165 goto onError;
13166 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013167 if (width > len)
13168 width--;
13169 }
13170 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013171 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013172 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000013173 if (fill != ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013174 second = get_latin1_char(
13175 PyUnicode_READ(kind, pbuf, pindex + 1));
13176 pindex += 2;
13177 if (second == NULL ||
13178 _PyAccu_Accumulate(&acc, zero) ||
13179 _PyAccu_Accumulate(&acc, second))
13180 goto onError;
13181 Py_CLEAR(second);
Benjamin Peterson29060642009-01-31 22:14:21 +000013182 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013183 width -= 2;
13184 if (width < 0)
13185 width = 0;
13186 len -= 2;
13187 }
13188 if (width > len && !(flags & F_LJUST)) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013189 assert(fillobj != NULL);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013190 if (repeat_accumulate(&acc, fillobj, width - len))
13191 goto onError;
13192 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013193 }
13194 if (fill == ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013195 if (sign) {
13196 assert(signobj != NULL);
13197 if (_PyAccu_Accumulate(&acc, signobj))
13198 goto onError;
13199 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013200 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013201 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13202 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013203 second = get_latin1_char(
13204 PyUnicode_READ(kind, pbuf, pindex + 1));
13205 pindex += 2;
13206 if (second == NULL ||
13207 _PyAccu_Accumulate(&acc, zero) ||
13208 _PyAccu_Accumulate(&acc, second))
13209 goto onError;
13210 Py_CLEAR(second);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013211 }
13212 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013213 /* Copy all characters, preserving len */
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013214 if (temp != NULL) {
13215 assert(pbuf == PyUnicode_DATA(temp));
13216 v = PyUnicode_Substring(temp, pindex, pindex + len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013217 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013218 else {
13219 const char *p = (const char *) pbuf;
13220 assert(pbuf != NULL);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013221 p += kind * pindex;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013222 v = PyUnicode_FromKindAndData(kind, p, len);
13223 }
13224 if (v == NULL)
13225 goto onError;
13226 r = _PyAccu_Accumulate(&acc, v);
13227 Py_DECREF(v);
13228 if (r)
13229 goto onError;
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013230 if (width > len && repeat_accumulate(&acc, blank, width - len))
13231 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013232 if (dict && (argidx < arglen) && c != '%') {
13233 PyErr_SetString(PyExc_TypeError,
13234 "not all arguments converted during string formatting");
Benjamin Peterson29060642009-01-31 22:14:21 +000013235 goto onError;
13236 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013237 Py_CLEAR(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013238 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013239 } /* until end */
13240 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013241 PyErr_SetString(PyExc_TypeError,
13242 "not all arguments converted during string formatting");
13243 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013244 }
13245
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013246 result = _PyAccu_Finish(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013247 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013248 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013249 }
13250 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013251 Py_XDECREF(temp);
13252 Py_XDECREF(second);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013253 return (PyObject *)result;
13254
Benjamin Peterson29060642009-01-31 22:14:21 +000013255 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013256 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013257 Py_XDECREF(temp);
13258 Py_XDECREF(second);
13259 _PyAccu_Destroy(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013260 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013261 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013262 }
13263 return NULL;
13264}
13265
Jeremy Hylton938ace62002-07-17 16:30:39 +000013266static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000013267unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
13268
Tim Peters6d6c1a32001-08-02 04:15:00 +000013269static PyObject *
13270unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13271{
Benjamin Peterson29060642009-01-31 22:14:21 +000013272 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013273 static char *kwlist[] = {"object", "encoding", "errors", 0};
13274 char *encoding = NULL;
13275 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000013276
Benjamin Peterson14339b62009-01-31 16:36:08 +000013277 if (type != &PyUnicode_Type)
13278 return unicode_subtype_new(type, args, kwds);
13279 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000013280 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013281 return NULL;
13282 if (x == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013283 return (PyObject *)PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013284 if (encoding == NULL && errors == NULL)
13285 return PyObject_Str(x);
13286 else
Benjamin Peterson29060642009-01-31 22:14:21 +000013287 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000013288}
13289
Guido van Rossume023fe02001-08-30 03:12:59 +000013290static PyObject *
13291unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13292{
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013293 PyUnicodeObject *unicode, *self;
13294 Py_ssize_t length, char_size;
13295 int share_wstr, share_utf8;
13296 unsigned int kind;
13297 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000013298
Benjamin Peterson14339b62009-01-31 16:36:08 +000013299 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013300
13301 unicode = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
13302 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013303 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013304 assert(_PyUnicode_CHECK(unicode));
Victor Stinnere06e1452011-10-04 20:52:31 +020013305 if (PyUnicode_READY(unicode))
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013306 return NULL;
13307
13308 self = (PyUnicodeObject *) type->tp_alloc(type, 0);
13309 if (self == NULL) {
13310 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013311 return NULL;
13312 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013313 kind = PyUnicode_KIND(unicode);
13314 length = PyUnicode_GET_LENGTH(unicode);
13315
13316 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013317#ifdef Py_DEBUG
13318 _PyUnicode_HASH(self) = -1;
13319#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013320 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013321#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013322 _PyUnicode_STATE(self).interned = 0;
13323 _PyUnicode_STATE(self).kind = kind;
13324 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020013325 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013326 _PyUnicode_STATE(self).ready = 1;
13327 _PyUnicode_WSTR(self) = NULL;
13328 _PyUnicode_UTF8_LENGTH(self) = 0;
13329 _PyUnicode_UTF8(self) = NULL;
13330 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020013331 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013332
13333 share_utf8 = 0;
13334 share_wstr = 0;
13335 if (kind == PyUnicode_1BYTE_KIND) {
13336 char_size = 1;
13337 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
13338 share_utf8 = 1;
13339 }
13340 else if (kind == PyUnicode_2BYTE_KIND) {
13341 char_size = 2;
13342 if (sizeof(wchar_t) == 2)
13343 share_wstr = 1;
13344 }
13345 else {
13346 assert(kind == PyUnicode_4BYTE_KIND);
13347 char_size = 4;
13348 if (sizeof(wchar_t) == 4)
13349 share_wstr = 1;
13350 }
13351
13352 /* Ensure we won't overflow the length. */
13353 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
13354 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013355 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013356 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013357 data = PyObject_MALLOC((length + 1) * char_size);
13358 if (data == NULL) {
13359 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013360 goto onError;
13361 }
13362
Victor Stinnerc3c74152011-10-02 20:39:55 +020013363 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013364 if (share_utf8) {
13365 _PyUnicode_UTF8_LENGTH(self) = length;
13366 _PyUnicode_UTF8(self) = data;
13367 }
13368 if (share_wstr) {
13369 _PyUnicode_WSTR_LENGTH(self) = length;
13370 _PyUnicode_WSTR(self) = (wchar_t *)data;
13371 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013372
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013373 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013374 kind * (length + 1));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013375 Py_DECREF(unicode);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013376 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013377#ifdef Py_DEBUG
13378 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13379#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013380 return (PyObject *)self;
13381
13382onError:
13383 Py_DECREF(unicode);
13384 Py_DECREF(self);
13385 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000013386}
13387
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013388PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000013389 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000013390\n\
Collin Winterd474ce82007-08-07 19:42:11 +000013391Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000013392encoding defaults to the current default string encoding.\n\
13393errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000013394
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013395static PyObject *unicode_iter(PyObject *seq);
13396
Guido van Rossumd57fd912000-03-10 22:53:23 +000013397PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000013398 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013399 "str", /* tp_name */
13400 sizeof(PyUnicodeObject), /* tp_size */
13401 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013402 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013403 (destructor)unicode_dealloc, /* tp_dealloc */
13404 0, /* tp_print */
13405 0, /* tp_getattr */
13406 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000013407 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013408 unicode_repr, /* tp_repr */
13409 &unicode_as_number, /* tp_as_number */
13410 &unicode_as_sequence, /* tp_as_sequence */
13411 &unicode_as_mapping, /* tp_as_mapping */
13412 (hashfunc) unicode_hash, /* tp_hash*/
13413 0, /* tp_call*/
13414 (reprfunc) unicode_str, /* tp_str */
13415 PyObject_GenericGetAttr, /* tp_getattro */
13416 0, /* tp_setattro */
13417 0, /* tp_as_buffer */
13418 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000013419 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013420 unicode_doc, /* tp_doc */
13421 0, /* tp_traverse */
13422 0, /* tp_clear */
13423 PyUnicode_RichCompare, /* tp_richcompare */
13424 0, /* tp_weaklistoffset */
13425 unicode_iter, /* tp_iter */
13426 0, /* tp_iternext */
13427 unicode_methods, /* tp_methods */
13428 0, /* tp_members */
13429 0, /* tp_getset */
13430 &PyBaseObject_Type, /* tp_base */
13431 0, /* tp_dict */
13432 0, /* tp_descr_get */
13433 0, /* tp_descr_set */
13434 0, /* tp_dictoffset */
13435 0, /* tp_init */
13436 0, /* tp_alloc */
13437 unicode_new, /* tp_new */
13438 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013439};
13440
13441/* Initialize the Unicode implementation */
13442
Thomas Wouters78890102000-07-22 19:25:51 +000013443void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013444{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013445 int i;
13446
Thomas Wouters477c8d52006-05-27 19:21:47 +000013447 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013448 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000013449 0x000A, /* LINE FEED */
13450 0x000D, /* CARRIAGE RETURN */
13451 0x001C, /* FILE SEPARATOR */
13452 0x001D, /* GROUP SEPARATOR */
13453 0x001E, /* RECORD SEPARATOR */
13454 0x0085, /* NEXT LINE */
13455 0x2028, /* LINE SEPARATOR */
13456 0x2029, /* PARAGRAPH SEPARATOR */
13457 };
13458
Fred Drakee4315f52000-05-09 19:53:39 +000013459 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020013460 unicode_empty = PyUnicode_New(0, 0);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013461 assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013462 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013463 Py_FatalError("Can't create empty string");
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013464
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013465 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000013466 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000013467 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013468 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000013469
13470 /* initialize the linebreak bloom filter */
13471 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013472 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020013473 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013474
13475 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013476}
13477
13478/* Finalize the Unicode implementation */
13479
/* Nothing is released here: the function unconditionally reports 0.
   NOTE(review): the return value is presumably the number of freed
   items, as for other *_ClearFreeList functions -- confirm. */
int
PyUnicode_ClearFreeList(void)
{
    const int freed = 0;

    return freed;
}
13485
Guido van Rossumd57fd912000-03-10 22:53:23 +000013486void
Thomas Wouters78890102000-07-22 19:25:51 +000013487_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013488{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013489 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013490
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000013491 Py_XDECREF(unicode_empty);
13492 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000013493
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013494 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013495 if (unicode_latin1[i]) {
13496 Py_DECREF(unicode_latin1[i]);
13497 unicode_latin1[i] = NULL;
13498 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013499 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020013500 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000013501 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000013502}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000013503
Walter Dörwald16807132007-05-25 13:52:07 +000013504void
13505PyUnicode_InternInPlace(PyObject **p)
13506{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013507 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
13508 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020013509#ifdef Py_DEBUG
13510 assert(s != NULL);
13511 assert(_PyUnicode_CHECK(s));
13512#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000013513 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020013514 return;
13515#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000013516 /* If it's a subclass, we don't really know what putting
13517 it in the interned dict might do. */
13518 if (!PyUnicode_CheckExact(s))
13519 return;
13520 if (PyUnicode_CHECK_INTERNED(s))
13521 return;
Victor Stinner1b4f9ce2011-10-03 13:28:14 +020013522 if (_PyUnicode_READY_REPLACE(p)) {
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013523 assert(0 && "_PyUnicode_READY_REPLACE fail in PyUnicode_InternInPlace");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013524 return;
13525 }
Victor Stinner1b4f9ce2011-10-03 13:28:14 +020013526 s = (PyUnicodeObject *)(*p);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013527 if (interned == NULL) {
13528 interned = PyDict_New();
13529 if (interned == NULL) {
13530 PyErr_Clear(); /* Don't leave an exception */
13531 return;
13532 }
13533 }
13534 /* It might be that the GetItem call fails even
13535 though the key is present in the dictionary,
13536 namely when this happens during a stack overflow. */
13537 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +000013538 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013539 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000013540
Benjamin Peterson29060642009-01-31 22:14:21 +000013541 if (t) {
13542 Py_INCREF(t);
13543 Py_DECREF(*p);
13544 *p = t;
13545 return;
13546 }
Walter Dörwald16807132007-05-25 13:52:07 +000013547
Benjamin Peterson14339b62009-01-31 16:36:08 +000013548 PyThreadState_GET()->recursion_critical = 1;
13549 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
13550 PyErr_Clear();
13551 PyThreadState_GET()->recursion_critical = 0;
13552 return;
13553 }
13554 PyThreadState_GET()->recursion_critical = 0;
13555 /* The two references in interned are not counted by refcnt.
13556 The deallocator will take care of this */
13557 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013558 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000013559}
13560
13561void
13562PyUnicode_InternImmortal(PyObject **p)
13563{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013564 PyUnicodeObject *u = (PyUnicodeObject *)*p;
13565
Benjamin Peterson14339b62009-01-31 16:36:08 +000013566 PyUnicode_InternInPlace(p);
13567 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013568 _PyUnicode_STATE(u).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013569 Py_INCREF(*p);
13570 }
Walter Dörwald16807132007-05-25 13:52:07 +000013571}
13572
13573PyObject *
13574PyUnicode_InternFromString(const char *cp)
13575{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013576 PyObject *s = PyUnicode_FromString(cp);
13577 if (s == NULL)
13578 return NULL;
13579 PyUnicode_InternInPlace(&s);
13580 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000013581}
13582
Alexander Belopolsky40018472011-02-26 01:02:56 +000013583void
13584_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000013585{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013586 PyObject *keys;
13587 PyUnicodeObject *s;
13588 Py_ssize_t i, n;
13589 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000013590
Benjamin Peterson14339b62009-01-31 16:36:08 +000013591 if (interned == NULL || !PyDict_Check(interned))
13592 return;
13593 keys = PyDict_Keys(interned);
13594 if (keys == NULL || !PyList_Check(keys)) {
13595 PyErr_Clear();
13596 return;
13597 }
Walter Dörwald16807132007-05-25 13:52:07 +000013598
Benjamin Peterson14339b62009-01-31 16:36:08 +000013599 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
13600 detector, interned unicode strings are not forcibly deallocated;
13601 rather, we give them their stolen references back, and then clear
13602 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000013603
Benjamin Peterson14339b62009-01-31 16:36:08 +000013604 n = PyList_GET_SIZE(keys);
13605 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000013606 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013607 for (i = 0; i < n; i++) {
13608 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013609 if (PyUnicode_READY(s) == -1) {
13610 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013611 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013612 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013613 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013614 case SSTATE_NOT_INTERNED:
13615 /* XXX Shouldn't happen */
13616 break;
13617 case SSTATE_INTERNED_IMMORTAL:
13618 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013619 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013620 break;
13621 case SSTATE_INTERNED_MORTAL:
13622 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013623 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013624 break;
13625 default:
13626 Py_FatalError("Inconsistent interned string state.");
13627 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013628 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013629 }
13630 fprintf(stderr, "total size of all interned strings: "
13631 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
13632 "mortal/immortal\n", mortal_size, immortal_size);
13633 Py_DECREF(keys);
13634 PyDict_Clear(interned);
13635 Py_DECREF(interned);
13636 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000013637}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013638
13639
13640/********************* Unicode Iterator **************************/
13641
13642typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013643 PyObject_HEAD
13644 Py_ssize_t it_index;
13645 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013646} unicodeiterobject;
13647
13648static void
13649unicodeiter_dealloc(unicodeiterobject *it)
13650{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013651 _PyObject_GC_UNTRACK(it);
13652 Py_XDECREF(it->it_seq);
13653 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013654}
13655
13656static int
13657unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
13658{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013659 Py_VISIT(it->it_seq);
13660 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013661}
13662
13663static PyObject *
13664unicodeiter_next(unicodeiterobject *it)
13665{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013666 PyUnicodeObject *seq;
13667 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013668
Benjamin Peterson14339b62009-01-31 16:36:08 +000013669 assert(it != NULL);
13670 seq = it->it_seq;
13671 if (seq == NULL)
13672 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013673 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013674
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013675 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
13676 int kind = PyUnicode_KIND(seq);
13677 void *data = PyUnicode_DATA(seq);
13678 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
13679 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013680 if (item != NULL)
13681 ++it->it_index;
13682 return item;
13683 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013684
Benjamin Peterson14339b62009-01-31 16:36:08 +000013685 Py_DECREF(seq);
13686 it->it_seq = NULL;
13687 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013688}
13689
13690static PyObject *
13691unicodeiter_len(unicodeiterobject *it)
13692{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013693 Py_ssize_t len = 0;
13694 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013695 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013696 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013697}
13698
13699PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
13700
13701static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013702 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000013703 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000013704 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013705};
13706
13707PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013708 PyVarObject_HEAD_INIT(&PyType_Type, 0)
13709 "str_iterator", /* tp_name */
13710 sizeof(unicodeiterobject), /* tp_basicsize */
13711 0, /* tp_itemsize */
13712 /* methods */
13713 (destructor)unicodeiter_dealloc, /* tp_dealloc */
13714 0, /* tp_print */
13715 0, /* tp_getattr */
13716 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000013717 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013718 0, /* tp_repr */
13719 0, /* tp_as_number */
13720 0, /* tp_as_sequence */
13721 0, /* tp_as_mapping */
13722 0, /* tp_hash */
13723 0, /* tp_call */
13724 0, /* tp_str */
13725 PyObject_GenericGetAttr, /* tp_getattro */
13726 0, /* tp_setattro */
13727 0, /* tp_as_buffer */
13728 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
13729 0, /* tp_doc */
13730 (traverseproc)unicodeiter_traverse, /* tp_traverse */
13731 0, /* tp_clear */
13732 0, /* tp_richcompare */
13733 0, /* tp_weaklistoffset */
13734 PyObject_SelfIter, /* tp_iter */
13735 (iternextfunc)unicodeiter_next, /* tp_iternext */
13736 unicodeiter_methods, /* tp_methods */
13737 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013738};
13739
13740static PyObject *
13741unicode_iter(PyObject *seq)
13742{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013743 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013744
Benjamin Peterson14339b62009-01-31 16:36:08 +000013745 if (!PyUnicode_Check(seq)) {
13746 PyErr_BadInternalCall();
13747 return NULL;
13748 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013749 if (PyUnicode_READY(seq) == -1)
13750 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013751 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
13752 if (it == NULL)
13753 return NULL;
13754 it->it_index = 0;
13755 Py_INCREF(seq);
13756 it->it_seq = (PyUnicodeObject *)seq;
13757 _PyObject_GC_TRACK(it);
13758 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013759}
13760
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013761#define UNIOP(x) Py_UNICODE_##x
13762#define UNIOP_t Py_UNICODE
13763#include "uniops.h"
13764#undef UNIOP
13765#undef UNIOP_t
13766#define UNIOP(x) Py_UCS4_##x
13767#define UNIOP_t Py_UCS4
13768#include "uniops.h"
13769#undef UNIOP
13770#undef UNIOP_t
Victor Stinner331ea922010-08-10 16:37:20 +000013771
Victor Stinner71133ff2010-09-01 23:43:53 +000013772Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000013773PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000013774{
13775 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
Victor Stinner577db2c2011-10-11 22:12:48 +020013776 Py_UNICODE *u, *copy;
Victor Stinner71133ff2010-09-01 23:43:53 +000013777 Py_ssize_t size;
13778
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013779 if (!PyUnicode_Check(unicode)) {
13780 PyErr_BadArgument();
13781 return NULL;
13782 }
Victor Stinner577db2c2011-10-11 22:12:48 +020013783 u = PyUnicode_AsUnicode(object);
13784 if (u == NULL)
13785 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000013786 /* Ensure we won't overflow the size. */
13787 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
13788 PyErr_NoMemory();
13789 return NULL;
13790 }
13791 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
13792 size *= sizeof(Py_UNICODE);
13793 copy = PyMem_Malloc(size);
13794 if (copy == NULL) {
13795 PyErr_NoMemory();
13796 return NULL;
13797 }
Victor Stinner577db2c2011-10-11 22:12:48 +020013798 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000013799 return copy;
13800}
Martin v. Löwis5b222132007-06-10 09:51:05 +000013801
Georg Brandl66c221e2010-10-14 07:04:07 +000013802/* A _string module, to export formatter_parser and formatter_field_name_split
13803 to the string.Formatter class implemented in Python. */
13804
13805static PyMethodDef _string_methods[] = {
13806 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
13807 METH_O, PyDoc_STR("split the argument as a field name")},
13808 {"formatter_parser", (PyCFunction) formatter_parser,
13809 METH_O, PyDoc_STR("parse the argument as a format string")},
13810 {NULL, NULL}
13811};
13812
13813static struct PyModuleDef _string_module = {
13814 PyModuleDef_HEAD_INIT,
13815 "_string",
13816 PyDoc_STR("string helper module"),
13817 0,
13818 _string_methods,
13819 NULL,
13820 NULL,
13821 NULL,
13822 NULL
13823};
13824
13825PyMODINIT_FUNC
13826PyInit__string(void)
13827{
13828 return PyModule_Create(&_string_module);
13829}
13830
13831
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013832#ifdef __cplusplus
13833}
13834#endif