blob: e199a11f2d03c1b018bc2c63fbb193e44185ac9d [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000044
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000045#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000046#include <windows.h>
47#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000048
Victor Stinnerce5faf62011-10-05 00:42:43 +020049#ifdef Py_DEBUG
50# define DONT_MAKE_RESULT_READY
51#endif
52
/* Upper bound on the number of Unicode objects kept on the free list. */
#define PyUnicode_MAXFREELIST 1024

/* Size limit for the free list "stay alive" optimization.

   Objects on the free list whose size is below this limit keep their
   allocated Unicode memory intact, which reduces malloc() overhead for
   small Unicode objects.

   In the worst case this leaves

       PyUnicode_MAXFREELIST *
           (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT + malloc()-overhead)

   bytes of unused garbage around.

   A limit of 0 effectively turns the feature off.

   Note: this feature is experimental.  If you get core dumps when using
   Unicode objects, turn it off. */
#define KEEPALIVE_SIZE_LIMIT 9

/* Endianness switches; little endian is the default. */
#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
83
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000084/* --- Globals ------------------------------------------------------------
85
86 The globals are initialized by the _PyUnicode_Init() API and should
87 not be used before calling that API.
88
89*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
/* Type check used by the accessor macros below.  Debug builds run the
   full structural consistency check (without scanning the content);
   release builds only test the type. */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +0200101
/* Field accessors for the PEP 393 string layouts.

   The underscore-prefixed variants that expand to a plain member access
   are usable as lvalues and perform no checking.  The variants that start
   with assert() validate the object first (in builds with assertions
   enabled) and are rvalue-only. */

/* UTF-8 cache: pointer and length */
#define _PyUnicode_UTF8(op)                                 \
    (((PyCompactUnicodeObject*)(op))->utf8)
#define PyUnicode_UTF8(op)                                  \
    (assert(_PyUnicode_CHECK(op)),                          \
     assert(PyUnicode_IS_READY(op)),                        \
     PyUnicode_IS_COMPACT_ASCII(op) ?                       \
         ((char*)((PyASCIIObject*)(op) + 1)) :              \
         _PyUnicode_UTF8(op))
#define _PyUnicode_UTF8_LENGTH(op)                          \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
#define PyUnicode_UTF8_LENGTH(op)                           \
    (assert(_PyUnicode_CHECK(op)),                          \
     assert(PyUnicode_IS_READY(op)),                        \
     PyUnicode_IS_COMPACT_ASCII(op) ?                       \
         ((PyASCIIObject*)(op))->length :                   \
         _PyUnicode_UTF8_LENGTH(op))

/* wchar_t representation: pointer and length */
#define _PyUnicode_WSTR(op)                                 \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                          \
    (((PyCompactUnicodeObject*)(op))->wstr_length)

/* Members of the common PyASCIIObject header */
#define _PyUnicode_LENGTH(op)                               \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                                \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                                 \
    (((PyASCIIObject *)(op))->hash)
#define _PyUnicode_KIND(op)                                 \
    (assert(_PyUnicode_CHECK(op)),                          \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                           \
    (assert(_PyUnicode_CHECK(op)),                          \
     ((PyASCIIObject *)(op))->length)

/* Character buffer pointer of a non-compact (PyUnicodeObject) string */
#define _PyUnicode_DATA_ANY(op)                             \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200136
/* Local override of PyUnicode_READY(): evaluates to 0 when `op` is already
   in the ready state, otherwise to the result of _PyUnicode_Ready(). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                        \
    (assert(_PyUnicode_CHECK(op)),                 \
     (PyUnicode_IS_READY(op) ?                     \
      0 :                                          \
      _PyUnicode_Ready((PyObject *)(op))))

/* Same as PyUnicode_READY(), but takes the address of the object pointer
   and hands it to _PyUnicode_ReadyReplace() when the object is not ready.
   The parameter is parenthesized so that an expression argument such as
   `items + 1` dereferences the intended slot. */
#define _PyUnicode_READY_REPLACE(p_obj)            \
    (assert(_PyUnicode_CHECK(*(p_obj))),           \
     (PyUnicode_IS_READY(*(p_obj)) ?               \
      0 : _PyUnicode_ReadyReplace((PyObject **)(p_obj))))
148
/* true if the UTF-8 pointer of `op` aliases its character data
   (must not be used on compact ASCII strings) */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* true if the wstr pointer of `op` aliases its character data.
   Uses the macro parameter throughout; it previously referenced a
   variable named `unicode` from the caller's scope. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
156
/* true if the Unicode object owns a separately allocated UTF-8 block,
   i.e. it is not compact ASCII, the UTF-8 pointer is set, and that pointer
   is not the character data itself */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                          \
    (assert(_PyUnicode_CHECK(op)),                              \
     (!PyUnicode_IS_COMPACT_ASCII(op) &&                        \
      _PyUnicode_UTF8(op) &&                                    \
      _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* true if the Unicode object owns a separately allocated wstr block,
   i.e. the wstr pointer is set and the object is either not ready yet or
   its wstr pointer is not the character data itself */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                          \
    (assert(_PyUnicode_CHECK(op)),                              \
     (_PyUnicode_WSTR(op) &&                                    \
      !(PyUnicode_IS_READY(op) &&                               \
        _PyUnicode_WSTR(op) == PyUnicode_DATA(op))))
172
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   The copy is done four elements at a time for the bulk of the input,
   followed by an element-wise loop for the remaining 0-3 elements.
   Every argument is evaluated exactly once; `to` is parenthesized before
   the cast so that an expression argument keeps its own pointer
   arithmetic. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + (_n & ~ (Py_ssize_t) 3);            \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200196
/* The Unicode string has been modified: invalidate the cached hash by
   resetting it to -1 */
#define _PyUnicode_DIRTY(op)                    \
    do {                                        \
        _PyUnicode_HASH(op) = -1;               \
    } while (0)
199
Walter Dörwald16807132007-05-25 13:52:07 +0000200/* This dictionary holds all interned unicode strings. Note that references
201 to strings in this dictionary are *not* counted in the string's ob_refcnt.
202 When the interned string reaches a refcnt of 0 the string deallocation
203 function will delete the reference from this dictionary.
204
205 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000206 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000207*/
208static PyObject *interned;
209
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000210/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200211static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000212
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200213/* List of static strings. */
214static _Py_Identifier *static_strings;
215
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000216/* Single character Unicode strings in the Latin-1 range are being
217 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200218static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000219
/* Fast detection of the most frequent whitespace characters: a 128-entry
   table indexed by ASCII code, 1 for whitespace and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
       0x0B LINE TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
250
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200251/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200252static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200253static PyObject* get_latin1_char(unsigned char ch);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200254static void copy_characters(
255 PyObject *to, Py_ssize_t to_start,
256 PyObject *from, Py_ssize_t from_start,
257 Py_ssize_t how_many);
Victor Stinnerc729b8e2011-10-06 02:36:59 +0200258#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200259static int unicode_is_singleton(PyObject *unicode);
Victor Stinnerc729b8e2011-10-06 02:36:59 +0200260#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200261
Alexander Belopolsky40018472011-02-26 01:02:56 +0000262static PyObject *
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200263unicode_fromascii(const unsigned char *s, Py_ssize_t size);
264static PyObject *
265_PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size);
266static PyObject *
267_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
268static PyObject *
269_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
270
271static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000272unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000273 PyObject **errorHandler,const char *encoding, const char *reason,
274 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
275 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
276
Alexander Belopolsky40018472011-02-26 01:02:56 +0000277static void
278raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300279 const char *encoding,
280 const Py_UNICODE *unicode, Py_ssize_t size,
281 Py_ssize_t startpos, Py_ssize_t endpos,
282 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000283
/* Same for linebreaks: a 128-entry table indexed by ASCII code, 1 for line
   break characters and 0 otherwise.  The table is only ever read, so it is
   declared const. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F: 0x0A LINE FEED, 0x0B LINE TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
311
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300312/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
313 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000314Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000315PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000316{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000317#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000318 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000319#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000320 /* This is actually an illegal character, so it should
321 not be passed to unichr. */
322 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000323#endif
324}
325
#ifdef Py_DEBUG
/* Debug-build structural check of a Unicode object.

   Asserts that the state flags, the kind and the data/utf8/wstr pointers
   of `op` agree with each other for the layout the flags describe.  When
   `check_content` is non-zero, additionally scans the characters to assert
   that the narrowest possible kind is in use, and asserts that the hash of
   a non-singleton string is still unset (-1).

   Violations trigger assert(); otherwise the function returns 1, so it can
   itself be wrapped in assert().

   FIXME: use PyObject* type for op */
int
_PyUnicode_CheckConsistency(void *op, int check_content)
{
    PyASCIIObject *base;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    base = (PyASCIIObject *)op;
    kind = base->state.kind;

    if (base->state.compact == 1 && base->state.ascii == 1) {
        /* compact ASCII string */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(base->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *cu = (PyCompactUnicodeObject *)op;
        void *chars;
        /* the kind whose character buffer doubles as the wstr buffer */
#if SIZEOF_WCHAR_T == 2
        const unsigned int wstr_shared_kind = PyUnicode_2BYTE_KIND;
#else
        const unsigned int wstr_shared_kind = PyUnicode_4BYTE_KIND;
#endif

        if (base->state.compact != 1) {
            /* non-compact string: data lives behind a pointer */
            chars = ((PyUnicodeObject *)op)->data.any;
            if (kind != PyUnicode_WCHAR_KIND) {
                /* ready non-compact string */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(base->state.compact == 0);
                assert(base->state.ready == 1);
                assert(chars != NULL);
                if (!base->state.ascii)
                    assert (cu->utf8 != chars);
                else {
                    /* ASCII data is its own UTF-8 form */
                    assert (cu->utf8 == chars);
                    assert (cu->utf8_length == base->length);
                }
            }
            else {
                /* not ready yet: only the wstr buffer exists */
                assert(base->state.compact == 0);
                assert(base->state.ascii == 0);
                assert(base->state.ready == 0);
                assert(base->wstr != NULL);
                assert(chars == NULL);
                assert(cu->utf8 == NULL);
                assert(base->state.interned == SSTATE_NOT_INTERNED);
            }
        }
        else {
            /* compact non-ASCII string: data follows the struct */
            chars = cu + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(base->state.ascii == 0);
            assert(base->state.ready == 1);
            assert (cu->utf8 != chars);
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            if (kind != wstr_shared_kind)
                assert(base->wstr != chars);
            else {
                assert(base->wstr == chars);
                assert(cu->wstr_length == base->length);
            }
        }

        /* an absent buffer must come with a zero length */
        if (cu->utf8 == NULL)
            assert(cu->utf8_length == 0);
        if (base->wstr == NULL)
            assert(cu->wstr_length == 0);
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        Py_UCS4 maxchar = 0;
        void *content = PyUnicode_DATA(base);
        Py_ssize_t pos = 0;

        while (pos < base->length) {
            Py_UCS4 ch = PyUnicode_READ(kind, content, pos);
            if (ch > maxchar)
                maxchar = ch;
            pos++;
        }

        switch (kind) {
        case PyUnicode_1BYTE_KIND:
            if (base->state.ascii == 0)
                assert(maxchar >= 128);
            else
                assert(maxchar < 128);
            break;
        case PyUnicode_2BYTE_KIND:
            assert(maxchar >= 0x100);
            break;
        default:
            assert(maxchar >= 0x10000);
            break;
        }
    }
    if (check_content && !unicode_is_singleton((PyObject*)base))
        assert(base->hash == -1);
    return 1;
}
#endif
431
Thomas Wouters477c8d52006-05-27 19:21:47 +0000432/* --- Bloom Filters ----------------------------------------------------- */
433
434/* stuff to implement simple "bloom filters" for Unicode characters.
435 to keep things simple, we use a single bitmask, using the least 5
436 bits from each unicode characters as the bit index. */
437
438/* the linebreak mask is set up by Unicode_Init below */
439
Antoine Pitrouf068f942010-01-13 14:19:12 +0000440#if LONG_BIT >= 128
441#define BLOOM_WIDTH 128
442#elif LONG_BIT >= 64
443#define BLOOM_WIDTH 64
444#elif LONG_BIT >= 32
445#define BLOOM_WIDTH 32
446#else
447#error "LONG_BIT is smaller than 32"
448#endif
449
/* Integer type holding a bloom bitmask */
#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* BLOOM_ADD sets, and BLOOM tests, the bit selected by the low bits of ch
   (ch modulo BLOOM_WIDTH).  BLOOM yields non-zero if the bit is set.
   `mask` is parenthesized so that expression arguments such as `a | b`
   are combined before the bit test. */
#define BLOOM_ADD(mask, ch) ((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))
#define BLOOM(mask, ch)     ((mask) &  (1UL << ((ch) & (BLOOM_WIDTH - 1))))

/* Line break test: table lookup for characters below 128; for all others
   the bloom mask is consulted first, then Py_UNICODE_ISLINEBREAK(). */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000460
Alexander Belopolsky40018472011-02-26 01:02:56 +0000461Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200462make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000463{
464 /* calculate simple bloom-style bitmask for a given unicode string */
465
Antoine Pitrouf068f942010-01-13 14:19:12 +0000466 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000467 Py_ssize_t i;
468
469 mask = 0;
470 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200471 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000472
473 return mask;
474}
475
/* true if chr occurs in str: the bloom mask is tested first, and only on
   a hit is the string actually searched */
#define BLOOM_MEMBER(mask, chr, str)                                    \
    (BLOOM(mask, chr) &&                                                \
     (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000479
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200480/* Compilation of templated routines */
481
482#include "stringlib/asciilib.h"
483#include "stringlib/fastsearch.h"
484#include "stringlib/partition.h"
485#include "stringlib/split.h"
486#include "stringlib/count.h"
487#include "stringlib/find.h"
488#include "stringlib/find_max_char.h"
489#include "stringlib/localeutil.h"
490#include "stringlib/undef.h"
491
492#include "stringlib/ucs1lib.h"
493#include "stringlib/fastsearch.h"
494#include "stringlib/partition.h"
495#include "stringlib/split.h"
496#include "stringlib/count.h"
497#include "stringlib/find.h"
498#include "stringlib/find_max_char.h"
499#include "stringlib/localeutil.h"
500#include "stringlib/undef.h"
501
502#include "stringlib/ucs2lib.h"
503#include "stringlib/fastsearch.h"
504#include "stringlib/partition.h"
505#include "stringlib/split.h"
506#include "stringlib/count.h"
507#include "stringlib/find.h"
508#include "stringlib/find_max_char.h"
509#include "stringlib/localeutil.h"
510#include "stringlib/undef.h"
511
512#include "stringlib/ucs4lib.h"
513#include "stringlib/fastsearch.h"
514#include "stringlib/partition.h"
515#include "stringlib/split.h"
516#include "stringlib/count.h"
517#include "stringlib/find.h"
518#include "stringlib/find_max_char.h"
519#include "stringlib/localeutil.h"
520#include "stringlib/undef.h"
521
Guido van Rossumd57fd912000-03-10 22:53:23 +0000522/* --- Unicode Object ----------------------------------------------------- */
523
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200524static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200525fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200526
527Py_LOCAL_INLINE(char *) findchar(void *s, int kind,
528 Py_ssize_t size, Py_UCS4 ch,
529 int direction)
530{
531 /* like wcschr, but doesn't stop at NULL characters */
532 Py_ssize_t i;
533 if (direction == 1) {
534 for(i = 0; i < size; i++)
535 if (PyUnicode_READ(kind, s, i) == ch)
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200536 return (char*)s + kind * i;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200537 }
538 else {
539 for(i = size-1; i >= 0; i--)
540 if (PyUnicode_READ(kind, s, i) == ch)
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200541 return (char*)s + kind * i;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200542 }
543 return NULL;
544}
545
Victor Stinnerfe226c02011-10-03 03:52:20 +0200546static PyObject*
547resize_compact(PyObject *unicode, Py_ssize_t length)
548{
549 Py_ssize_t char_size;
550 Py_ssize_t struct_size;
551 Py_ssize_t new_size;
552 int share_wstr;
553
554 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200555 char_size = PyUnicode_KIND(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200556 if (PyUnicode_IS_COMPACT_ASCII(unicode))
557 struct_size = sizeof(PyASCIIObject);
558 else
559 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200560 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200561
562 _Py_DEC_REFTOTAL;
563 _Py_ForgetReference(unicode);
564
565 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
566 PyErr_NoMemory();
567 return NULL;
568 }
569 new_size = (struct_size + (length + 1) * char_size);
570
571 unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
572 if (unicode == NULL) {
573 PyObject_Del(unicode);
574 PyErr_NoMemory();
575 return NULL;
576 }
577 _Py_NewReference(unicode);
578 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200579 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200580 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200581 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
582 _PyUnicode_WSTR_LENGTH(unicode) = length;
583 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200584 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
585 length, 0);
586 return unicode;
587}
588
Alexander Belopolsky40018472011-02-26 01:02:56 +0000589static int
Victor Stinner95663112011-10-04 01:03:50 +0200590resize_inplace(PyUnicodeObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000591{
Victor Stinner95663112011-10-04 01:03:50 +0200592 wchar_t *wstr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200593 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200594 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000595
Victor Stinner95663112011-10-04 01:03:50 +0200596 _PyUnicode_DIRTY(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200597
598 if (PyUnicode_IS_READY(unicode)) {
599 Py_ssize_t char_size;
600 Py_ssize_t new_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200601 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200602 void *data;
603
604 data = _PyUnicode_DATA_ANY(unicode);
605 assert(data != NULL);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200606 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200607 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
608 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinner95663112011-10-04 01:03:50 +0200609 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
610 {
611 PyObject_DEL(_PyUnicode_UTF8(unicode));
612 _PyUnicode_UTF8(unicode) = NULL;
613 _PyUnicode_UTF8_LENGTH(unicode) = 0;
614 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200615
616 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
617 PyErr_NoMemory();
618 return -1;
619 }
620 new_size = (length + 1) * char_size;
621
622 data = (PyObject *)PyObject_REALLOC(data, new_size);
623 if (data == NULL) {
624 PyErr_NoMemory();
625 return -1;
626 }
627 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200628 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200629 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200630 _PyUnicode_WSTR_LENGTH(unicode) = length;
631 }
632 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200633 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200634 _PyUnicode_UTF8_LENGTH(unicode) = length;
635 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200636 _PyUnicode_LENGTH(unicode) = length;
637 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinner95663112011-10-04 01:03:50 +0200638 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200639 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200640 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200641 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200642 }
Victor Stinner95663112011-10-04 01:03:50 +0200643 assert(_PyUnicode_WSTR(unicode) != NULL);
644
645 /* check for integer overflow */
646 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
647 PyErr_NoMemory();
648 return -1;
649 }
650 wstr = _PyUnicode_WSTR(unicode);
651 wstr = PyObject_REALLOC(wstr, sizeof(wchar_t) * (length + 1));
652 if (!wstr) {
653 PyErr_NoMemory();
654 return -1;
655 }
656 _PyUnicode_WSTR(unicode) = wstr;
657 _PyUnicode_WSTR(unicode)[length] = 0;
658 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200659 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000660 return 0;
661}
662
Victor Stinnerfe226c02011-10-03 03:52:20 +0200663static PyObject*
664resize_copy(PyObject *unicode, Py_ssize_t length)
665{
666 Py_ssize_t copy_length;
667 if (PyUnicode_IS_COMPACT(unicode)) {
668 PyObject *copy;
669 assert(PyUnicode_IS_READY(unicode));
670
671 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
672 if (copy == NULL)
673 return NULL;
674
675 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200676 copy_characters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200677 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200678 }
679 else {
Victor Stinner2fd82272011-10-03 04:06:05 +0200680 PyUnicodeObject *w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200681 assert(_PyUnicode_WSTR(unicode) != NULL);
682 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinner2fd82272011-10-03 04:06:05 +0200683 w = _PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200684 if (w == NULL)
685 return NULL;
686 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
687 copy_length = Py_MIN(copy_length, length);
688 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
689 copy_length);
690 return (PyObject*)w;
691 }
692}
693
Guido van Rossumd57fd912000-03-10 22:53:23 +0000694/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000695 Ux0000 terminated; some code (e.g. new_identifier)
696 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000697
698 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000699 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000700
701*/
702
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200703#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +0200704static int unicode_old_new_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200705#endif
706
Alexander Belopolsky40018472011-02-26 01:02:56 +0000707static PyUnicodeObject *
708_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000709{
710 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200711 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000712
Thomas Wouters477c8d52006-05-27 19:21:47 +0000713 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000714 if (length == 0 && unicode_empty != NULL) {
715 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200716 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000717 }
718
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000719 /* Ensure we won't overflow the size. */
720 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
721 return (PyUnicodeObject *)PyErr_NoMemory();
722 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200723 if (length < 0) {
724 PyErr_SetString(PyExc_SystemError,
725 "Negative size passed to _PyUnicode_New");
726 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000727 }
728
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200729#ifdef Py_DEBUG
730 ++unicode_old_new_calls;
731#endif
732
733 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
734 if (unicode == NULL)
735 return NULL;
736 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
737 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
738 if (!_PyUnicode_WSTR(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000739 PyErr_NoMemory();
740 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000741 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200742
Jeremy Hyltond8082792003-09-16 19:41:39 +0000743 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000744 * the caller fails before initializing str -- unicode_resize()
745 * reads str[0], and the Keep-Alive optimization can keep memory
746 * allocated for str alive across a call to unicode_dealloc(unicode).
747 * We don't want unicode_resize to read uninitialized memory in
748 * that case.
749 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200750 _PyUnicode_WSTR(unicode)[0] = 0;
751 _PyUnicode_WSTR(unicode)[length] = 0;
752 _PyUnicode_WSTR_LENGTH(unicode) = length;
753 _PyUnicode_HASH(unicode) = -1;
754 _PyUnicode_STATE(unicode).interned = 0;
755 _PyUnicode_STATE(unicode).kind = 0;
756 _PyUnicode_STATE(unicode).compact = 0;
757 _PyUnicode_STATE(unicode).ready = 0;
758 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200759 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200760 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200761 _PyUnicode_UTF8(unicode) = NULL;
762 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000763 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000764
Benjamin Peterson29060642009-01-31 22:14:21 +0000765 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000766 /* XXX UNREF/NEWREF interface should be more symmetrical */
767 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000768 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000769 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000770 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000771}
772
Victor Stinnerf42dc442011-10-02 23:33:16 +0200773static const char*
774unicode_kind_name(PyObject *unicode)
775{
Victor Stinner42dfd712011-10-03 14:41:45 +0200776 /* don't check consistency: unicode_kind_name() is called from
777 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200778 if (!PyUnicode_IS_COMPACT(unicode))
779 {
780 if (!PyUnicode_IS_READY(unicode))
781 return "wstr";
782 switch(PyUnicode_KIND(unicode))
783 {
784 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200785 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200786 return "legacy ascii";
787 else
788 return "legacy latin1";
789 case PyUnicode_2BYTE_KIND:
790 return "legacy UCS2";
791 case PyUnicode_4BYTE_KIND:
792 return "legacy UCS4";
793 default:
794 return "<legacy invalid kind>";
795 }
796 }
797 assert(PyUnicode_IS_READY(unicode));
798 switch(PyUnicode_KIND(unicode))
799 {
800 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200801 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200802 return "ascii";
803 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200804 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200805 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200806 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200807 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200808 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200809 default:
810 return "<invalid compact kind>";
811 }
812}
813
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200814#ifdef Py_DEBUG
/* Debug-build statistic: incremented by PyUnicode_New() on every call that
   does not take the empty-string shortcut. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +0200815static int unicode_new_new_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200816
817/* Functions wrapping macros for use in debugger */
/* Function form of the PyUnicode_UTF8() macro, callable from a debugger. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
821
/* Function form of the _PyUnicode_COMPACT_DATA() macro, callable from a
   debugger. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
825void *_PyUnicode_data(void *unicode){
826 printf("obj %p\n", unicode);
827 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
828 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
829 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
830 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
831 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
832 return PyUnicode_DATA(unicode);
833}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200834
835void
836_PyUnicode_Dump(PyObject *op)
837{
838 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200839 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
840 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
841 void *data;
842 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
843 if (ascii->state.compact)
844 data = (compact + 1);
845 else
846 data = unicode->data.any;
847 if (ascii->wstr == data)
848 printf("shared ");
849 printf("wstr=%p", ascii->wstr);
Victor Stinnera3b334d2011-10-03 13:53:37 +0200850 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +0200851 printf(" (%zu), ", compact->wstr_length);
852 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
853 printf("shared ");
854 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200855 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200856 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200857}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200858#endif
859
860PyObject *
861PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
862{
863 PyObject *obj;
864 PyCompactUnicodeObject *unicode;
865 void *data;
866 int kind_state;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200867 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200868 Py_ssize_t char_size;
869 Py_ssize_t struct_size;
870
871 /* Optimization for empty strings */
872 if (size == 0 && unicode_empty != NULL) {
873 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200874 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200875 }
876
877#ifdef Py_DEBUG
878 ++unicode_new_new_calls;
879#endif
880
Victor Stinner9e9d6892011-10-04 01:02:02 +0200881 is_ascii = 0;
882 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200883 struct_size = sizeof(PyCompactUnicodeObject);
884 if (maxchar < 128) {
885 kind_state = PyUnicode_1BYTE_KIND;
886 char_size = 1;
887 is_ascii = 1;
888 struct_size = sizeof(PyASCIIObject);
889 }
890 else if (maxchar < 256) {
891 kind_state = PyUnicode_1BYTE_KIND;
892 char_size = 1;
893 }
894 else if (maxchar < 65536) {
895 kind_state = PyUnicode_2BYTE_KIND;
896 char_size = 2;
897 if (sizeof(wchar_t) == 2)
898 is_sharing = 1;
899 }
900 else {
901 kind_state = PyUnicode_4BYTE_KIND;
902 char_size = 4;
903 if (sizeof(wchar_t) == 4)
904 is_sharing = 1;
905 }
906
907 /* Ensure we won't overflow the size. */
908 if (size < 0) {
909 PyErr_SetString(PyExc_SystemError,
910 "Negative size passed to PyUnicode_New");
911 return NULL;
912 }
913 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
914 return PyErr_NoMemory();
915
916 /* Duplicated allocation code from _PyObject_New() instead of a call to
917 * PyObject_New() so we are able to allocate space for the object and
918 * it's data buffer.
919 */
920 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
921 if (obj == NULL)
922 return PyErr_NoMemory();
923 obj = PyObject_INIT(obj, &PyUnicode_Type);
924 if (obj == NULL)
925 return NULL;
926
927 unicode = (PyCompactUnicodeObject *)obj;
928 if (is_ascii)
929 data = ((PyASCIIObject*)obj) + 1;
930 else
931 data = unicode + 1;
932 _PyUnicode_LENGTH(unicode) = size;
933 _PyUnicode_HASH(unicode) = -1;
934 _PyUnicode_STATE(unicode).interned = 0;
935 _PyUnicode_STATE(unicode).kind = kind_state;
936 _PyUnicode_STATE(unicode).compact = 1;
937 _PyUnicode_STATE(unicode).ready = 1;
938 _PyUnicode_STATE(unicode).ascii = is_ascii;
939 if (is_ascii) {
940 ((char*)data)[size] = 0;
941 _PyUnicode_WSTR(unicode) = NULL;
942 }
943 else if (kind_state == PyUnicode_1BYTE_KIND) {
944 ((char*)data)[size] = 0;
945 _PyUnicode_WSTR(unicode) = NULL;
946 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200947 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200948 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200949 }
950 else {
951 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200952 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200953 if (kind_state == PyUnicode_2BYTE_KIND)
954 ((Py_UCS2*)data)[size] = 0;
955 else /* kind_state == PyUnicode_4BYTE_KIND */
956 ((Py_UCS4*)data)[size] = 0;
957 if (is_sharing) {
958 _PyUnicode_WSTR_LENGTH(unicode) = size;
959 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
960 }
961 else {
962 _PyUnicode_WSTR_LENGTH(unicode) = 0;
963 _PyUnicode_WSTR(unicode) = NULL;
964 }
965 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200966 assert(_PyUnicode_CheckConsistency(unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200967 return obj;
968}
969
#if SIZEOF_WCHAR_T == 2
/* Helper function to convert a 16-bit wchar_t representation to UCS4: each
   surrogate pair is decoded into a single code point, every other unit
   (including lone surrogates) is copied as is.  The other conversions are
   implemented as macros for efficiency.

   `unicode` must already be a 4-byte kind string whose length equals the
   number of code points that [begin, end) decodes to. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyUnicodeObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (src[0] >= 0xD800 && src[0] <= 0xDBFF
            && (src+1) < end && src[1] >= 0xDC00 && src[1] <= 0xDFFF)
        {
            /* high surrogate followed by a low surrogate */
            *dst++ = (((src[0] & 0x3FF)<<10) | (src[1] & 0x3FF)) + 0x10000;
            src += 2;
        }
        else {
            *dst++ = *src++;
        }
    }
    /* The output must have been filled exactly. */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1008
Victor Stinnercd9950f2011-10-02 00:34:53 +02001009static int
1010_PyUnicode_Dirty(PyObject *unicode)
1011{
Victor Stinner910337b2011-10-03 03:20:16 +02001012 assert(_PyUnicode_CHECK(unicode));
Victor Stinnercd9950f2011-10-02 00:34:53 +02001013 if (Py_REFCNT(unicode) != 1) {
Victor Stinner01698042011-10-04 00:04:26 +02001014 PyErr_SetString(PyExc_SystemError,
Victor Stinnercd9950f2011-10-02 00:34:53 +02001015 "Cannot modify a string having more than 1 reference");
1016 return -1;
1017 }
1018 _PyUnicode_DIRTY(unicode);
1019 return 0;
1020}
1021
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001022static int
1023_copy_characters(PyObject *to, Py_ssize_t to_start,
1024 PyObject *from, Py_ssize_t from_start,
1025 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001026{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001027 unsigned int from_kind, to_kind;
1028 void *from_data, *to_data;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001029 int fast;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001030
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001031 assert(PyUnicode_Check(from));
1032 assert(PyUnicode_Check(to));
1033 assert(PyUnicode_IS_READY(from));
1034 assert(PyUnicode_IS_READY(to));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001035
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001036 assert(PyUnicode_GET_LENGTH(from) >= how_many);
1037 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1038 assert(0 <= how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001039
Victor Stinnerf5ca1a22011-09-28 23:54:59 +02001040 if (how_many == 0)
1041 return 0;
1042
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001043 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001044 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001045 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001046 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001047
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001048#ifdef Py_DEBUG
1049 if (!check_maxchar
1050 && (from_kind > to_kind
1051 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001052 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001053 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1054 Py_UCS4 ch;
1055 Py_ssize_t i;
1056 for (i=0; i < how_many; i++) {
1057 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1058 assert(ch <= to_maxchar);
1059 }
1060 }
1061#endif
1062 fast = (from_kind == to_kind);
1063 if (check_maxchar
1064 && (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
1065 {
1066 /* deny latin1 => ascii */
1067 fast = 0;
1068 }
1069
1070 if (fast) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001071 Py_MEMCPY((char*)to_data + to_kind * to_start,
1072 (char*)from_data + from_kind * from_start,
1073 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001074 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001075 else if (from_kind == PyUnicode_1BYTE_KIND
1076 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001077 {
1078 _PyUnicode_CONVERT_BYTES(
1079 Py_UCS1, Py_UCS2,
1080 PyUnicode_1BYTE_DATA(from) + from_start,
1081 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1082 PyUnicode_2BYTE_DATA(to) + to_start
1083 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001084 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001085 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001086 && to_kind == PyUnicode_4BYTE_KIND)
1087 {
1088 _PyUnicode_CONVERT_BYTES(
1089 Py_UCS1, Py_UCS4,
1090 PyUnicode_1BYTE_DATA(from) + from_start,
1091 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1092 PyUnicode_4BYTE_DATA(to) + to_start
1093 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001094 }
1095 else if (from_kind == PyUnicode_2BYTE_KIND
1096 && to_kind == PyUnicode_4BYTE_KIND)
1097 {
1098 _PyUnicode_CONVERT_BYTES(
1099 Py_UCS2, Py_UCS4,
1100 PyUnicode_2BYTE_DATA(from) + from_start,
1101 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1102 PyUnicode_4BYTE_DATA(to) + to_start
1103 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001104 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001105 else {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001106 /* check if max_char(from substring) <= max_char(to) */
1107 if (from_kind > to_kind
1108 /* latin1 => ascii */
Victor Stinnerb9275c12011-10-05 14:01:42 +02001109 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001110 {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001111 /* slow path to check for character overflow */
1112 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001113 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001114 Py_ssize_t i;
1115
Victor Stinner56c161a2011-10-06 02:47:11 +02001116#ifdef Py_DEBUG
Victor Stinnera0702ab2011-09-29 14:14:38 +02001117 for (i=0; i < how_many; i++) {
1118 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinner56c161a2011-10-06 02:47:11 +02001119 assert(ch <= to_maxchar);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001120 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1121 }
Victor Stinner56c161a2011-10-06 02:47:11 +02001122#else
1123 if (!check_maxchar) {
1124 for (i=0; i < how_many; i++) {
1125 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1126 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1127 }
1128 }
1129 else {
1130 for (i=0; i < how_many; i++) {
1131 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1132 if (ch > to_maxchar)
1133 return 1;
1134 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1135 }
1136 }
1137#endif
Victor Stinnera0702ab2011-09-29 14:14:38 +02001138 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001139 else {
Victor Stinner56c161a2011-10-06 02:47:11 +02001140 assert(0 && "inconsistent state");
1141 return 1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001142 }
1143 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001144 return 0;
1145}
1146
1147static void
1148copy_characters(PyObject *to, Py_ssize_t to_start,
1149 PyObject *from, Py_ssize_t from_start,
1150 Py_ssize_t how_many)
1151{
1152 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1153}
1154
1155Py_ssize_t
1156PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1157 PyObject *from, Py_ssize_t from_start,
1158 Py_ssize_t how_many)
1159{
1160 int err;
1161
1162 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1163 PyErr_BadInternalCall();
1164 return -1;
1165 }
1166
1167 if (PyUnicode_READY(from))
1168 return -1;
1169 if (PyUnicode_READY(to))
1170 return -1;
1171
1172 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1173 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1174 PyErr_Format(PyExc_SystemError,
1175 "Cannot write %zi characters at %zi "
1176 "in a string of %zi characters",
1177 how_many, to_start, PyUnicode_GET_LENGTH(to));
1178 return -1;
1179 }
1180
1181 if (how_many == 0)
1182 return 0;
1183
1184 if (_PyUnicode_Dirty(to))
1185 return -1;
1186
1187 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1188 if (err) {
1189 PyErr_Format(PyExc_SystemError,
1190 "Cannot copy %s characters "
1191 "into a string of %s characters",
1192 unicode_kind_name(from),
1193 unicode_kind_name(to));
1194 return -1;
1195 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001196 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001197}
1198
Victor Stinner17222162011-09-28 22:15:37 +02001199/* Find the maximum code point and count the number of surrogate pairs so a
1200 correct string length can be computed before converting a string to UCS4.
1201 This function counts single surrogates as a character and not as a pair.
1202
1203 Return 0 on success, or -1 on error. */
1204static int
1205find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1206 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001207{
1208 const wchar_t *iter;
1209
Victor Stinnerc53be962011-10-02 21:33:54 +02001210 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001211 *num_surrogates = 0;
1212 *maxchar = 0;
1213
1214 for (iter = begin; iter < end; ) {
Victor Stinnerae864852011-10-05 14:02:44 +02001215 if (*iter > *maxchar) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001216 *maxchar = *iter;
Victor Stinnerae864852011-10-05 14:02:44 +02001217#if SIZEOF_WCHAR_T != 2
1218 if (*maxchar >= 0x10000)
1219 return 0;
1220#endif
1221 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001222#if SIZEOF_WCHAR_T == 2
1223 if (*iter >= 0xD800 && *iter <= 0xDBFF
1224 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
1225 {
1226 Py_UCS4 surrogate_val;
1227 surrogate_val = (((iter[0] & 0x3FF)<<10)
1228 | (iter[1] & 0x3FF)) + 0x10000;
1229 ++(*num_surrogates);
1230 if (surrogate_val > *maxchar)
1231 *maxchar = surrogate_val;
1232 iter += 2;
1233 }
1234 else
1235 iter++;
1236#else
1237 iter++;
1238#endif
1239 }
1240 return 0;
1241}
1242
1243#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +02001244static int unicode_ready_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001245#endif
1246
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001247static int
1248unicode_ready(PyObject **p_obj, int replace)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001249{
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001250 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001251 wchar_t *end;
1252 Py_UCS4 maxchar = 0;
1253 Py_ssize_t num_surrogates;
1254#if SIZEOF_WCHAR_T == 2
1255 Py_ssize_t length_wo_surrogates;
1256#endif
1257
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001258 assert(p_obj != NULL);
1259 unicode = (PyUnicodeObject *)*p_obj;
1260
Georg Brandl7597add2011-10-05 16:36:47 +02001261 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001262 strings were created using _PyObject_New() and where no canonical
1263 representation (the str field) has been set yet aka strings
1264 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001265 assert(_PyUnicode_CHECK(unicode));
1266 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001267 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001268 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001269 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001270 /* Actually, it should neither be interned nor be anything else: */
1271 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001272
1273#ifdef Py_DEBUG
1274 ++unicode_ready_calls;
1275#endif
1276
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001277#ifdef Py_DEBUG
1278 assert(!replace || Py_REFCNT(unicode) == 1);
1279#else
1280 if (replace && Py_REFCNT(unicode) != 1)
1281 replace = 0;
1282#endif
1283 if (replace) {
1284 Py_ssize_t len = _PyUnicode_WSTR_LENGTH(unicode);
1285 wchar_t *wstr = _PyUnicode_WSTR(unicode);
1286 /* Optimization for empty strings */
1287 if (len == 0) {
1288 Py_INCREF(unicode_empty);
1289 Py_DECREF(*p_obj);
1290 *p_obj = unicode_empty;
1291 return 0;
1292 }
1293 if (len == 1 && wstr[0] < 256) {
1294 PyObject *latin1_char = get_latin1_char((unsigned char)wstr[0]);
1295 if (latin1_char == NULL)
1296 return -1;
1297 Py_DECREF(*p_obj);
1298 *p_obj = latin1_char;
1299 return 0;
1300 }
1301 }
1302
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001303 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001304 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001305 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001306 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001307
1308 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001309 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1310 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001311 PyErr_NoMemory();
1312 return -1;
1313 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001314 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001315 _PyUnicode_WSTR(unicode), end,
1316 PyUnicode_1BYTE_DATA(unicode));
1317 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1318 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1319 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1320 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001321 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001322 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001323 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001324 }
1325 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001326 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001327 _PyUnicode_UTF8(unicode) = NULL;
1328 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001329 }
1330 PyObject_FREE(_PyUnicode_WSTR(unicode));
1331 _PyUnicode_WSTR(unicode) = NULL;
1332 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1333 }
1334 /* In this case we might have to convert down from 4-byte native
1335 wchar_t to 2-byte unicode. */
1336 else if (maxchar < 65536) {
1337 assert(num_surrogates == 0 &&
1338 "FindMaxCharAndNumSurrogatePairs() messed up");
1339
Victor Stinner506f5922011-09-28 22:34:18 +02001340#if SIZEOF_WCHAR_T == 2
1341 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001342 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001343 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1344 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1345 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001346 _PyUnicode_UTF8(unicode) = NULL;
1347 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001348#else
1349 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001350 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001351 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001352 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001353 PyErr_NoMemory();
1354 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001355 }
Victor Stinner506f5922011-09-28 22:34:18 +02001356 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1357 _PyUnicode_WSTR(unicode), end,
1358 PyUnicode_2BYTE_DATA(unicode));
1359 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1360 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1361 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001362 _PyUnicode_UTF8(unicode) = NULL;
1363 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001364 PyObject_FREE(_PyUnicode_WSTR(unicode));
1365 _PyUnicode_WSTR(unicode) = NULL;
1366 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1367#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001368 }
1369 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1370 else {
1371#if SIZEOF_WCHAR_T == 2
1372 /* in case the native representation is 2-bytes, we need to allocate a
1373 new normalized 4-byte version. */
1374 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001375 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1376 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001377 PyErr_NoMemory();
1378 return -1;
1379 }
1380 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1381 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001382 _PyUnicode_UTF8(unicode) = NULL;
1383 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001384 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1385 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001386 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001387 PyObject_FREE(_PyUnicode_WSTR(unicode));
1388 _PyUnicode_WSTR(unicode) = NULL;
1389 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1390#else
1391 assert(num_surrogates == 0);
1392
Victor Stinnerc3c74152011-10-02 20:39:55 +02001393 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001394 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001395 _PyUnicode_UTF8(unicode) = NULL;
1396 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001397 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1398#endif
1399 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1400 }
1401 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001402 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001403 return 0;
1404}
1405
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001406int
1407_PyUnicode_ReadyReplace(PyObject **op)
1408{
1409 return unicode_ready(op, 1);
1410}
1411
1412int
1413_PyUnicode_Ready(PyObject *op)
1414{
1415 return unicode_ready(&op, 0);
1416}
1417
Alexander Belopolsky40018472011-02-26 01:02:56 +00001418static void
1419unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001420{
Walter Dörwald16807132007-05-25 13:52:07 +00001421 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001422 case SSTATE_NOT_INTERNED:
1423 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001424
Benjamin Peterson29060642009-01-31 22:14:21 +00001425 case SSTATE_INTERNED_MORTAL:
1426 /* revive dead object temporarily for DelItem */
1427 Py_REFCNT(unicode) = 3;
1428 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
1429 Py_FatalError(
1430 "deletion of interned string failed");
1431 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001432
Benjamin Peterson29060642009-01-31 22:14:21 +00001433 case SSTATE_INTERNED_IMMORTAL:
1434 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001435
Benjamin Peterson29060642009-01-31 22:14:21 +00001436 default:
1437 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001438 }
1439
Victor Stinner03490912011-10-03 23:45:12 +02001440 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001441 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001442 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001443 PyObject_DEL(_PyUnicode_UTF8(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001444
1445 if (PyUnicode_IS_COMPACT(unicode)) {
1446 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001447 }
1448 else {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001449 if (_PyUnicode_DATA_ANY(unicode))
1450 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Benjamin Peterson29060642009-01-31 22:14:21 +00001451 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001452 }
1453}
1454
#ifdef Py_DEBUG
/* Debug-only predicate: return 1 when `unicode` is the shared empty string
   or the object cached in unicode_latin1[] for its single character,
   0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    /* Only one-character strings that are not of the wchar kind can be
       cache entries. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    if (ch >= 256)
        return 0;
    return unicode_latin1[ch] == unicode;
}
#endif
1471
Alexander Belopolsky40018472011-02-26 01:02:56 +00001472static int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001473unicode_resizable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001474{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001475 if (Py_REFCNT(unicode) != 1)
1476 return 0;
1477 if (PyUnicode_CHECK_INTERNED(unicode))
1478 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001479#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001480 /* singleton refcount is greater than 1 */
1481 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001482#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001483 return 1;
1484}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001485
Victor Stinnerfe226c02011-10-03 03:52:20 +02001486static int
1487unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1488{
1489 PyObject *unicode;
1490 Py_ssize_t old_length;
1491
1492 assert(p_unicode != NULL);
1493 unicode = *p_unicode;
1494
1495 assert(unicode != NULL);
1496 assert(PyUnicode_Check(unicode));
1497 assert(0 <= length);
1498
Victor Stinner910337b2011-10-03 03:20:16 +02001499 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001500 old_length = PyUnicode_WSTR_LENGTH(unicode);
1501 else
1502 old_length = PyUnicode_GET_LENGTH(unicode);
1503 if (old_length == length)
1504 return 0;
1505
Victor Stinnerfe226c02011-10-03 03:52:20 +02001506 if (!unicode_resizable(unicode)) {
1507 PyObject *copy = resize_copy(unicode, length);
1508 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001509 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001510 Py_DECREF(*p_unicode);
1511 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001512 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001513 }
1514
Victor Stinnerfe226c02011-10-03 03:52:20 +02001515 if (PyUnicode_IS_COMPACT(unicode)) {
1516 *p_unicode = resize_compact(unicode, length);
1517 if (*p_unicode == NULL)
1518 return -1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001519 assert(_PyUnicode_CheckConsistency(*p_unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001520 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001521 }
1522 return resize_inplace((PyUnicodeObject*)unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001523}
1524
Alexander Belopolsky40018472011-02-26 01:02:56 +00001525int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001526PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001527{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001528 PyObject *unicode;
1529 if (p_unicode == NULL) {
1530 PyErr_BadInternalCall();
1531 return -1;
1532 }
1533 unicode = *p_unicode;
1534 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0
1535 || _PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND)
1536 {
1537 PyErr_BadInternalCall();
1538 return -1;
1539 }
1540 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001541}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001542
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001543static PyObject*
1544get_latin1_char(unsigned char ch)
1545{
Victor Stinnera464fc12011-10-02 20:39:30 +02001546 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001547 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001548 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001549 if (!unicode)
1550 return NULL;
1551 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001552 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001553 unicode_latin1[ch] = unicode;
1554 }
1555 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001556 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001557}
1558
Alexander Belopolsky40018472011-02-26 01:02:56 +00001559PyObject *
1560PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001561{
1562 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001563 Py_UCS4 maxchar = 0;
1564 Py_ssize_t num_surrogates;
1565
1566 if (u == NULL)
1567 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001568
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001569 /* If the Unicode data is known at construction time, we can apply
1570 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001571
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001572 /* Optimization for empty strings */
1573 if (size == 0 && unicode_empty != NULL) {
1574 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001575 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001576 }
Tim Petersced69f82003-09-16 20:30:58 +00001577
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001578 /* Single character Unicode objects in the Latin-1 range are
1579 shared when using this constructor */
1580 if (size == 1 && *u < 256)
1581 return get_latin1_char((unsigned char)*u);
1582
1583 /* If not empty and not single character, copy the Unicode data
1584 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001585 if (find_maxchar_surrogates(u, u + size,
1586 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001587 return NULL;
1588
1589 unicode = (PyUnicodeObject *) PyUnicode_New(size - num_surrogates,
1590 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001591 if (!unicode)
1592 return NULL;
1593
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001594 switch (PyUnicode_KIND(unicode)) {
1595 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001596 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001597 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1598 break;
1599 case PyUnicode_2BYTE_KIND:
1600#if Py_UNICODE_SIZE == 2
1601 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1602#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001603 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001604 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1605#endif
1606 break;
1607 case PyUnicode_4BYTE_KIND:
1608#if SIZEOF_WCHAR_T == 2
1609 /* This is the only case which has to process surrogates, thus
1610 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001611 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001612#else
1613 assert(num_surrogates == 0);
1614 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1615#endif
1616 break;
1617 default:
1618 assert(0 && "Impossible state");
1619 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001620
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001621 assert(_PyUnicode_CheckConsistency(unicode, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001622 return (PyObject *)unicode;
1623}
1624
Alexander Belopolsky40018472011-02-26 01:02:56 +00001625PyObject *
1626PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001627{
1628 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +00001629
Benjamin Peterson14339b62009-01-31 16:36:08 +00001630 if (size < 0) {
1631 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001632 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001633 return NULL;
1634 }
Christian Heimes33fe8092008-04-13 13:53:33 +00001635
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001636 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +00001637 some optimizations which share commonly used objects.
1638 Also, this means the input must be UTF-8, so fall back to the
1639 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001640 if (u != NULL) {
1641
Benjamin Peterson29060642009-01-31 22:14:21 +00001642 /* Optimization for empty strings */
1643 if (size == 0 && unicode_empty != NULL) {
1644 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001645 return unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001646 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001647
1648 /* Single characters are shared when using this constructor.
1649 Restrict to ASCII, since the input must be UTF-8. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001650 if (size == 1 && Py_CHARMASK(*u) < 128)
1651 return get_latin1_char(Py_CHARMASK(*u));
Martin v. Löwis9c121062007-08-05 20:26:11 +00001652
1653 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001654 }
1655
Walter Dörwald55507312007-05-18 13:12:10 +00001656 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001657 if (!unicode)
1658 return NULL;
1659
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001660 return (PyObject *)unicode;
1661}
1662
Alexander Belopolsky40018472011-02-26 01:02:56 +00001663PyObject *
1664PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001665{
1666 size_t size = strlen(u);
1667 if (size > PY_SSIZE_T_MAX) {
1668 PyErr_SetString(PyExc_OverflowError, "input too long");
1669 return NULL;
1670 }
1671
1672 return PyUnicode_FromStringAndSize(u, size);
1673}
1674
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001675PyObject *
1676_PyUnicode_FromId(_Py_Identifier *id)
1677{
1678 if (!id->object) {
1679 id->object = PyUnicode_FromString(id->string);
1680 if (!id->object)
1681 return NULL;
1682 PyUnicode_InternInPlace(&id->object);
1683 assert(!id->next);
1684 id->next = static_strings;
1685 static_strings = id;
1686 }
1687 Py_INCREF(id->object);
1688 return id->object;
1689}
1690
1691void
1692_PyUnicode_ClearStaticStrings()
1693{
1694 _Py_Identifier *i;
1695 for (i = static_strings; i; i = i->next) {
1696 Py_DECREF(i->object);
1697 i->object = NULL;
1698 i->next = NULL;
1699 }
1700}
1701
Victor Stinnere57b1c02011-09-28 22:20:48 +02001702static PyObject*
Victor Stinner0617b6e2011-10-05 23:26:01 +02001703unicode_fromascii(const unsigned char* s, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001704{
Victor Stinner0617b6e2011-10-05 23:26:01 +02001705 PyObject *res;
1706#ifdef Py_DEBUG
1707 const unsigned char *p;
1708 const unsigned char *end = s + size;
1709 for (p=s; p < end; p++) {
1710 assert(*p < 128);
1711 }
1712#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001713 if (size == 1)
1714 return get_latin1_char(s[0]);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001715 res = PyUnicode_New(size, 127);
Victor Stinner702c7342011-10-05 13:50:52 +02001716 if (!res)
1717 return NULL;
Victor Stinner0617b6e2011-10-05 23:26:01 +02001718 memcpy(PyUnicode_1BYTE_DATA(res), s, size);
Victor Stinner702c7342011-10-05 13:50:52 +02001719 return res;
1720}
1721
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001722static Py_UCS4
1723kind_maxchar_limit(unsigned int kind)
1724{
1725 switch(kind) {
1726 case PyUnicode_1BYTE_KIND:
1727 return 0x80;
1728 case PyUnicode_2BYTE_KIND:
1729 return 0x100;
1730 case PyUnicode_4BYTE_KIND:
1731 return 0x10000;
1732 default:
1733 assert(0 && "invalid kind");
1734 return 0x10ffff;
1735 }
1736}
1737
Victor Stinner702c7342011-10-05 13:50:52 +02001738static PyObject*
Victor Stinnere57b1c02011-09-28 22:20:48 +02001739_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001740{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001741 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001742 unsigned char max_char = 127;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001743
1744 assert(size >= 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001745 if (size == 1)
1746 return get_latin1_char(u[0]);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001747 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001748 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001749 if (!res)
1750 return NULL;
1751 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001752 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001753 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001754}
1755
Victor Stinnere57b1c02011-09-28 22:20:48 +02001756static PyObject*
1757_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001758{
1759 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001760 Py_UCS2 max_char = 0;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001761
1762 assert(size >= 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001763 if (size == 1 && u[0] < 256)
Victor Stinner4e101002011-10-11 23:27:52 +02001764 return get_latin1_char((unsigned char)u[0]);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001765 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001766 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001767 if (!res)
1768 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001769 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001770 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001771 else {
1772 _PyUnicode_CONVERT_BYTES(
1773 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1774 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001775 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001776 return res;
1777}
1778
Victor Stinnere57b1c02011-09-28 22:20:48 +02001779static PyObject*
1780_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001781{
1782 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001783 Py_UCS4 max_char = 0;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001784
1785 assert(size >= 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001786 if (size == 1 && u[0] < 256)
1787 return get_latin1_char(u[0]);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001788 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001789 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001790 if (!res)
1791 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02001792 if (max_char < 256)
1793 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
1794 PyUnicode_1BYTE_DATA(res));
1795 else if (max_char < 0x10000)
1796 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
1797 PyUnicode_2BYTE_DATA(res));
1798 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001799 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001800 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001801 return res;
1802}
1803
1804PyObject*
1805PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1806{
1807 switch(kind) {
1808 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001809 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001810 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001811 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001812 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001813 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001814 default:
1815 assert(0 && "invalid kind");
1816 PyErr_SetString(PyExc_SystemError, "invalid kind");
1817 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001818 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001819}
1820
Victor Stinner25a4b292011-10-06 12:31:55 +02001821/* Ensure that a string uses the most efficient storage, if it is not the
1822 case: create a new string with of the right kind. Write NULL into *p_unicode
1823 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02001824static void
Victor Stinner25a4b292011-10-06 12:31:55 +02001825unicode_adjust_maxchar(PyObject **p_unicode)
1826{
1827 PyObject *unicode, *copy;
1828 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001829 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02001830 unsigned int kind;
1831
1832 assert(p_unicode != NULL);
1833 unicode = *p_unicode;
1834 assert(PyUnicode_IS_READY(unicode));
1835 if (PyUnicode_IS_ASCII(unicode))
1836 return;
1837
1838 len = PyUnicode_GET_LENGTH(unicode);
1839 kind = PyUnicode_KIND(unicode);
1840 if (kind == PyUnicode_1BYTE_KIND) {
1841 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001842 max_char = ucs1lib_find_max_char(u, u + len);
1843 if (max_char >= 128)
1844 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001845 }
1846 else if (kind == PyUnicode_2BYTE_KIND) {
1847 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001848 max_char = ucs2lib_find_max_char(u, u + len);
1849 if (max_char >= 256)
1850 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001851 }
1852 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001853 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02001854 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001855 max_char = ucs4lib_find_max_char(u, u + len);
1856 if (max_char >= 0x10000)
1857 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001858 }
Victor Stinner25a4b292011-10-06 12:31:55 +02001859 copy = PyUnicode_New(len, max_char);
1860 copy_characters(copy, 0, unicode, 0, len);
1861 Py_DECREF(unicode);
1862 *p_unicode = copy;
1863}
1864
Victor Stinner034f6cf2011-09-30 02:26:44 +02001865PyObject*
1866PyUnicode_Copy(PyObject *unicode)
1867{
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001868 Py_ssize_t size;
1869 PyObject *copy;
1870 void *data;
1871
Victor Stinner034f6cf2011-09-30 02:26:44 +02001872 if (!PyUnicode_Check(unicode)) {
1873 PyErr_BadInternalCall();
1874 return NULL;
1875 }
1876 if (PyUnicode_READY(unicode))
1877 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001878
1879 size = PyUnicode_GET_LENGTH(unicode);
1880 copy = PyUnicode_New(size, PyUnicode_MAX_CHAR_VALUE(unicode));
1881 if (!copy)
1882 return NULL;
1883 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1884
1885 data = PyUnicode_DATA(unicode);
1886 switch (PyUnicode_KIND(unicode))
1887 {
1888 case PyUnicode_1BYTE_KIND:
1889 memcpy(PyUnicode_1BYTE_DATA(copy), data, size);
1890 break;
1891 case PyUnicode_2BYTE_KIND:
1892 memcpy(PyUnicode_2BYTE_DATA(copy), data, sizeof(Py_UCS2) * size);
1893 break;
1894 case PyUnicode_4BYTE_KIND:
1895 memcpy(PyUnicode_4BYTE_DATA(copy), data, sizeof(Py_UCS4) * size);
1896 break;
1897 default:
1898 assert(0);
1899 break;
1900 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001901 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001902 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02001903}
1904
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001905
Victor Stinnerbc603d12011-10-02 01:00:40 +02001906/* Widen Unicode objects to larger buffers. Don't write terminating null
1907 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001908
1909void*
1910_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1911{
Victor Stinnerbc603d12011-10-02 01:00:40 +02001912 Py_ssize_t len;
1913 void *result;
1914 unsigned int skind;
1915
1916 if (PyUnicode_READY(s))
1917 return NULL;
1918
1919 len = PyUnicode_GET_LENGTH(s);
1920 skind = PyUnicode_KIND(s);
1921 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02001922 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001923 return NULL;
1924 }
1925 switch(kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02001926 case PyUnicode_2BYTE_KIND:
1927 result = PyMem_Malloc(len * sizeof(Py_UCS2));
1928 if (!result)
1929 return PyErr_NoMemory();
1930 assert(skind == PyUnicode_1BYTE_KIND);
1931 _PyUnicode_CONVERT_BYTES(
1932 Py_UCS1, Py_UCS2,
1933 PyUnicode_1BYTE_DATA(s),
1934 PyUnicode_1BYTE_DATA(s) + len,
1935 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001936 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001937 case PyUnicode_4BYTE_KIND:
1938 result = PyMem_Malloc(len * sizeof(Py_UCS4));
1939 if (!result)
1940 return PyErr_NoMemory();
1941 if (skind == PyUnicode_2BYTE_KIND) {
1942 _PyUnicode_CONVERT_BYTES(
1943 Py_UCS2, Py_UCS4,
1944 PyUnicode_2BYTE_DATA(s),
1945 PyUnicode_2BYTE_DATA(s) + len,
1946 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001947 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02001948 else {
1949 assert(skind == PyUnicode_1BYTE_KIND);
1950 _PyUnicode_CONVERT_BYTES(
1951 Py_UCS1, Py_UCS4,
1952 PyUnicode_1BYTE_DATA(s),
1953 PyUnicode_1BYTE_DATA(s) + len,
1954 result);
1955 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001956 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001957 default:
1958 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001959 }
Victor Stinner01698042011-10-04 00:04:26 +02001960 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001961 return NULL;
1962}
1963
1964static Py_UCS4*
1965as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1966 int copy_null)
1967{
1968 int kind;
1969 void *data;
1970 Py_ssize_t len, targetlen;
1971 if (PyUnicode_READY(string) == -1)
1972 return NULL;
1973 kind = PyUnicode_KIND(string);
1974 data = PyUnicode_DATA(string);
1975 len = PyUnicode_GET_LENGTH(string);
1976 targetlen = len;
1977 if (copy_null)
1978 targetlen++;
1979 if (!target) {
1980 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
1981 PyErr_NoMemory();
1982 return NULL;
1983 }
1984 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
1985 if (!target) {
1986 PyErr_NoMemory();
1987 return NULL;
1988 }
1989 }
1990 else {
1991 if (targetsize < targetlen) {
1992 PyErr_Format(PyExc_SystemError,
1993 "string is longer than the buffer");
1994 if (copy_null && 0 < targetsize)
1995 target[0] = 0;
1996 return NULL;
1997 }
1998 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02001999 if (kind == PyUnicode_1BYTE_KIND) {
2000 Py_UCS1 *start = (Py_UCS1 *) data;
2001 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002002 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002003 else if (kind == PyUnicode_2BYTE_KIND) {
2004 Py_UCS2 *start = (Py_UCS2 *) data;
2005 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2006 }
2007 else {
2008 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002009 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002010 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002011 if (copy_null)
2012 target[len] = 0;
2013 return target;
2014}
2015
2016Py_UCS4*
2017PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2018 int copy_null)
2019{
2020 if (target == NULL || targetsize < 1) {
2021 PyErr_BadInternalCall();
2022 return NULL;
2023 }
2024 return as_ucs4(string, target, targetsize, copy_null);
2025}
2026
2027Py_UCS4*
2028PyUnicode_AsUCS4Copy(PyObject *string)
2029{
2030 return as_ucs4(string, NULL, 0, 1);
2031}
2032
#ifdef HAVE_WCHAR_H

/* Create a unicode object from the wide-character buffer `w`.

   A size of -1 means "measure the buffer with wcslen()".  With w == NULL
   an empty string is returned when size is 0; any other size is reported
   through PyErr_BadInternalCall().  The conversion itself is done by
   PyUnicode_FromUnicode(). */
PyObject *
PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
{
    if (w != NULL) {
        if (size == -1) {
            size = wcslen(w);
        }
        return PyUnicode_FromUnicode(w, size);
    }

    if (size != 0) {
        PyErr_BadInternalCall();
        return NULL;
    }
    return PyUnicode_New(0, 0);
}

#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002053
Walter Dörwald346737f2007-05-31 10:44:43 +00002054static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002055makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
2056 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002057{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002058 *fmt++ = '%';
2059 if (width) {
2060 if (zeropad)
2061 *fmt++ = '0';
2062 fmt += sprintf(fmt, "%d", width);
2063 }
2064 if (precision)
2065 fmt += sprintf(fmt, ".%d", precision);
2066 if (longflag)
2067 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002068 else if (longlongflag) {
2069 /* longlongflag should only ever be nonzero on machines with
2070 HAVE_LONG_LONG defined */
2071#ifdef HAVE_LONG_LONG
2072 char *f = PY_FORMAT_LONG_LONG;
2073 while (*f)
2074 *fmt++ = *f++;
2075#else
2076 /* we shouldn't ever get here */
2077 assert(0);
2078 *fmt++ = 'l';
2079#endif
2080 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002081 else if (size_tflag) {
2082 char *f = PY_FORMAT_SIZE_T;
2083 while (*f)
2084 *fmt++ = *f++;
2085 }
2086 *fmt++ = c;
2087 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002088}
2089
/* Helper for PyUnicode_FromFormatV(): parse the part of a format
   specification that follows the '%' at `f`.

   Reads an optional width, an optional ".precision" and an optional
   length modifier ("l", "ll" when HAVE_LONG_LONG is defined, or "z"; a
   modifier only counts when directly followed by 'd', 'u' or 'i').  Each
   result is stored through the matching output pointer unless that
   pointer is NULL.  Returns the position reached in the format string. */

static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int width = 0;
    int precision = 0;
    int longflag = 0;
    int longlongflag = 0;
    int size_tflag = 0;

    /* width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    for (++f; Py_ISDIGIT((unsigned)*f); ++f)
        width = (width*10) + *f - '0';
    if (*f == '.') {
        for (++f; Py_ISDIGIT((unsigned)*f); ++f)
            precision = (precision*10) + *f - '0';
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            --f;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => step back, f points to "1" */
        --f;
    }

    /* %ld, %lu, %lld and %llu */
    if (*f == 'l') {
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            f += 1;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
#endif
    }
    /* the size_t flag */
    else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
        size_tflag = 1;
        f += 1;
    }

    /* hand the results back through the non-NULL output pointers */
    if (p_width != NULL)
        *p_width = width;
    if (p_precision != NULL)
        *p_precision = precision;
    if (p_longflag != NULL)
        *p_longflag = longflag;
    if (p_longlongflag != NULL)
        *p_longlongflag = longlongflag;
    if (p_size_tflag != NULL)
        *p_size_tflag = size_tflag;
    return f;
}
2154
/* Upper bound on the number of characters produced by %ld: 21 characters
   are enough for a 64-bit integer written in decimal plus an optional
   sign. */
#define MAX_LONG_CHARS 21
/* Upper bound on the number of characters produced by %lld: at most
   ceil(log10(256)*SIZEOF_LONG_LONG) digits plus 1 for the sign, where
   53/22 serves as an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2162
Walter Dörwaldd2034312007-05-18 16:29:38 +00002163PyObject *
2164PyUnicode_FromFormatV(const char *format, va_list vargs)
2165{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002166 va_list count;
2167 Py_ssize_t callcount = 0;
2168 PyObject **callresults = NULL;
2169 PyObject **callresult = NULL;
2170 Py_ssize_t n = 0;
2171 int width = 0;
2172 int precision = 0;
2173 int zeropad;
2174 const char* f;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002175 PyObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002176 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002177 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002178 Py_UCS4 maxchar = 127; /* result is ASCII by default */
2179 Py_UCS4 argmaxchar;
2180 Py_ssize_t numbersize = 0;
2181 char *numberresults = NULL;
2182 char *numberresult = NULL;
2183 Py_ssize_t i;
2184 int kind;
2185 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002186
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002187 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002188 /* step 1: count the number of %S/%R/%A/%s format specifications
2189 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
2190 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002191 * result in an array)
Georg Brandl7597add2011-10-05 16:36:47 +02002192 * also estimate a upper bound for all the number formats in the string,
2193 * numbers will be formatted in step 3 and be kept in a '\0'-separated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002194 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00002195 for (f = format; *f; f++) {
2196 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002197 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002198 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
2199 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
2200 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
2201 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002202
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002203 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002204#ifdef HAVE_LONG_LONG
2205 if (longlongflag) {
2206 if (width < MAX_LONG_LONG_CHARS)
2207 width = MAX_LONG_LONG_CHARS;
2208 }
2209 else
2210#endif
2211 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
2212 including sign. Decimal takes the most space. This
2213 isn't enough for octal. If a width is specified we
2214 need more (which we allocate later). */
2215 if (width < MAX_LONG_CHARS)
2216 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002217
2218 /* account for the size + '\0' to separate numbers
2219 inside of the numberresults buffer */
2220 numbersize += (width + 1);
2221 }
2222 }
2223 else if ((unsigned char)*f > 127) {
2224 PyErr_Format(PyExc_ValueError,
2225 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2226 "string, got a non-ASCII byte: 0x%02x",
2227 (unsigned char)*f);
2228 return NULL;
2229 }
2230 }
2231 /* step 2: allocate memory for the results of
2232 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
2233 if (callcount) {
2234 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
2235 if (!callresults) {
2236 PyErr_NoMemory();
2237 return NULL;
2238 }
2239 callresult = callresults;
2240 }
2241 /* step 2.5: allocate memory for the results of formating numbers */
2242 if (numbersize) {
2243 numberresults = PyObject_Malloc(numbersize);
2244 if (!numberresults) {
2245 PyErr_NoMemory();
2246 goto fail;
2247 }
2248 numberresult = numberresults;
2249 }
2250
2251 /* step 3: format numbers and figure out how large a buffer we need */
2252 for (f = format; *f; f++) {
2253 if (*f == '%') {
2254 const char* p;
2255 int longflag;
2256 int longlongflag;
2257 int size_tflag;
2258 int numprinted;
2259
2260 p = f;
2261 zeropad = (f[1] == '0');
2262 f = parse_format_flags(f, &width, &precision,
2263 &longflag, &longlongflag, &size_tflag);
2264 switch (*f) {
2265 case 'c':
2266 {
2267 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002268 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002269 n++;
2270 break;
2271 }
2272 case '%':
2273 n++;
2274 break;
2275 case 'i':
2276 case 'd':
2277 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2278 width, precision, *f);
2279 if (longflag)
2280 numprinted = sprintf(numberresult, fmt,
2281 va_arg(count, long));
2282#ifdef HAVE_LONG_LONG
2283 else if (longlongflag)
2284 numprinted = sprintf(numberresult, fmt,
2285 va_arg(count, PY_LONG_LONG));
2286#endif
2287 else if (size_tflag)
2288 numprinted = sprintf(numberresult, fmt,
2289 va_arg(count, Py_ssize_t));
2290 else
2291 numprinted = sprintf(numberresult, fmt,
2292 va_arg(count, int));
2293 n += numprinted;
2294 /* advance by +1 to skip over the '\0' */
2295 numberresult += (numprinted + 1);
2296 assert(*(numberresult - 1) == '\0');
2297 assert(*(numberresult - 2) != '\0');
2298 assert(numprinted >= 0);
2299 assert(numberresult <= numberresults + numbersize);
2300 break;
2301 case 'u':
2302 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2303 width, precision, 'u');
2304 if (longflag)
2305 numprinted = sprintf(numberresult, fmt,
2306 va_arg(count, unsigned long));
2307#ifdef HAVE_LONG_LONG
2308 else if (longlongflag)
2309 numprinted = sprintf(numberresult, fmt,
2310 va_arg(count, unsigned PY_LONG_LONG));
2311#endif
2312 else if (size_tflag)
2313 numprinted = sprintf(numberresult, fmt,
2314 va_arg(count, size_t));
2315 else
2316 numprinted = sprintf(numberresult, fmt,
2317 va_arg(count, unsigned int));
2318 n += numprinted;
2319 numberresult += (numprinted + 1);
2320 assert(*(numberresult - 1) == '\0');
2321 assert(*(numberresult - 2) != '\0');
2322 assert(numprinted >= 0);
2323 assert(numberresult <= numberresults + numbersize);
2324 break;
2325 case 'x':
2326 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2327 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2328 n += numprinted;
2329 numberresult += (numprinted + 1);
2330 assert(*(numberresult - 1) == '\0');
2331 assert(*(numberresult - 2) != '\0');
2332 assert(numprinted >= 0);
2333 assert(numberresult <= numberresults + numbersize);
2334 break;
2335 case 'p':
2336 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2337 /* %p is ill-defined: ensure leading 0x. */
2338 if (numberresult[1] == 'X')
2339 numberresult[1] = 'x';
2340 else if (numberresult[1] != 'x') {
2341 memmove(numberresult + 2, numberresult,
2342 strlen(numberresult) + 1);
2343 numberresult[0] = '0';
2344 numberresult[1] = 'x';
2345 numprinted += 2;
2346 }
2347 n += numprinted;
2348 numberresult += (numprinted + 1);
2349 assert(*(numberresult - 1) == '\0');
2350 assert(*(numberresult - 2) != '\0');
2351 assert(numprinted >= 0);
2352 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002353 break;
2354 case 's':
2355 {
2356 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002357 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002358 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
2359 if (!str)
2360 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002361 /* since PyUnicode_DecodeUTF8 returns already flexible
2362 unicode objects, there is no need to call ready on them */
2363 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002364 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002365 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002366 /* Remember the str and switch to the next slot */
2367 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002368 break;
2369 }
2370 case 'U':
2371 {
2372 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002373 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002374 if (PyUnicode_READY(obj) == -1)
2375 goto fail;
2376 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002377 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002378 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002379 break;
2380 }
2381 case 'V':
2382 {
2383 PyObject *obj = va_arg(count, PyObject *);
2384 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002385 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002386 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002387 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002388 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002389 if (PyUnicode_READY(obj) == -1)
2390 goto fail;
2391 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002392 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002393 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002394 *callresult++ = NULL;
2395 }
2396 else {
2397 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
2398 if (!str_obj)
2399 goto fail;
Victor Stinnere1335c72011-10-04 20:53:03 +02002400 if (PyUnicode_READY(str_obj)) {
2401 Py_DECREF(str_obj);
2402 goto fail;
2403 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002404 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002405 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002406 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002407 *callresult++ = str_obj;
2408 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002409 break;
2410 }
2411 case 'S':
2412 {
2413 PyObject *obj = va_arg(count, PyObject *);
2414 PyObject *str;
2415 assert(obj);
2416 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002417 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002418 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002419 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002420 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002421 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002422 /* Remember the str and switch to the next slot */
2423 *callresult++ = str;
2424 break;
2425 }
2426 case 'R':
2427 {
2428 PyObject *obj = va_arg(count, PyObject *);
2429 PyObject *repr;
2430 assert(obj);
2431 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002432 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002433 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002434 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002435 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002436 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002437 /* Remember the repr and switch to the next slot */
2438 *callresult++ = repr;
2439 break;
2440 }
2441 case 'A':
2442 {
2443 PyObject *obj = va_arg(count, PyObject *);
2444 PyObject *ascii;
2445 assert(obj);
2446 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002447 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002448 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002449 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002450 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002451 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002452 /* Remember the repr and switch to the next slot */
2453 *callresult++ = ascii;
2454 break;
2455 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002456 default:
2457 /* if we stumble upon an unknown
2458 formatting code, copy the rest of
2459 the format string to the output
2460 string. (we cannot just skip the
2461 code, since there's no way to know
2462 what's in the argument list) */
2463 n += strlen(p);
2464 goto expand;
2465 }
2466 } else
2467 n++;
2468 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002469 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002470 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002471 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002472 we don't have to resize the string.
2473 There can be no errors beyond this point. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002474 string = PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002475 if (!string)
2476 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002477 kind = PyUnicode_KIND(string);
2478 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002479 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002480 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002481
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002482 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002483 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002484 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002485
2486 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002487 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2488 /* checking for == because the last argument could be a empty
2489 string, which causes i to point to end, the assert at the end of
2490 the loop */
2491 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002492
Benjamin Peterson14339b62009-01-31 16:36:08 +00002493 switch (*f) {
2494 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002495 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002496 const int ordinal = va_arg(vargs, int);
2497 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002498 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002499 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002500 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002501 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002502 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002503 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002504 case 'p':
2505 /* unused, since we already have the result */
2506 if (*f == 'p')
2507 (void) va_arg(vargs, void *);
2508 else
2509 (void) va_arg(vargs, int);
2510 /* extract the result from numberresults and append. */
2511 for (; *numberresult; ++i, ++numberresult)
2512 PyUnicode_WRITE(kind, data, i, *numberresult);
2513 /* skip over the separating '\0' */
2514 assert(*numberresult == '\0');
2515 numberresult++;
2516 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002517 break;
2518 case 's':
2519 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002520 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002521 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002522 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002523 size = PyUnicode_GET_LENGTH(*callresult);
2524 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002525 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002526 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002527 /* We're done with the unicode()/repr() => forget it */
2528 Py_DECREF(*callresult);
2529 /* switch to next unicode()/repr() result */
2530 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002531 break;
2532 }
2533 case 'U':
2534 {
2535 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002536 Py_ssize_t size;
2537 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2538 size = PyUnicode_GET_LENGTH(obj);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002539 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002540 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002541 break;
2542 }
2543 case 'V':
2544 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002545 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002546 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002547 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002548 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002549 size = PyUnicode_GET_LENGTH(obj);
2550 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002551 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002552 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002553 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002554 size = PyUnicode_GET_LENGTH(*callresult);
2555 assert(PyUnicode_KIND(*callresult) <=
2556 PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002557 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002558 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002559 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002560 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002561 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002562 break;
2563 }
2564 case 'S':
2565 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002566 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002567 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002568 Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002569 /* unused, since we already have the result */
2570 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002571 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002572 copy_characters(string, i, *callresult, 0, size);
2573 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002574 /* We're done with the unicode()/repr() => forget it */
2575 Py_DECREF(*callresult);
2576 /* switch to next unicode()/repr() result */
2577 ++callresult;
2578 break;
2579 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002580 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002581 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002582 break;
2583 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002584 for (; *p; ++p, ++i)
2585 PyUnicode_WRITE(kind, data, i, *p);
2586 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002587 goto end;
2588 }
Victor Stinner1205f272010-09-11 00:54:47 +00002589 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002590 else {
2591 assert(i < PyUnicode_GET_LENGTH(string));
2592 PyUnicode_WRITE(kind, data, i++, *f);
2593 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002594 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002595 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002596
Benjamin Peterson29060642009-01-31 22:14:21 +00002597 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002598 if (callresults)
2599 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002600 if (numberresults)
2601 PyObject_Free(numberresults);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002602 assert(_PyUnicode_CheckConsistency(string, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002603 return (PyObject *)string;
Benjamin Peterson29060642009-01-31 22:14:21 +00002604 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002605 if (callresults) {
2606 PyObject **callresult2 = callresults;
2607 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002608 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002609 ++callresult2;
2610 }
2611 PyObject_Free(callresults);
2612 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002613 if (numberresults)
2614 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002615 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002616}
2617
Walter Dörwaldd2034312007-05-18 16:29:38 +00002618PyObject *
2619PyUnicode_FromFormat(const char *format, ...)
2620{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002621 PyObject* ret;
2622 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002623
2624#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002625 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002626#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002627 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002628#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002629 ret = PyUnicode_FromFormatV(format, vargs);
2630 va_end(vargs);
2631 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002632}
2633
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002634#ifdef HAVE_WCHAR_H
2635
Victor Stinner5593d8a2010-10-02 11:11:27 +00002636/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2637 convert a Unicode object to a wide character string.
2638
Victor Stinnerd88d9832011-09-06 02:00:05 +02002639 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002640 character) required to convert the unicode object. Ignore size argument.
2641
Victor Stinnerd88d9832011-09-06 02:00:05 +02002642 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002643 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002644 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002645static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00002646unicode_aswidechar(PyUnicodeObject *unicode,
2647 wchar_t *w,
2648 Py_ssize_t size)
2649{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002650 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002651 const wchar_t *wstr;
2652
2653 wstr = PyUnicode_AsUnicodeAndSize((PyObject *)unicode, &res);
2654 if (wstr == NULL)
2655 return -1;
2656
Victor Stinner5593d8a2010-10-02 11:11:27 +00002657 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002658 if (size > res)
2659 size = res + 1;
2660 else
2661 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002662 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002663 return res;
2664 }
2665 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002666 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002667}
2668
2669Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002670PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002671 wchar_t *w,
2672 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002673{
2674 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002675 PyErr_BadInternalCall();
2676 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002677 }
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002678 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002679}
2680
Victor Stinner137c34c2010-09-29 10:25:54 +00002681wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002682PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002683 Py_ssize_t *size)
2684{
2685 wchar_t* buffer;
2686 Py_ssize_t buflen;
2687
2688 if (unicode == NULL) {
2689 PyErr_BadInternalCall();
2690 return NULL;
2691 }
2692
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002693 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002694 if (buflen == -1)
2695 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002696 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002697 PyErr_NoMemory();
2698 return NULL;
2699 }
2700
Victor Stinner137c34c2010-09-29 10:25:54 +00002701 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2702 if (buffer == NULL) {
2703 PyErr_NoMemory();
2704 return NULL;
2705 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002706 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002707 if (buflen == -1)
2708 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002709 if (size != NULL)
2710 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002711 return buffer;
2712}
2713
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002714#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002715
Alexander Belopolsky40018472011-02-26 01:02:56 +00002716PyObject *
2717PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002718{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002719 PyObject *v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002720 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002721 PyErr_SetString(PyExc_ValueError,
2722 "chr() arg not in range(0x110000)");
2723 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002724 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002725
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002726 if (ordinal < 256)
2727 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002728
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002729 v = PyUnicode_New(1, ordinal);
2730 if (v == NULL)
2731 return NULL;
2732 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002733 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002734 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002735}
2736
Alexander Belopolsky40018472011-02-26 01:02:56 +00002737PyObject *
2738PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002739{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002740 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002741 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002742 if (PyUnicode_CheckExact(obj)) {
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002743 if (PyUnicode_READY(obj))
2744 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002745 Py_INCREF(obj);
2746 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002747 }
2748 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002749 /* For a Unicode subtype that's not a Unicode object,
2750 return a true Unicode object with the same data. */
Victor Stinner2219e0a2011-10-01 01:16:59 +02002751 return PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002752 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002753 PyErr_Format(PyExc_TypeError,
2754 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002755 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002756 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002757}
2758
Alexander Belopolsky40018472011-02-26 01:02:56 +00002759PyObject *
2760PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002761 const char *encoding,
2762 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002763{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002764 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002765 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002766
Guido van Rossumd57fd912000-03-10 22:53:23 +00002767 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002768 PyErr_BadInternalCall();
2769 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002770 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002771
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002772 /* Decoding bytes objects is the most common case and should be fast */
2773 if (PyBytes_Check(obj)) {
2774 if (PyBytes_GET_SIZE(obj) == 0) {
2775 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002776 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002777 }
2778 else {
2779 v = PyUnicode_Decode(
2780 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2781 encoding, errors);
2782 }
2783 return v;
2784 }
2785
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002786 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002787 PyErr_SetString(PyExc_TypeError,
2788 "decoding str is not supported");
2789 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002790 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002791
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002792 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2793 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2794 PyErr_Format(PyExc_TypeError,
2795 "coercing to str: need bytes, bytearray "
2796 "or buffer-like object, %.80s found",
2797 Py_TYPE(obj)->tp_name);
2798 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002799 }
Tim Petersced69f82003-09-16 20:30:58 +00002800
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002801 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002802 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002803 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002804 }
Tim Petersced69f82003-09-16 20:30:58 +00002805 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002806 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002807
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002808 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002809 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002810}
2811
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8.

   The normalized, null-terminated name is written to lower, a buffer of
   lower_len chars.  Return 0 on error (encoding is longer than
   lower_len-1, or lower_len is 0), 1 on success.  On error the content of
   lower is unspecified and not null-terminated. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    /* No room even for the null character: lower_len - 1 below would wrap
       around and the bound check could never trigger. */
    if (lower_len == 0)
        return 0;

    e = encoding;
    l = lower;
    /* last slot of the buffer, reserved for the null character */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
2844
Alexander Belopolsky40018472011-02-26 01:02:56 +00002845PyObject *
2846PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002847 Py_ssize_t size,
2848 const char *encoding,
2849 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002850{
2851 PyObject *buffer = NULL, *unicode;
2852 Py_buffer info;
2853 char lower[11]; /* Enough for any encoding shortcut */
2854
2855 if (encoding == NULL)
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002856 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00002857
2858 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002859 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002860 if ((strcmp(lower, "utf-8") == 0) ||
2861 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002862 return PyUnicode_DecodeUTF8(s, size, errors);
2863 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002864 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002865 (strcmp(lower, "iso-8859-1") == 0))
2866 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002867#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002868 else if (strcmp(lower, "mbcs") == 0)
2869 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002870#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002871 else if (strcmp(lower, "ascii") == 0)
2872 return PyUnicode_DecodeASCII(s, size, errors);
2873 else if (strcmp(lower, "utf-16") == 0)
2874 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2875 else if (strcmp(lower, "utf-32") == 0)
2876 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2877 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002878
2879 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002880 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002881 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002882 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002883 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002884 if (buffer == NULL)
2885 goto onError;
2886 unicode = PyCodec_Decode(buffer, encoding, errors);
2887 if (unicode == NULL)
2888 goto onError;
2889 if (!PyUnicode_Check(unicode)) {
2890 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002891 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002892 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002893 Py_DECREF(unicode);
2894 goto onError;
2895 }
2896 Py_DECREF(buffer);
Victor Stinner17efeed2011-10-04 20:05:46 +02002897#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02002898 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002899 Py_DECREF(unicode);
2900 return NULL;
2901 }
Victor Stinner17efeed2011-10-04 20:05:46 +02002902#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002903 assert(_PyUnicode_CheckConsistency(unicode, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002904 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002905
Benjamin Peterson29060642009-01-31 22:14:21 +00002906 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002907 Py_XDECREF(buffer);
2908 return NULL;
2909}
2910
Alexander Belopolsky40018472011-02-26 01:02:56 +00002911PyObject *
2912PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002913 const char *encoding,
2914 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002915{
2916 PyObject *v;
2917
2918 if (!PyUnicode_Check(unicode)) {
2919 PyErr_BadArgument();
2920 goto onError;
2921 }
2922
2923 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002924 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002925
2926 /* Decode via the codec registry */
2927 v = PyCodec_Decode(unicode, encoding, errors);
2928 if (v == NULL)
2929 goto onError;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002930 assert(_PyUnicode_CheckConsistency(v, 1));
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002931 return v;
2932
Benjamin Peterson29060642009-01-31 22:14:21 +00002933 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002934 return NULL;
2935}
2936
Alexander Belopolsky40018472011-02-26 01:02:56 +00002937PyObject *
2938PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002939 const char *encoding,
2940 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002941{
2942 PyObject *v;
2943
2944 if (!PyUnicode_Check(unicode)) {
2945 PyErr_BadArgument();
2946 goto onError;
2947 }
2948
2949 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002950 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002951
2952 /* Decode via the codec registry */
2953 v = PyCodec_Decode(unicode, encoding, errors);
2954 if (v == NULL)
2955 goto onError;
2956 if (!PyUnicode_Check(v)) {
2957 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002958 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002959 Py_TYPE(v)->tp_name);
2960 Py_DECREF(v);
2961 goto onError;
2962 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002963 assert(_PyUnicode_CheckConsistency(v, 1));
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002964 return v;
2965
Benjamin Peterson29060642009-01-31 22:14:21 +00002966 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002967 return NULL;
2968}
2969
Alexander Belopolsky40018472011-02-26 01:02:56 +00002970PyObject *
2971PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002972 Py_ssize_t size,
2973 const char *encoding,
2974 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002975{
2976 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002977
Guido van Rossumd57fd912000-03-10 22:53:23 +00002978 unicode = PyUnicode_FromUnicode(s, size);
2979 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002980 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002981 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
2982 Py_DECREF(unicode);
2983 return v;
2984}
2985
Alexander Belopolsky40018472011-02-26 01:02:56 +00002986PyObject *
2987PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002988 const char *encoding,
2989 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002990{
2991 PyObject *v;
2992
2993 if (!PyUnicode_Check(unicode)) {
2994 PyErr_BadArgument();
2995 goto onError;
2996 }
2997
2998 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002999 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003000
3001 /* Encode via the codec registry */
3002 v = PyCodec_Encode(unicode, encoding, errors);
3003 if (v == NULL)
3004 goto onError;
3005 return v;
3006
Benjamin Peterson29060642009-01-31 22:14:21 +00003007 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003008 return NULL;
3009}
3010
Victor Stinnerad158722010-10-27 00:25:46 +00003011PyObject *
3012PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003013{
Victor Stinner99b95382011-07-04 14:23:54 +02003014#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003015 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3016 PyUnicode_GET_SIZE(unicode),
3017 NULL);
3018#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003019 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003020#else
Victor Stinner793b5312011-04-27 00:24:21 +02003021 PyInterpreterState *interp = PyThreadState_GET()->interp;
3022 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3023 cannot use it to encode and decode filenames before it is loaded. Load
3024 the Python codec requires to encode at least its own filename. Use the C
3025 version of the locale codec until the codec registry is initialized and
3026 the Python codec is loaded.
3027
3028 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3029 cannot only rely on it: check also interp->fscodec_initialized for
3030 subinterpreters. */
3031 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003032 return PyUnicode_AsEncodedString(unicode,
3033 Py_FileSystemDefaultEncoding,
3034 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003035 }
3036 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003037 /* locale encoding with surrogateescape */
3038 wchar_t *wchar;
3039 char *bytes;
3040 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00003041 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003042
3043 wchar = PyUnicode_AsWideCharString(unicode, NULL);
3044 if (wchar == NULL)
3045 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00003046 bytes = _Py_wchar2char(wchar, &error_pos);
3047 if (bytes == NULL) {
3048 if (error_pos != (size_t)-1) {
3049 char *errmsg = strerror(errno);
3050 PyObject *exc = NULL;
3051 if (errmsg == NULL)
3052 errmsg = "Py_wchar2char() failed";
3053 raise_encode_exception(&exc,
3054 "filesystemencoding",
3055 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
3056 error_pos, error_pos+1,
3057 errmsg);
3058 Py_XDECREF(exc);
3059 }
3060 else
3061 PyErr_NoMemory();
3062 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003063 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00003064 }
3065 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003066
3067 bytes_obj = PyBytes_FromString(bytes);
3068 PyMem_Free(bytes);
3069 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00003070 }
Victor Stinnerad158722010-10-27 00:25:46 +00003071#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003072}
3073
Alexander Belopolsky40018472011-02-26 01:02:56 +00003074PyObject *
3075PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003076 const char *encoding,
3077 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003078{
3079 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003080 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003081
Guido van Rossumd57fd912000-03-10 22:53:23 +00003082 if (!PyUnicode_Check(unicode)) {
3083 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003084 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003085 }
Fred Drakee4315f52000-05-09 19:53:39 +00003086
Victor Stinner2f283c22011-03-02 01:21:46 +00003087 if (encoding == NULL) {
3088 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003089 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003090 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003091 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner2f283c22011-03-02 01:21:46 +00003092 }
Fred Drakee4315f52000-05-09 19:53:39 +00003093
3094 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00003095 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003096 if ((strcmp(lower, "utf-8") == 0) ||
3097 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003098 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003099 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003100 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003101 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003102 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003103 }
Victor Stinner37296e82010-06-10 13:36:23 +00003104 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003105 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003106 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003107 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003108#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00003109 else if (strcmp(lower, "mbcs") == 0)
3110 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3111 PyUnicode_GET_SIZE(unicode),
3112 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003113#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003114 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003115 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003116 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003117
3118 /* Encode via the codec registry */
3119 v = PyCodec_Encode(unicode, encoding, errors);
3120 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003121 return NULL;
3122
3123 /* The normal path */
3124 if (PyBytes_Check(v))
3125 return v;
3126
3127 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003128 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003129 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003130 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003131
3132 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3133 "encoder %s returned bytearray instead of bytes",
3134 encoding);
3135 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003136 Py_DECREF(v);
3137 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003138 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003139
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003140 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3141 Py_DECREF(v);
3142 return b;
3143 }
3144
3145 PyErr_Format(PyExc_TypeError,
3146 "encoder did not return a bytes object (type=%.400s)",
3147 Py_TYPE(v)->tp_name);
3148 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003149 return NULL;
3150}
3151
Alexander Belopolsky40018472011-02-26 01:02:56 +00003152PyObject *
3153PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003154 const char *encoding,
3155 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003156{
3157 PyObject *v;
3158
3159 if (!PyUnicode_Check(unicode)) {
3160 PyErr_BadArgument();
3161 goto onError;
3162 }
3163
3164 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003165 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003166
3167 /* Encode via the codec registry */
3168 v = PyCodec_Encode(unicode, encoding, errors);
3169 if (v == NULL)
3170 goto onError;
3171 if (!PyUnicode_Check(v)) {
3172 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003173 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003174 Py_TYPE(v)->tp_name);
3175 Py_DECREF(v);
3176 goto onError;
3177 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003178 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003179
Benjamin Peterson29060642009-01-31 22:14:21 +00003180 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003181 return NULL;
3182}
3183
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003184PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003185PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003186 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003187 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3188}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003189
Christian Heimes5894ba72007-11-04 11:43:14 +00003190PyObject*
3191PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3192{
Victor Stinner99b95382011-07-04 14:23:54 +02003193#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003194 return PyUnicode_DecodeMBCS(s, size, NULL);
3195#elif defined(__APPLE__)
3196 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
3197#else
Victor Stinner793b5312011-04-27 00:24:21 +02003198 PyInterpreterState *interp = PyThreadState_GET()->interp;
3199 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3200 cannot use it to encode and decode filenames before it is loaded. Load
3201 the Python codec requires to encode at least its own filename. Use the C
3202 version of the locale codec until the codec registry is initialized and
3203 the Python codec is loaded.
3204
3205 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3206 cannot only rely on it: check also interp->fscodec_initialized for
3207 subinterpreters. */
3208 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003209 return PyUnicode_Decode(s, size,
3210 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003211 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003212 }
3213 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003214 /* locale encoding with surrogateescape */
3215 wchar_t *wchar;
3216 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00003217 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003218
3219 if (s[size] != '\0' || size != strlen(s)) {
3220 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3221 return NULL;
3222 }
3223
Victor Stinner168e1172010-10-16 23:16:16 +00003224 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003225 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00003226 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003227
Victor Stinner168e1172010-10-16 23:16:16 +00003228 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003229 PyMem_Free(wchar);
3230 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003231 }
Victor Stinnerad158722010-10-27 00:25:46 +00003232#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003233}
3234
Martin v. Löwis011e8422009-05-05 04:43:17 +00003235
3236int
3237PyUnicode_FSConverter(PyObject* arg, void* addr)
3238{
3239 PyObject *output = NULL;
3240 Py_ssize_t size;
3241 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003242 if (arg == NULL) {
3243 Py_DECREF(*(PyObject**)addr);
3244 return 1;
3245 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003246 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003247 output = arg;
3248 Py_INCREF(output);
3249 }
3250 else {
3251 arg = PyUnicode_FromObject(arg);
3252 if (!arg)
3253 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003254 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003255 Py_DECREF(arg);
3256 if (!output)
3257 return 0;
3258 if (!PyBytes_Check(output)) {
3259 Py_DECREF(output);
3260 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3261 return 0;
3262 }
3263 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003264 size = PyBytes_GET_SIZE(output);
3265 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003266 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003267 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003268 Py_DECREF(output);
3269 return 0;
3270 }
3271 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003272 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003273}
3274
3275
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003276int
3277PyUnicode_FSDecoder(PyObject* arg, void* addr)
3278{
3279 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003280 if (arg == NULL) {
3281 Py_DECREF(*(PyObject**)addr);
3282 return 1;
3283 }
3284 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003285 if (PyUnicode_READY(arg))
3286 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003287 output = arg;
3288 Py_INCREF(output);
3289 }
3290 else {
3291 arg = PyBytes_FromObject(arg);
3292 if (!arg)
3293 return 0;
3294 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3295 PyBytes_GET_SIZE(arg));
3296 Py_DECREF(arg);
3297 if (!output)
3298 return 0;
3299 if (!PyUnicode_Check(output)) {
3300 Py_DECREF(output);
3301 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3302 return 0;
3303 }
3304 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003305 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
3306 PyUnicode_GET_LENGTH(output), 0, 1)) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003307 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3308 Py_DECREF(output);
3309 return 0;
3310 }
3311 *(PyObject**)addr = output;
3312 return Py_CLEANUP_SUPPORTED;
3313}
3314
3315
Martin v. Löwis5b222132007-06-10 09:51:05 +00003316char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003317PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003318{
Christian Heimesf3863112007-11-22 07:46:41 +00003319 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003320 PyUnicodeObject *u = (PyUnicodeObject *)unicode;
3321
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003322 if (!PyUnicode_Check(unicode)) {
3323 PyErr_BadArgument();
3324 return NULL;
3325 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003326 if (PyUnicode_READY(u) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003327 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003328
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003329 if (PyUnicode_UTF8(unicode) == NULL) {
3330 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003331 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3332 if (bytes == NULL)
3333 return NULL;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003334 _PyUnicode_UTF8(u) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3335 if (_PyUnicode_UTF8(u) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003336 Py_DECREF(bytes);
3337 return NULL;
3338 }
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003339 _PyUnicode_UTF8_LENGTH(u) = PyBytes_GET_SIZE(bytes);
3340 Py_MEMCPY(_PyUnicode_UTF8(u), PyBytes_AS_STRING(bytes), _PyUnicode_UTF8_LENGTH(u) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003341 Py_DECREF(bytes);
3342 }
3343
3344 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003345 *psize = PyUnicode_UTF8_LENGTH(unicode);
3346 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003347}
3348
3349char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003350PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003351{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003352 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3353}
3354
3355#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +02003356static int unicode_as_unicode_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003357#endif
3358
3359
3360Py_UNICODE *
3361PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3362{
3363 PyUnicodeObject *u;
3364 const unsigned char *one_byte;
3365#if SIZEOF_WCHAR_T == 4
3366 const Py_UCS2 *two_bytes;
3367#else
3368 const Py_UCS4 *four_bytes;
3369 const Py_UCS4 *ucs4_end;
3370 Py_ssize_t num_surrogates;
3371#endif
3372 wchar_t *w;
3373 wchar_t *wchar_end;
3374
3375 if (!PyUnicode_Check(unicode)) {
3376 PyErr_BadArgument();
3377 return NULL;
3378 }
3379 u = (PyUnicodeObject*)unicode;
3380 if (_PyUnicode_WSTR(u) == NULL) {
3381 /* Non-ASCII compact unicode object */
3382 assert(_PyUnicode_KIND(u) != 0);
3383 assert(PyUnicode_IS_READY(u));
3384
3385#ifdef Py_DEBUG
3386 ++unicode_as_unicode_calls;
3387#endif
3388
3389 if (PyUnicode_KIND(u) == PyUnicode_4BYTE_KIND) {
3390#if SIZEOF_WCHAR_T == 2
3391 four_bytes = PyUnicode_4BYTE_DATA(u);
3392 ucs4_end = four_bytes + _PyUnicode_LENGTH(u);
3393 num_surrogates = 0;
3394
3395 for (; four_bytes < ucs4_end; ++four_bytes) {
3396 if (*four_bytes > 0xFFFF)
3397 ++num_surrogates;
3398 }
3399
3400 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(
3401 sizeof(wchar_t) * (_PyUnicode_LENGTH(u) + 1 + num_surrogates));
3402 if (!_PyUnicode_WSTR(u)) {
3403 PyErr_NoMemory();
3404 return NULL;
3405 }
3406 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u) + num_surrogates;
3407
3408 w = _PyUnicode_WSTR(u);
3409 wchar_end = w + _PyUnicode_WSTR_LENGTH(u);
3410 four_bytes = PyUnicode_4BYTE_DATA(u);
3411 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3412 if (*four_bytes > 0xFFFF) {
3413 /* encode surrogate pair in this case */
3414 *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10);
3415 *w = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF);
3416 }
3417 else
3418 *w = *four_bytes;
3419
3420 if (w > wchar_end) {
3421 assert(0 && "Miscalculated string end");
3422 }
3423 }
3424 *w = 0;
3425#else
3426 /* sizeof(wchar_t) == 4 */
3427 Py_FatalError("Impossible unicode object state, wstr and str "
3428 "should share memory already.");
3429 return NULL;
3430#endif
3431 }
3432 else {
3433 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3434 (_PyUnicode_LENGTH(u) + 1));
3435 if (!_PyUnicode_WSTR(u)) {
3436 PyErr_NoMemory();
3437 return NULL;
3438 }
3439 if (!PyUnicode_IS_COMPACT_ASCII(u))
3440 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u);
3441 w = _PyUnicode_WSTR(u);
3442 wchar_end = w + _PyUnicode_LENGTH(u);
3443
3444 if (PyUnicode_KIND(u) == PyUnicode_1BYTE_KIND) {
3445 one_byte = PyUnicode_1BYTE_DATA(u);
3446 for (; w < wchar_end; ++one_byte, ++w)
3447 *w = *one_byte;
3448 /* null-terminate the wstr */
3449 *w = 0;
3450 }
3451 else if (PyUnicode_KIND(u) == PyUnicode_2BYTE_KIND) {
3452#if SIZEOF_WCHAR_T == 4
3453 two_bytes = PyUnicode_2BYTE_DATA(u);
3454 for (; w < wchar_end; ++two_bytes, ++w)
3455 *w = *two_bytes;
3456 /* null-terminate the wstr */
3457 *w = 0;
3458#else
3459 /* sizeof(wchar_t) == 2 */
3460 PyObject_FREE(_PyUnicode_WSTR(u));
3461 _PyUnicode_WSTR(u) = NULL;
3462 Py_FatalError("Impossible unicode object state, wstr "
3463 "and str should share memory already.");
3464 return NULL;
3465#endif
3466 }
3467 else {
3468 assert(0 && "This should never happen.");
3469 }
3470 }
3471 }
3472 if (size != NULL)
3473 *size = PyUnicode_WSTR_LENGTH(u);
3474 return _PyUnicode_WSTR(u);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003475}
3476
Alexander Belopolsky40018472011-02-26 01:02:56 +00003477Py_UNICODE *
3478PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003479{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003480 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003481}
3482
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003483
Alexander Belopolsky40018472011-02-26 01:02:56 +00003484Py_ssize_t
3485PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003486{
3487 if (!PyUnicode_Check(unicode)) {
3488 PyErr_BadArgument();
3489 goto onError;
3490 }
3491 return PyUnicode_GET_SIZE(unicode);
3492
Benjamin Peterson29060642009-01-31 22:14:21 +00003493 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003494 return -1;
3495}
3496
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003497Py_ssize_t
3498PyUnicode_GetLength(PyObject *unicode)
3499{
Victor Stinner5a706cf2011-10-02 00:36:53 +02003500 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003501 PyErr_BadArgument();
3502 return -1;
3503 }
3504
3505 return PyUnicode_GET_LENGTH(unicode);
3506}
3507
3508Py_UCS4
3509PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3510{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003511 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3512 PyErr_BadArgument();
3513 return (Py_UCS4)-1;
3514 }
3515 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3516 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003517 return (Py_UCS4)-1;
3518 }
3519 return PyUnicode_READ_CHAR(unicode, index);
3520}
3521
3522int
3523PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3524{
3525 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003526 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003527 return -1;
3528 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02003529 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3530 PyErr_SetString(PyExc_IndexError, "string index out of range");
3531 return -1;
3532 }
3533 if (_PyUnicode_Dirty(unicode))
3534 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003535 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3536 index, ch);
3537 return 0;
3538}
3539
/* Return the name of the default encoding, which is always "utf-8".
   The returned string has static storage and must not be modified. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3545
Victor Stinner554f3f02010-06-16 23:33:54 +00003546/* create or adjust a UnicodeDecodeError */
3547static void
3548make_decode_exception(PyObject **exceptionObject,
3549 const char *encoding,
3550 const char *input, Py_ssize_t length,
3551 Py_ssize_t startpos, Py_ssize_t endpos,
3552 const char *reason)
3553{
3554 if (*exceptionObject == NULL) {
3555 *exceptionObject = PyUnicodeDecodeError_Create(
3556 encoding, input, length, startpos, endpos, reason);
3557 }
3558 else {
3559 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3560 goto onError;
3561 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3562 goto onError;
3563 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3564 goto onError;
3565 }
3566 return;
3567
3568onError:
3569 Py_DECREF(*exceptionObject);
3570 *exceptionObject = NULL;
3571}
3572
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003573/* error handling callback helper:
3574 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00003575 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003576 and adjust various state variables.
3577 return 0 on success, -1 on error
3578*/
3579
Alexander Belopolsky40018472011-02-26 01:02:56 +00003580static int
3581unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003582 const char *encoding, const char *reason,
3583 const char **input, const char **inend, Py_ssize_t *startinpos,
3584 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
3585 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003586{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003587 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003588
3589 PyObject *restuple = NULL;
3590 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003591 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003592 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003593 Py_ssize_t requiredsize;
3594 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003595 const Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003596 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003597 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003598 int res = -1;
3599
3600 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003601 *errorHandler = PyCodec_LookupError(errors);
3602 if (*errorHandler == NULL)
3603 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003604 }
3605
Victor Stinner554f3f02010-06-16 23:33:54 +00003606 make_decode_exception(exceptionObject,
3607 encoding,
3608 *input, *inend - *input,
3609 *startinpos, *endinpos,
3610 reason);
3611 if (*exceptionObject == NULL)
3612 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003613
3614 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3615 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003616 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003617 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00003618 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00003619 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003620 }
3621 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003622 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003623
3624 /* Copy back the bytes variables, which might have been modified by the
3625 callback */
3626 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3627 if (!inputobj)
3628 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00003629 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003630 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00003631 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003632 *input = PyBytes_AS_STRING(inputobj);
3633 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003634 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00003635 /* we can DECREF safely, as the exception has another reference,
3636 so the object won't go away. */
3637 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003638
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003639 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003640 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003641 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003642 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3643 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003644 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003645
3646 /* need more space? (at least enough for what we
3647 have+the replacement+the rest of the string (starting
3648 at the new input position), so we won't have to check space
3649 when there are no errors in the rest of the string) */
3650 repptr = PyUnicode_AS_UNICODE(repunicode);
3651 repsize = PyUnicode_GET_SIZE(repunicode);
3652 requiredsize = *outpos + repsize + insize-newpos;
3653 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003654 if (requiredsize<2*outsize)
3655 requiredsize = 2*outsize;
Victor Stinnerfe226c02011-10-03 03:52:20 +02003656 if (PyUnicode_Resize((PyObject**)output, requiredsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003657 goto onError;
3658 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003659 }
3660 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003661 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003662 Py_UNICODE_COPY(*outptr, repptr, repsize);
3663 *outptr += repsize;
3664 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003665
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003666 /* we made it! */
3667 res = 0;
3668
Benjamin Peterson29060642009-01-31 22:14:21 +00003669 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003670 Py_XDECREF(restuple);
3671 return res;
3672}
3673
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003674/* --- UTF-7 Codec -------------------------------------------------------- */
3675
Antoine Pitrou244651a2009-05-04 18:56:13 +00003676/* See RFC2152 for details. We encode conservatively and decode liberally. */
3677
3678/* Three simple macros defining base-64. */
3679
/* Non-zero when c belongs to the 64-character base-64 alphabet
   (A-Z, a-z, 0-9, '+' and '/').  The argument is evaluated more than
   once, so it must not have side effects. */

#define IS_BASE64(c) \
    ((c) == '+' || (c) == '/' || \
     ('0' <= (c) && (c) <= '9') || \
     ('A' <= (c) && (c) <= 'Z') || \
     ('a' <= (c) && (c) <= 'z'))
3687
/* Map a base-64 character to its 6-bit value: 'A'-'Z' -> 0-25,
   'a'-'z' -> 26-51, '0'-'9' -> 52-61, '+' -> 62 and anything else
   (i.e. '/') -> 63.  The caller must already know that c is a base-64
   character; c is evaluated more than once. */

#define FROM_BASE64(c) \
    (((c) == '+') ? 62 : \
     ('0' <= (c) && (c) <= '9') ? (c) - '0' + 52 : \
     ('a' <= (c) && (c) <= 'z') ? (c) - 'a' + 26 : \
     ('A' <= (c) && (c) <= 'Z') ? (c) - 'A' : 63)
3695
/* Base-64 character that encodes the low 6 bits of n (higher bits of n
   are ignored). */

#define TO_BASE64(n) \
    (*("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/" + ((n) & 0x3f)))
3700
/* DECODE_DIRECT: non-zero when a byte found in a UTF-7 string decodes
 * as itself.  Decoding is permissive: every value up to 127 qualifies
 * except '+', which opens a base-64 section. */

#define DECODE_DIRECT(c) \
    ((c) != '+' && (c) < 128)
3708
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code (0-127) and is only ever read,
 * hence it is declared const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
3742
/* ENCODE_DIRECT: non-zero when character c may be emitted as itself by
 * the UTF-7 encoder.  c must be in the range 1..127 and either belong
 * to Set D (category 0), or belong to Set O (category 1) while directO
 * is true, or be whitespace (category 2) while directWS is true.
 * RFC2152 leaves the treatment of Set O and whitespace to the
 * application, which is why both are switchable.  The range test comes
 * first so that utf7_category is never indexed out of bounds. */

#define ENCODE_DIRECT(c, directO, directWS) \
    ((c) > 0 && (c) < 128 && \
     (utf7_category[(c)] == 0 || \
      ((directO) && utf7_category[(c)] == 1) || \
      ((directWS) && utf7_category[(c)] == 2)))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003754
Alexander Belopolsky40018472011-02-26 01:02:56 +00003755PyObject *
3756PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003757 Py_ssize_t size,
3758 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003759{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003760 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3761}
3762
Antoine Pitrou244651a2009-05-04 18:56:13 +00003763/* The decoder. The only state we preserve is our read position,
3764 * i.e. how many characters we have consumed. So if we end in the
3765 * middle of a shift sequence we have to back off the read position
3766 * and the output to the beginning of the sequence, otherwise we lose
3767 * all the shift state (seen bits, number of bits seen, high
3768 * surrogate). */
3769
Alexander Belopolsky40018472011-02-26 01:02:56 +00003770PyObject *
3771PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003772 Py_ssize_t size,
3773 const char *errors,
3774 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003775{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003776 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003777 Py_ssize_t startinpos;
3778 Py_ssize_t endinpos;
3779 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003780 const char *e;
3781 PyUnicodeObject *unicode;
3782 Py_UNICODE *p;
3783 const char *errmsg = "";
3784 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003785 Py_UNICODE *shiftOutStart;
3786 unsigned int base64bits = 0;
3787 unsigned long base64buffer = 0;
3788 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003789 PyObject *errorHandler = NULL;
3790 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003791
3792 unicode = _PyUnicode_New(size);
3793 if (!unicode)
3794 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003795 if (size == 0) {
3796 if (consumed)
3797 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003798 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003799 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003800
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003801 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitrou244651a2009-05-04 18:56:13 +00003802 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003803 e = s + size;
3804
3805 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003806 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00003807 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00003808 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003809
Antoine Pitrou244651a2009-05-04 18:56:13 +00003810 if (inShift) { /* in a base-64 section */
3811 if (IS_BASE64(ch)) { /* consume a base-64 character */
3812 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3813 base64bits += 6;
3814 s++;
3815 if (base64bits >= 16) {
3816 /* we have enough bits for a UTF-16 value */
3817 Py_UNICODE outCh = (Py_UNICODE)
3818 (base64buffer >> (base64bits-16));
3819 base64bits -= 16;
3820 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3821 if (surrogate) {
3822 /* expecting a second surrogate */
3823 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3824#ifdef Py_UNICODE_WIDE
3825 *p++ = (((surrogate & 0x3FF)<<10)
3826 | (outCh & 0x3FF)) + 0x10000;
3827#else
3828 *p++ = surrogate;
3829 *p++ = outCh;
3830#endif
3831 surrogate = 0;
3832 }
3833 else {
3834 surrogate = 0;
3835 errmsg = "second surrogate missing";
3836 goto utf7Error;
3837 }
3838 }
3839 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
3840 /* first surrogate */
3841 surrogate = outCh;
3842 }
3843 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3844 errmsg = "unexpected second surrogate";
3845 goto utf7Error;
3846 }
3847 else {
3848 *p++ = outCh;
3849 }
3850 }
3851 }
3852 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003853 inShift = 0;
3854 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003855 if (surrogate) {
3856 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00003857 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003858 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003859 if (base64bits > 0) { /* left-over bits */
3860 if (base64bits >= 6) {
3861 /* We've seen at least one base-64 character */
3862 errmsg = "partial character in shift sequence";
3863 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003864 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003865 else {
3866 /* Some bits remain; they should be zero */
3867 if (base64buffer != 0) {
3868 errmsg = "non-zero padding bits in shift sequence";
3869 goto utf7Error;
3870 }
3871 }
3872 }
3873 if (ch != '-') {
3874 /* '-' is absorbed; other terminating
3875 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003876 *p++ = ch;
3877 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003878 }
3879 }
3880 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003881 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003882 s++; /* consume '+' */
3883 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003884 s++;
3885 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00003886 }
3887 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003888 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003889 shiftOutStart = p;
3890 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003891 }
3892 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003893 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003894 *p++ = ch;
3895 s++;
3896 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003897 else {
3898 startinpos = s-starts;
3899 s++;
3900 errmsg = "unexpected special character";
3901 goto utf7Error;
3902 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003903 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003904utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003905 outpos = p-PyUnicode_AS_UNICODE(unicode);
3906 endinpos = s-starts;
3907 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003908 errors, &errorHandler,
3909 "utf7", errmsg,
3910 &starts, &e, &startinpos, &endinpos, &exc, &s,
3911 &unicode, &outpos, &p))
3912 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003913 }
3914
Antoine Pitrou244651a2009-05-04 18:56:13 +00003915 /* end of string */
3916
3917 if (inShift && !consumed) { /* in shift sequence, no more to follow */
3918 /* if we're in an inconsistent state, that's an error */
3919 if (surrogate ||
3920 (base64bits >= 6) ||
3921 (base64bits > 0 && base64buffer != 0)) {
3922 outpos = p-PyUnicode_AS_UNICODE(unicode);
3923 endinpos = size;
3924 if (unicode_decode_call_errorhandler(
3925 errors, &errorHandler,
3926 "utf7", "unterminated shift sequence",
3927 &starts, &e, &startinpos, &endinpos, &exc, &s,
3928 &unicode, &outpos, &p))
3929 goto onError;
3930 if (s < e)
3931 goto restart;
3932 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003933 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003934
3935 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003936 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003937 if (inShift) {
3938 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003939 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003940 }
3941 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003942 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003943 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003944 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003945
Victor Stinnerfe226c02011-10-03 03:52:20 +02003946 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003947 goto onError;
3948
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003949 Py_XDECREF(errorHandler);
3950 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02003951#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02003952 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003953 Py_DECREF(unicode);
3954 return NULL;
3955 }
Victor Stinner17efeed2011-10-04 20:05:46 +02003956#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02003957 assert(_PyUnicode_CheckConsistency(unicode, 1));
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003958 return (PyObject *)unicode;
3959
Benjamin Peterson29060642009-01-31 22:14:21 +00003960 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003961 Py_XDECREF(errorHandler);
3962 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003963 Py_DECREF(unicode);
3964 return NULL;
3965}
3966
3967
Alexander Belopolsky40018472011-02-26 01:02:56 +00003968PyObject *
3969PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003970 Py_ssize_t size,
3971 int base64SetO,
3972 int base64WhiteSpace,
3973 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003974{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003975 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003976 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003977 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003978 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003979 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003980 unsigned int base64bits = 0;
3981 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003982 char * out;
3983 char * start;
3984
3985 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003986 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003987
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003988 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003989 return PyErr_NoMemory();
3990
Antoine Pitrou244651a2009-05-04 18:56:13 +00003991 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003992 if (v == NULL)
3993 return NULL;
3994
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003995 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003996 for (;i < size; ++i) {
3997 Py_UNICODE ch = s[i];
3998
Antoine Pitrou244651a2009-05-04 18:56:13 +00003999 if (inShift) {
4000 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4001 /* shifting out */
4002 if (base64bits) { /* output remaining bits */
4003 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4004 base64buffer = 0;
4005 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004006 }
4007 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004008 /* Characters not in the BASE64 set implicitly unshift the sequence
4009 so no '-' is required, except if the character is itself a '-' */
4010 if (IS_BASE64(ch) || ch == '-') {
4011 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004012 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004013 *out++ = (char) ch;
4014 }
4015 else {
4016 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004017 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004018 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004019 else { /* not in a shift sequence */
4020 if (ch == '+') {
4021 *out++ = '+';
4022 *out++ = '-';
4023 }
4024 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4025 *out++ = (char) ch;
4026 }
4027 else {
4028 *out++ = '+';
4029 inShift = 1;
4030 goto encode_char;
4031 }
4032 }
4033 continue;
4034encode_char:
4035#ifdef Py_UNICODE_WIDE
4036 if (ch >= 0x10000) {
4037 /* code first surrogate */
4038 base64bits += 16;
4039 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
4040 while (base64bits >= 6) {
4041 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4042 base64bits -= 6;
4043 }
4044 /* prepare second surrogate */
4045 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
4046 }
4047#endif
4048 base64bits += 16;
4049 base64buffer = (base64buffer << 16) | ch;
4050 while (base64bits >= 6) {
4051 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4052 base64bits -= 6;
4053 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004054 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004055 if (base64bits)
4056 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4057 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004058 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004059 if (_PyBytes_Resize(&v, out - start) < 0)
4060 return NULL;
4061 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004062}
4063
Antoine Pitrou244651a2009-05-04 18:56:13 +00004064#undef IS_BASE64
4065#undef FROM_BASE64
4066#undef TO_BASE64
4067#undef DECODE_DIRECT
4068#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004069
Guido van Rossumd57fd912000-03-10 22:53:23 +00004070/* --- UTF-8 Codec -------------------------------------------------------- */
4071
static
const char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  Zero means
       illegal prefix.  See RFC 3629 for details.  The table is only
       ever read, hence it is declared const. */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
4093
Alexander Belopolsky40018472011-02-26 01:02:56 +00004094PyObject *
4095PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004096 Py_ssize_t size,
4097 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004098{
Walter Dörwald69652032004-09-07 20:24:22 +00004099 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4100}
4101
/* Mask to check or force alignment of a pointer to C 'long' boundaries.
   The expansion is fully parenthesized so that it behaves as a single
   operand wherever it is used. */
#define LONG_PTR_MASK ((size_t) (SIZEOF_LONG - 1))
4104
4105/* Mask to quickly check whether a C 'long' contains a
4106 non-ASCII, UTF8-encoded char. */
4107#if (SIZEOF_LONG == 8)
4108# define ASCII_CHAR_MASK 0x8080808080808080L
4109#elif (SIZEOF_LONG == 4)
4110# define ASCII_CHAR_MASK 0x80808080L
4111#else
4112# error C 'long' size should be either 4 or 8!
4113#endif
4114
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004115/* Scans a UTF-8 string and returns the maximum character to be expected,
4116 the size of the decoded unicode string and if any major errors were
4117 encountered.
4118
4119 This function does check basic UTF-8 sanity, it does however NOT CHECK
4120 if the string contains surrogates, and if all continuation bytes are
4121 within the correct ranges, these checks are performed in
4122 PyUnicode_DecodeUTF8Stateful.
4123
4124 If it sets has_errors to 1, it means the value of unicode_size and max_char
4125 will be bogus and you should not rely on useful information in them.
4126 */
4127static Py_UCS4
4128utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
4129 Py_ssize_t *unicode_size, Py_ssize_t* consumed,
4130 int *has_errors)
4131{
4132 Py_ssize_t n;
4133 Py_ssize_t char_count = 0;
4134 Py_UCS4 max_char = 127, new_max;
4135 Py_UCS4 upper_bound;
4136 const unsigned char *p = (const unsigned char *)s;
4137 const unsigned char *end = p + string_size;
4138 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
4139 int err = 0;
4140
4141 for (; p < end && !err; ++p, ++char_count) {
4142 /* Only check value if it's not a ASCII char... */
4143 if (*p < 0x80) {
4144 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
4145 an explanation. */
4146 if (!((size_t) p & LONG_PTR_MASK)) {
4147 /* Help register allocation */
4148 register const unsigned char *_p = p;
4149 while (_p < aligned_end) {
4150 unsigned long value = *(unsigned long *) _p;
4151 if (value & ASCII_CHAR_MASK)
4152 break;
4153 _p += SIZEOF_LONG;
4154 char_count += SIZEOF_LONG;
4155 }
4156 p = _p;
4157 if (p == end)
4158 break;
4159 }
4160 }
4161 if (*p >= 0x80) {
4162 n = utf8_code_length[*p];
4163 new_max = max_char;
4164 switch (n) {
4165 /* invalid start byte */
4166 case 0:
4167 err = 1;
4168 break;
4169 case 2:
4170 /* Code points between 0x00FF and 0x07FF inclusive.
4171 Approximate the upper bound of the code point,
4172 if this flips over 255 we can be sure it will be more
4173 than 255 and the string will need 2 bytes per code coint,
4174 if it stays under or equal to 255, we can be sure 1 byte
4175 is enough.
4176 ((*p & 0b00011111) << 6) | 0b00111111 */
4177 upper_bound = ((*p & 0x1F) << 6) | 0x3F;
4178 if (max_char < upper_bound)
4179 new_max = upper_bound;
4180 /* Ensure we track at least that we left ASCII space. */
4181 if (new_max < 128)
4182 new_max = 128;
4183 break;
4184 case 3:
4185 /* Between 0x0FFF and 0xFFFF inclusive, so values are
4186 always > 255 and <= 65535 and will always need 2 bytes. */
4187 if (max_char < 65535)
4188 new_max = 65535;
4189 break;
4190 case 4:
4191 /* Code point will be above 0xFFFF for sure in this case. */
4192 new_max = 65537;
4193 break;
4194 /* Internal error, this should be caught by the first if */
4195 case 1:
4196 default:
4197 assert(0 && "Impossible case in utf8_max_char_and_size");
4198 err = 1;
4199 }
4200 /* Instead of number of overall bytes for this code point,
Georg Brandl7597add2011-10-05 16:36:47 +02004201 n contains the number of following bytes: */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004202 --n;
4203 /* Check if the follow up chars are all valid continuation bytes */
4204 if (n >= 1) {
4205 const unsigned char *cont;
4206 if ((p + n) >= end) {
4207 if (consumed == 0)
4208 /* incomplete data, non-incremental decoding */
4209 err = 1;
4210 break;
4211 }
4212 for (cont = p + 1; cont < (p + n); ++cont) {
4213 if ((*cont & 0xc0) != 0x80) {
4214 err = 1;
4215 break;
4216 }
4217 }
4218 p += n;
4219 }
4220 else
4221 err = 1;
4222 max_char = new_max;
4223 }
4224 }
4225
4226 if (unicode_size)
4227 *unicode_size = char_count;
4228 if (has_errors)
4229 *has_errors = err;
4230 return max_char;
4231}
4232
/* Store value at position index of the character buffer buf, casting
   both the buffer and the value to the element type selected by kind:
   Py_UNICODE for PyUnicode_WCHAR_KIND (the wstr field of the legacy
   representation), unsigned char for PyUnicode_1BYTE_KIND, Py_UCS2 for
   PyUnicode_2BYTE_KIND and Py_UCS4 for any other kind.  Each argument
   is evaluated exactly once. */
#define WRITE_FLEXIBLE_OR_WSTR(kind, buf, index, value) \
    do { \
        const int k_ = (kind); \
        if (k_ == PyUnicode_WCHAR_KIND) { \
            Py_UNICODE *wstr_ = (Py_UNICODE *)(buf); \
            wstr_[(index)] = (Py_UNICODE)(value); \
        } \
        else if (k_ == PyUnicode_1BYTE_KIND) { \
            unsigned char *ucs1_ = (unsigned char *)(buf); \
            ucs1_[(index)] = (unsigned char)(value); \
        } \
        else if (k_ == PyUnicode_2BYTE_KIND) { \
            Py_UCS2 *ucs2_ = (Py_UCS2 *)(buf); \
            ucs2_[(index)] = (Py_UCS2)(value); \
        } \
        else { \
            Py_UCS4 *ucs4_ = (Py_UCS4 *)(buf); \
            ucs4_[(index)] = (Py_UCS4)(value); \
        } \
    } while (0)
4247
Alexander Belopolsky40018472011-02-26 01:02:56 +00004248PyObject *
4249PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004250 Py_ssize_t size,
4251 const char *errors,
4252 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00004253{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004254 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004255 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00004256 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004257 Py_ssize_t startinpos;
4258 Py_ssize_t endinpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00004259 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004260 PyUnicodeObject *unicode;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004261 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004262 PyObject *errorHandler = NULL;
4263 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004264 Py_UCS4 maxchar = 0;
4265 Py_ssize_t unicode_size;
4266 Py_ssize_t i;
4267 int kind;
4268 void *data;
4269 int has_errors;
4270 Py_UNICODE *error_outptr;
4271#if SIZEOF_WCHAR_T == 2
4272 Py_ssize_t wchar_offset = 0;
4273#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00004274
Walter Dörwald69652032004-09-07 20:24:22 +00004275 if (size == 0) {
4276 if (consumed)
4277 *consumed = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004278 return (PyObject *)PyUnicode_New(0, 0);
Walter Dörwald69652032004-09-07 20:24:22 +00004279 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004280 maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size,
4281 consumed, &has_errors);
4282 if (has_errors) {
4283 unicode = _PyUnicode_New(size);
4284 if (!unicode)
4285 return NULL;
4286 kind = PyUnicode_WCHAR_KIND;
4287 data = PyUnicode_AS_UNICODE(unicode);
4288 assert(data != NULL);
4289 }
4290 else {
4291 unicode = (PyUnicodeObject *)PyUnicode_New(unicode_size, maxchar);
4292 if (!unicode)
4293 return NULL;
4294 /* When the string is ASCII only, just use memcpy and return.
4295 unicode_size may be != size if there is an incomplete UTF-8
4296 sequence at the end of the ASCII block. */
4297 if (maxchar < 128 && size == unicode_size) {
4298 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
4299 return (PyObject *)unicode;
4300 }
4301 kind = PyUnicode_KIND(unicode);
4302 data = PyUnicode_DATA(unicode);
4303 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004304 /* Unpack UTF-8 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004305 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004306 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00004307 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004308
4309 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004310 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004311
4312 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00004313 /* Fast path for runs of ASCII characters. Given that common UTF-8
4314 input will consist of an overwhelming majority of ASCII
4315 characters, we try to optimize for this case by checking
4316 as many characters as a C 'long' can contain.
4317 First, check if we can do an aligned read, as most CPUs have
4318 a penalty for unaligned reads.
4319 */
4320 if (!((size_t) s & LONG_PTR_MASK)) {
4321 /* Help register allocation */
4322 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004323 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004324 while (_s < aligned_end) {
4325 /* Read a whole long at a time (either 4 or 8 bytes),
4326 and do a fast unrolled copy if it only contains ASCII
4327 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004328 unsigned long value = *(unsigned long *) _s;
4329 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00004330 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004331 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+0, _s[0]);
4332 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+1, _s[1]);
4333 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+2, _s[2]);
4334 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004335#if (SIZEOF_LONG == 8)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004336 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+4, _s[4]);
4337 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+5, _s[5]);
4338 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+6, _s[6]);
4339 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004340#endif
4341 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004342 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00004343 }
4344 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004345 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004346 if (s == e)
4347 break;
4348 ch = (unsigned char)*s;
4349 }
4350 }
4351
4352 if (ch < 0x80) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004353 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004354 s++;
4355 continue;
4356 }
4357
4358 n = utf8_code_length[ch];
4359
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004360 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004361 if (consumed)
4362 break;
4363 else {
4364 errmsg = "unexpected end of data";
4365 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004366 endinpos = startinpos+1;
4367 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
4368 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004369 goto utf8Error;
4370 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004371 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004372
4373 switch (n) {
4374
4375 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00004376 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004377 startinpos = s-starts;
4378 endinpos = startinpos+1;
4379 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004380
4381 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004382 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00004383 startinpos = s-starts;
4384 endinpos = startinpos+1;
4385 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004386
4387 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004388 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00004389 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004390 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004391 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00004392 goto utf8Error;
4393 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004394 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004395 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004396 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004397 break;
4398
4399 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00004400 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4401 will result in surrogates in range d800-dfff. Surrogates are
4402 not valid UTF-8 so they are rejected.
4403 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4404 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00004405 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004406 (s[2] & 0xc0) != 0x80 ||
4407 ((unsigned char)s[0] == 0xE0 &&
4408 (unsigned char)s[1] < 0xA0) ||
4409 ((unsigned char)s[0] == 0xED &&
4410 (unsigned char)s[1] > 0x9F)) {
4411 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004412 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004413 endinpos = startinpos + 1;
4414
4415 /* if s[1] first two bits are 1 and 0, then the invalid
4416 continuation byte is s[2], so increment endinpos by 1,
4417 if not, s[1] is invalid and endinpos doesn't need to
4418 be incremented. */
4419 if ((s[1] & 0xC0) == 0x80)
4420 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004421 goto utf8Error;
4422 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004423 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004424 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004425 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004426 break;
4427
4428 case 4:
4429 if ((s[1] & 0xc0) != 0x80 ||
4430 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004431 (s[3] & 0xc0) != 0x80 ||
4432 ((unsigned char)s[0] == 0xF0 &&
4433 (unsigned char)s[1] < 0x90) ||
4434 ((unsigned char)s[0] == 0xF4 &&
4435 (unsigned char)s[1] > 0x8F)) {
4436 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004437 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004438 endinpos = startinpos + 1;
4439 if ((s[1] & 0xC0) == 0x80) {
4440 endinpos++;
4441 if ((s[2] & 0xC0) == 0x80)
4442 endinpos++;
4443 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004444 goto utf8Error;
4445 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004446 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00004447 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4448 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4449
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004450 /* If the string is flexible or we have native UCS-4, write
4451 directly.. */
4452 if (sizeof(Py_UNICODE) > 2 || kind != PyUnicode_WCHAR_KIND)
4453 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Tim Petersced69f82003-09-16 20:30:58 +00004454
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004455 else {
4456 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00004457
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004458 /* translate from 10000..10FFFF to 0..FFFF */
4459 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00004460
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004461 /* high surrogate = top 10 bits added to D800 */
4462 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4463 (Py_UNICODE)(0xD800 + (ch >> 10)));
4464
4465 /* low surrogate = bottom 10 bits added to DC00 */
4466 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4467 (Py_UNICODE)(0xDC00 + (ch & 0x03FF)));
4468 }
4469#if SIZEOF_WCHAR_T == 2
4470 wchar_offset++;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004471#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00004472 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004473 }
4474 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00004475 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004476
Benjamin Peterson29060642009-01-31 22:14:21 +00004477 utf8Error:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004478 /* If this is not yet a resizable string, make it one.. */
4479 if (kind != PyUnicode_WCHAR_KIND) {
4480 const Py_UNICODE *u;
4481 PyUnicodeObject *new_unicode = _PyUnicode_New(size);
4482 if (!new_unicode)
4483 goto onError;
4484 u = PyUnicode_AsUnicode((PyObject *)unicode);
4485 if (!u)
4486 goto onError;
4487#if SIZEOF_WCHAR_T == 2
4488 i += wchar_offset;
4489#endif
4490 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(new_unicode), u, i);
4491 Py_DECREF(unicode);
4492 unicode = new_unicode;
4493 kind = 0;
4494 data = PyUnicode_AS_UNICODE(new_unicode);
4495 assert(data != NULL);
4496 }
4497 error_outptr = PyUnicode_AS_UNICODE(unicode) + i;
Benjamin Peterson29060642009-01-31 22:14:21 +00004498 if (unicode_decode_call_errorhandler(
4499 errors, &errorHandler,
4500 "utf8", errmsg,
4501 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004502 &unicode, &i, &error_outptr))
Benjamin Peterson29060642009-01-31 22:14:21 +00004503 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004504 /* Update data because unicode_decode_call_errorhandler might have
4505 re-created or resized the unicode object. */
4506 data = PyUnicode_AS_UNICODE(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00004507 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004508 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004509 /* Ensure the unicode_size calculation above was correct: */
4510 assert(kind == PyUnicode_WCHAR_KIND || i == unicode_size);
4511
Walter Dörwald69652032004-09-07 20:24:22 +00004512 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004513 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004514
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004515 /* Adjust length and ready string when it contained errors and
4516 is of the old resizable kind. */
4517 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004518 if (PyUnicode_Resize((PyObject**)&unicode, i) < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004519 goto onError;
4520 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004521
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004522 Py_XDECREF(errorHandler);
4523 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02004524#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004525 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004526 Py_DECREF(unicode);
4527 return NULL;
4528 }
Victor Stinner17efeed2011-10-04 20:05:46 +02004529#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02004530 assert(_PyUnicode_CheckConsistency(unicode, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004531 return (PyObject *)unicode;
4532
Benjamin Peterson29060642009-01-31 22:14:21 +00004533 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004534 Py_XDECREF(errorHandler);
4535 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004536 Py_DECREF(unicode);
4537 return NULL;
4538}
4539
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004540#undef WRITE_FLEXIBLE_OR_WSTR
Antoine Pitrouab868312009-01-10 15:40:25 +00004541
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004542#ifdef __APPLE__
4543
4544/* Simplified UTF-8 decoder using surrogateescape error handler,
4545 used to decode the command line arguments on Mac OS X. */
4546
4547wchar_t*
4548_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
4549{
4550 int n;
4551 const char *e;
4552 wchar_t *unicode, *p;
4553
4554 /* Note: size will always be longer than the resulting Unicode
4555 character count */
4556 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
4557 PyErr_NoMemory();
4558 return NULL;
4559 }
4560 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
4561 if (!unicode)
4562 return NULL;
4563
4564 /* Unpack UTF-8 encoded data */
4565 p = unicode;
4566 e = s + size;
4567 while (s < e) {
4568 Py_UCS4 ch = (unsigned char)*s;
4569
4570 if (ch < 0x80) {
4571 *p++ = (wchar_t)ch;
4572 s++;
4573 continue;
4574 }
4575
4576 n = utf8_code_length[ch];
4577 if (s + n > e) {
4578 goto surrogateescape;
4579 }
4580
4581 switch (n) {
4582 case 0:
4583 case 1:
4584 goto surrogateescape;
4585
4586 case 2:
4587 if ((s[1] & 0xc0) != 0x80)
4588 goto surrogateescape;
4589 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
4590 assert ((ch > 0x007F) && (ch <= 0x07FF));
4591 *p++ = (wchar_t)ch;
4592 break;
4593
4594 case 3:
4595 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4596 will result in surrogates in range d800-dfff. Surrogates are
4597 not valid UTF-8 so they are rejected.
4598 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4599 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
4600 if ((s[1] & 0xc0) != 0x80 ||
4601 (s[2] & 0xc0) != 0x80 ||
4602 ((unsigned char)s[0] == 0xE0 &&
4603 (unsigned char)s[1] < 0xA0) ||
4604 ((unsigned char)s[0] == 0xED &&
4605 (unsigned char)s[1] > 0x9F)) {
4606
4607 goto surrogateescape;
4608 }
4609 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
4610 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004611 *p++ = (wchar_t)ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004612 break;
4613
4614 case 4:
4615 if ((s[1] & 0xc0) != 0x80 ||
4616 (s[2] & 0xc0) != 0x80 ||
4617 (s[3] & 0xc0) != 0x80 ||
4618 ((unsigned char)s[0] == 0xF0 &&
4619 (unsigned char)s[1] < 0x90) ||
4620 ((unsigned char)s[0] == 0xF4 &&
4621 (unsigned char)s[1] > 0x8F)) {
4622 goto surrogateescape;
4623 }
4624 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
4625 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4626 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4627
4628#if SIZEOF_WCHAR_T == 4
4629 *p++ = (wchar_t)ch;
4630#else
4631 /* compute and append the two surrogates: */
4632
4633 /* translate from 10000..10FFFF to 0..FFFF */
4634 ch -= 0x10000;
4635
4636 /* high surrogate = top 10 bits added to D800 */
4637 *p++ = (wchar_t)(0xD800 + (ch >> 10));
4638
4639 /* low surrogate = bottom 10 bits added to DC00 */
4640 *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
4641#endif
4642 break;
4643 }
4644 s += n;
4645 continue;
4646
4647 surrogateescape:
4648 *p++ = 0xDC00 + ch;
4649 s++;
4650 }
4651 *p = L'\0';
4652 return unicode;
4653}
4654
4655#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004656
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004657/* Primary internal function which creates utf8 encoded bytes objects.
4658
4659 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004660 and allocate exactly as much space needed at the end. Else allocate the
4661 maximum possible needed (4 result bytes per Unicode character), and return
4662 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004663*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004664PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004665_PyUnicode_AsUTF8String(PyObject *obj, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004666{
Tim Peters602f7402002-04-27 18:03:26 +00004667#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00004668
Guido van Rossum98297ee2007-11-06 21:34:58 +00004669 Py_ssize_t i; /* index into s of next input byte */
4670 PyObject *result; /* result string object */
4671 char *p; /* next free byte in output buffer */
4672 Py_ssize_t nallocated; /* number of result bytes allocated */
4673 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00004674 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004675 PyObject *errorHandler = NULL;
4676 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004677 int kind;
4678 void *data;
4679 Py_ssize_t size;
4680 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
4681#if SIZEOF_WCHAR_T == 2
4682 Py_ssize_t wchar_offset = 0;
4683#endif
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004684
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004685 if (!PyUnicode_Check(unicode)) {
4686 PyErr_BadArgument();
4687 return NULL;
4688 }
4689
4690 if (PyUnicode_READY(unicode) == -1)
4691 return NULL;
4692
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004693 if (PyUnicode_UTF8(unicode))
4694 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4695 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004696
4697 kind = PyUnicode_KIND(unicode);
4698 data = PyUnicode_DATA(unicode);
4699 size = PyUnicode_GET_LENGTH(unicode);
4700
Tim Peters602f7402002-04-27 18:03:26 +00004701 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004702
Tim Peters602f7402002-04-27 18:03:26 +00004703 if (size <= MAX_SHORT_UNICHARS) {
4704 /* Write into the stack buffer; nallocated can't overflow.
4705 * At the end, we'll allocate exactly as much heap space as it
4706 * turns out we need.
4707 */
4708 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004709 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00004710 p = stackbuf;
4711 }
4712 else {
4713 /* Overallocate on the heap, and give the excess back at the end. */
4714 nallocated = size * 4;
4715 if (nallocated / 4 != size) /* overflow! */
4716 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00004717 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004718 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00004719 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00004720 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004721 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004722
Tim Peters602f7402002-04-27 18:03:26 +00004723 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004724 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004725
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004726 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00004727 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004728 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004729
Guido van Rossumd57fd912000-03-10 22:53:23 +00004730 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00004731 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00004732 *p++ = (char)(0xc0 | (ch >> 6));
4733 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00004734 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004735 Py_ssize_t newpos;
4736 PyObject *rep;
4737 Py_ssize_t repsize, k, startpos;
4738 startpos = i-1;
4739#if SIZEOF_WCHAR_T == 2
4740 startpos += wchar_offset;
Victor Stinner445a6232010-04-22 20:01:57 +00004741#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004742 rep = unicode_encode_call_errorhandler(
4743 errors, &errorHandler, "utf-8", "surrogates not allowed",
4744 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
4745 &exc, startpos, startpos+1, &newpos);
4746 if (!rep)
4747 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004748
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004749 if (PyBytes_Check(rep))
4750 repsize = PyBytes_GET_SIZE(rep);
4751 else
4752 repsize = PyUnicode_GET_SIZE(rep);
4753
4754 if (repsize > 4) {
4755 Py_ssize_t offset;
4756
4757 if (result == NULL)
4758 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004759 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004760 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004761
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004762 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4763 /* integer overflow */
4764 PyErr_NoMemory();
4765 goto error;
4766 }
4767 nallocated += repsize - 4;
4768 if (result != NULL) {
4769 if (_PyBytes_Resize(&result, nallocated) < 0)
4770 goto error;
4771 } else {
4772 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004773 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004774 goto error;
4775 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4776 }
4777 p = PyBytes_AS_STRING(result) + offset;
4778 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004779
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004780 if (PyBytes_Check(rep)) {
4781 char *prep = PyBytes_AS_STRING(rep);
4782 for(k = repsize; k > 0; k--)
4783 *p++ = *prep++;
4784 } else /* rep is unicode */ {
4785 const Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
4786 Py_UNICODE c;
4787
4788 for(k=0; k<repsize; k++) {
4789 c = prep[k];
4790 if (0x80 <= c) {
4791 raise_encode_exception(&exc, "utf-8",
4792 PyUnicode_AS_UNICODE(unicode),
4793 size, i-1, i,
4794 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004795 goto error;
4796 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004797 *p++ = (char)prep[k];
Victor Stinner31be90b2010-04-22 19:38:16 +00004798 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004799 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004800 Py_DECREF(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004801 } else if (ch < 0x10000) {
4802 *p++ = (char)(0xe0 | (ch >> 12));
4803 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4804 *p++ = (char)(0x80 | (ch & 0x3f));
4805 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00004806 /* Encode UCS4 Unicode ordinals */
4807 *p++ = (char)(0xf0 | (ch >> 18));
4808 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4809 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4810 *p++ = (char)(0x80 | (ch & 0x3f));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004811#if SIZEOF_WCHAR_T == 2
4812 wchar_offset++;
4813#endif
Tim Peters602f7402002-04-27 18:03:26 +00004814 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004815 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004816
Guido van Rossum98297ee2007-11-06 21:34:58 +00004817 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004818 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004819 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004820 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004821 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004822 }
4823 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004824 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004825 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004826 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004827 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004828 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004829
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004830 Py_XDECREF(errorHandler);
4831 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004832 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004833 error:
4834 Py_XDECREF(errorHandler);
4835 Py_XDECREF(exc);
4836 Py_XDECREF(result);
4837 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004838
Tim Peters602f7402002-04-27 18:03:26 +00004839#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004840}
4841
Alexander Belopolsky40018472011-02-26 01:02:56 +00004842PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004843PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4844 Py_ssize_t size,
4845 const char *errors)
4846{
4847 PyObject *v, *unicode;
4848
4849 unicode = PyUnicode_FromUnicode(s, size);
4850 if (unicode == NULL)
4851 return NULL;
4852 v = _PyUnicode_AsUTF8String(unicode, errors);
4853 Py_DECREF(unicode);
4854 return v;
4855}
4856
4857PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004858PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004859{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004860 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004861}
4862
Walter Dörwald41980ca2007-08-16 21:55:45 +00004863/* --- UTF-32 Codec ------------------------------------------------------- */
4864
4865PyObject *
4866PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004867 Py_ssize_t size,
4868 const char *errors,
4869 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004870{
4871 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4872}
4873
4874PyObject *
4875PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004876 Py_ssize_t size,
4877 const char *errors,
4878 int *byteorder,
4879 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004880{
4881 const char *starts = s;
4882 Py_ssize_t startinpos;
4883 Py_ssize_t endinpos;
4884 Py_ssize_t outpos;
4885 PyUnicodeObject *unicode;
4886 Py_UNICODE *p;
4887#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004888 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004889 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004890#else
4891 const int pairs = 0;
4892#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00004893 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004894 int bo = 0; /* assume native ordering by default */
4895 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004896 /* Offsets from q for retrieving bytes in the right order. */
4897#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4898 int iorder[] = {0, 1, 2, 3};
4899#else
4900 int iorder[] = {3, 2, 1, 0};
4901#endif
4902 PyObject *errorHandler = NULL;
4903 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004904
Walter Dörwald41980ca2007-08-16 21:55:45 +00004905 q = (unsigned char *)s;
4906 e = q + size;
4907
4908 if (byteorder)
4909 bo = *byteorder;
4910
4911 /* Check for BOM marks (U+FEFF) in the input and adjust current
4912 byte order setting accordingly. In native mode, the leading BOM
4913 mark is skipped, in all other modes, it is copied to the output
4914 stream as-is (giving a ZWNBSP character). */
4915 if (bo == 0) {
4916 if (size >= 4) {
4917 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004918 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004919#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004920 if (bom == 0x0000FEFF) {
4921 q += 4;
4922 bo = -1;
4923 }
4924 else if (bom == 0xFFFE0000) {
4925 q += 4;
4926 bo = 1;
4927 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004928#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004929 if (bom == 0x0000FEFF) {
4930 q += 4;
4931 bo = 1;
4932 }
4933 else if (bom == 0xFFFE0000) {
4934 q += 4;
4935 bo = -1;
4936 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004937#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004938 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004939 }
4940
4941 if (bo == -1) {
4942 /* force LE */
4943 iorder[0] = 0;
4944 iorder[1] = 1;
4945 iorder[2] = 2;
4946 iorder[3] = 3;
4947 }
4948 else if (bo == 1) {
4949 /* force BE */
4950 iorder[0] = 3;
4951 iorder[1] = 2;
4952 iorder[2] = 1;
4953 iorder[3] = 0;
4954 }
4955
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004956 /* On narrow builds we split characters outside the BMP into two
4957 codepoints => count how much extra space we need. */
4958#ifndef Py_UNICODE_WIDE
4959 for (qq = q; qq < e; qq += 4)
4960 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
4961 pairs++;
4962#endif
4963
4964 /* This might be one to much, because of a BOM */
4965 unicode = _PyUnicode_New((size+3)/4+pairs);
4966 if (!unicode)
4967 return NULL;
4968 if (size == 0)
4969 return (PyObject *)unicode;
4970
4971 /* Unpack UTF-32 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004972 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004973
Walter Dörwald41980ca2007-08-16 21:55:45 +00004974 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004975 Py_UCS4 ch;
4976 /* remaining bytes at the end? (size should be divisible by 4) */
4977 if (e-q<4) {
4978 if (consumed)
4979 break;
4980 errmsg = "truncated data";
4981 startinpos = ((const char *)q)-starts;
4982 endinpos = ((const char *)e)-starts;
4983 goto utf32Error;
4984 /* The remaining input chars are ignored if the callback
4985 chooses to skip the input */
4986 }
4987 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4988 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004989
Benjamin Peterson29060642009-01-31 22:14:21 +00004990 if (ch >= 0x110000)
4991 {
4992 errmsg = "codepoint not in range(0x110000)";
4993 startinpos = ((const char *)q)-starts;
4994 endinpos = startinpos+4;
4995 goto utf32Error;
4996 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004997#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004998 if (ch >= 0x10000)
4999 {
5000 *p++ = 0xD800 | ((ch-0x10000) >> 10);
5001 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
5002 }
5003 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00005004#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005005 *p++ = ch;
5006 q += 4;
5007 continue;
5008 utf32Error:
5009 outpos = p-PyUnicode_AS_UNICODE(unicode);
5010 if (unicode_decode_call_errorhandler(
5011 errors, &errorHandler,
5012 "utf32", errmsg,
5013 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
5014 &unicode, &outpos, &p))
5015 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005016 }
5017
5018 if (byteorder)
5019 *byteorder = bo;
5020
5021 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005022 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005023
5024 /* Adjust length */
Victor Stinnerfe226c02011-10-03 03:52:20 +02005025 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005026 goto onError;
5027
5028 Py_XDECREF(errorHandler);
5029 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02005030#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005031 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005032 Py_DECREF(unicode);
5033 return NULL;
5034 }
Victor Stinner17efeed2011-10-04 20:05:46 +02005035#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02005036 assert(_PyUnicode_CheckConsistency(unicode, 1));
Walter Dörwald41980ca2007-08-16 21:55:45 +00005037 return (PyObject *)unicode;
5038
Benjamin Peterson29060642009-01-31 22:14:21 +00005039 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00005040 Py_DECREF(unicode);
5041 Py_XDECREF(errorHandler);
5042 Py_XDECREF(exc);
5043 return NULL;
5044}
5045
5046PyObject *
5047PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005048 Py_ssize_t size,
5049 const char *errors,
5050 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005051{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005052 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005053 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005054 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005055#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005056 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005057#else
5058 const int pairs = 0;
5059#endif
5060 /* Offsets from p for storing byte pairs in the right order. */
5061#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5062 int iorder[] = {0, 1, 2, 3};
5063#else
5064 int iorder[] = {3, 2, 1, 0};
5065#endif
5066
Benjamin Peterson29060642009-01-31 22:14:21 +00005067#define STORECHAR(CH) \
5068 do { \
5069 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5070 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5071 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5072 p[iorder[0]] = (CH) & 0xff; \
5073 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00005074 } while(0)
5075
5076 /* In narrow builds we can output surrogate pairs as one codepoint,
5077 so we need less space. */
5078#ifndef Py_UNICODE_WIDE
5079 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00005080 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
5081 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
5082 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005083#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005084 nsize = (size - pairs + (byteorder == 0));
5085 bytesize = nsize * 4;
5086 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005087 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005088 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005089 if (v == NULL)
5090 return NULL;
5091
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005092 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005093 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005094 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005095 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005096 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005097
5098 if (byteorder == -1) {
5099 /* force LE */
5100 iorder[0] = 0;
5101 iorder[1] = 1;
5102 iorder[2] = 2;
5103 iorder[3] = 3;
5104 }
5105 else if (byteorder == 1) {
5106 /* force BE */
5107 iorder[0] = 3;
5108 iorder[1] = 2;
5109 iorder[2] = 1;
5110 iorder[3] = 0;
5111 }
5112
5113 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005114 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005115#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005116 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
5117 Py_UCS4 ch2 = *s;
5118 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
5119 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
5120 s++;
5121 size--;
5122 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005123 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005124#endif
5125 STORECHAR(ch);
5126 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005127
5128 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005129 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005130#undef STORECHAR
5131}
5132
Alexander Belopolsky40018472011-02-26 01:02:56 +00005133PyObject *
5134PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005135{
5136 if (!PyUnicode_Check(unicode)) {
5137 PyErr_BadArgument();
5138 return NULL;
5139 }
5140 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005141 PyUnicode_GET_SIZE(unicode),
5142 NULL,
5143 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005144}
5145
Guido van Rossumd57fd912000-03-10 22:53:23 +00005146/* --- UTF-16 Codec ------------------------------------------------------- */
5147
Tim Peters772747b2001-08-09 22:21:55 +00005148PyObject *
5149PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005150 Py_ssize_t size,
5151 const char *errors,
5152 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005153{
Walter Dörwald69652032004-09-07 20:24:22 +00005154 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5155}
5156
Antoine Pitrouab868312009-01-10 15:40:25 +00005157/* Two masks for fast checking of whether a C 'long' may contain
5158 UTF16-encoded surrogate characters. This is an efficient heuristic,
5159 assuming that non-surrogate characters with a code point >= 0x8000 are
5160 rare in most input.
5161 FAST_CHAR_MASK is used when the input is in native byte ordering,
5162 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00005163*/
Antoine Pitrouab868312009-01-10 15:40:25 +00005164#if (SIZEOF_LONG == 8)
5165# define FAST_CHAR_MASK 0x8000800080008000L
5166# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
5167#elif (SIZEOF_LONG == 4)
5168# define FAST_CHAR_MASK 0x80008000L
5169# define SWAPPED_FAST_CHAR_MASK 0x00800080L
5170#else
5171# error C 'long' size should be either 4 or 8!
5172#endif
5173
Walter Dörwald69652032004-09-07 20:24:22 +00005174PyObject *
5175PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005176 Py_ssize_t size,
5177 const char *errors,
5178 int *byteorder,
5179 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005180{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005181 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005182 Py_ssize_t startinpos;
5183 Py_ssize_t endinpos;
5184 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005185 PyUnicodeObject *unicode;
5186 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00005187 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00005188 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00005189 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005190 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00005191 /* Offsets from q for retrieving byte pairs in the right order. */
5192#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5193 int ihi = 1, ilo = 0;
5194#else
5195 int ihi = 0, ilo = 1;
5196#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005197 PyObject *errorHandler = NULL;
5198 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005199
5200 /* Note: size will always be longer than the resulting Unicode
5201 character count */
5202 unicode = _PyUnicode_New(size);
5203 if (!unicode)
5204 return NULL;
5205 if (size == 0)
5206 return (PyObject *)unicode;
5207
5208 /* Unpack UTF-16 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005209 p = PyUnicode_AS_UNICODE(unicode);
Tim Peters772747b2001-08-09 22:21:55 +00005210 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00005211 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005212
5213 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005214 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005215
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005216 /* Check for BOM marks (U+FEFF) in the input and adjust current
5217 byte order setting accordingly. In native mode, the leading BOM
5218 mark is skipped, in all other modes, it is copied to the output
5219 stream as-is (giving a ZWNBSP character). */
5220 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00005221 if (size >= 2) {
5222 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005223#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005224 if (bom == 0xFEFF) {
5225 q += 2;
5226 bo = -1;
5227 }
5228 else if (bom == 0xFFFE) {
5229 q += 2;
5230 bo = 1;
5231 }
Tim Petersced69f82003-09-16 20:30:58 +00005232#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005233 if (bom == 0xFEFF) {
5234 q += 2;
5235 bo = 1;
5236 }
5237 else if (bom == 0xFFFE) {
5238 q += 2;
5239 bo = -1;
5240 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005241#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005242 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005243 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005244
Tim Peters772747b2001-08-09 22:21:55 +00005245 if (bo == -1) {
5246 /* force LE */
5247 ihi = 1;
5248 ilo = 0;
5249 }
5250 else if (bo == 1) {
5251 /* force BE */
5252 ihi = 0;
5253 ilo = 1;
5254 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005255#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5256 native_ordering = ilo < ihi;
5257#else
5258 native_ordering = ilo > ihi;
5259#endif
Tim Peters772747b2001-08-09 22:21:55 +00005260
Antoine Pitrouab868312009-01-10 15:40:25 +00005261 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00005262 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005263 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00005264 /* First check for possible aligned read of a C 'long'. Unaligned
5265 reads are more expensive, better to defer to another iteration. */
5266 if (!((size_t) q & LONG_PTR_MASK)) {
5267 /* Fast path for runs of non-surrogate chars. */
5268 register const unsigned char *_q = q;
5269 Py_UNICODE *_p = p;
5270 if (native_ordering) {
5271 /* Native ordering is simple: as long as the input cannot
5272 possibly contain a surrogate char, do an unrolled copy
5273 of several 16-bit code points to the target object.
5274 The non-surrogate check is done on several input bytes
5275 at a time (as many as a C 'long' can contain). */
5276 while (_q < aligned_end) {
5277 unsigned long data = * (unsigned long *) _q;
5278 if (data & FAST_CHAR_MASK)
5279 break;
5280 _p[0] = ((unsigned short *) _q)[0];
5281 _p[1] = ((unsigned short *) _q)[1];
5282#if (SIZEOF_LONG == 8)
5283 _p[2] = ((unsigned short *) _q)[2];
5284 _p[3] = ((unsigned short *) _q)[3];
5285#endif
5286 _q += SIZEOF_LONG;
5287 _p += SIZEOF_LONG / 2;
5288 }
5289 }
5290 else {
5291 /* Byteswapped ordering is similar, but we must decompose
5292 the copy bytewise, and take care of zero'ing out the
5293 upper bytes if the target object is in 32-bit units
5294 (that is, in UCS-4 builds). */
5295 while (_q < aligned_end) {
5296 unsigned long data = * (unsigned long *) _q;
5297 if (data & SWAPPED_FAST_CHAR_MASK)
5298 break;
5299 /* Zero upper bytes in UCS-4 builds */
5300#if (Py_UNICODE_SIZE > 2)
5301 _p[0] = 0;
5302 _p[1] = 0;
5303#if (SIZEOF_LONG == 8)
5304 _p[2] = 0;
5305 _p[3] = 0;
5306#endif
5307#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00005308 /* Issue #4916; UCS-4 builds on big endian machines must
5309 fill the two last bytes of each 4-byte unit. */
5310#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
5311# define OFF 2
5312#else
5313# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00005314#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00005315 ((unsigned char *) _p)[OFF + 1] = _q[0];
5316 ((unsigned char *) _p)[OFF + 0] = _q[1];
5317 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
5318 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
5319#if (SIZEOF_LONG == 8)
5320 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
5321 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
5322 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
5323 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
5324#endif
5325#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00005326 _q += SIZEOF_LONG;
5327 _p += SIZEOF_LONG / 2;
5328 }
5329 }
5330 p = _p;
5331 q = _q;
5332 if (q >= e)
5333 break;
5334 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005335 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005336
Benjamin Peterson14339b62009-01-31 16:36:08 +00005337 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00005338
5339 if (ch < 0xD800 || ch > 0xDFFF) {
5340 *p++ = ch;
5341 continue;
5342 }
5343
5344 /* UTF-16 code pair: */
5345 if (q > e) {
5346 errmsg = "unexpected end of data";
5347 startinpos = (((const char *)q) - 2) - starts;
5348 endinpos = ((const char *)e) + 1 - starts;
5349 goto utf16Error;
5350 }
5351 if (0xD800 <= ch && ch <= 0xDBFF) {
5352 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
5353 q += 2;
5354 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00005355#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005356 *p++ = ch;
5357 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005358#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005359 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005360#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005361 continue;
5362 }
5363 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005364 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00005365 startinpos = (((const char *)q)-4)-starts;
5366 endinpos = startinpos+2;
5367 goto utf16Error;
5368 }
5369
Benjamin Peterson14339b62009-01-31 16:36:08 +00005370 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005371 errmsg = "illegal encoding";
5372 startinpos = (((const char *)q)-2)-starts;
5373 endinpos = startinpos+2;
5374 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005375
Benjamin Peterson29060642009-01-31 22:14:21 +00005376 utf16Error:
5377 outpos = p - PyUnicode_AS_UNICODE(unicode);
5378 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005379 errors,
5380 &errorHandler,
5381 "utf16", errmsg,
5382 &starts,
5383 (const char **)&e,
5384 &startinpos,
5385 &endinpos,
5386 &exc,
5387 (const char **)&q,
5388 &unicode,
5389 &outpos,
5390 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00005391 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005392 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005393 /* remaining byte at the end? (size should be even) */
5394 if (e == q) {
5395 if (!consumed) {
5396 errmsg = "truncated data";
5397 startinpos = ((const char *)q) - starts;
5398 endinpos = ((const char *)e) + 1 - starts;
5399 outpos = p - PyUnicode_AS_UNICODE(unicode);
5400 if (unicode_decode_call_errorhandler(
5401 errors,
5402 &errorHandler,
5403 "utf16", errmsg,
5404 &starts,
5405 (const char **)&e,
5406 &startinpos,
5407 &endinpos,
5408 &exc,
5409 (const char **)&q,
5410 &unicode,
5411 &outpos,
5412 &p))
5413 goto onError;
5414 /* The remaining input chars are ignored if the callback
5415 chooses to skip the input */
5416 }
5417 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005418
5419 if (byteorder)
5420 *byteorder = bo;
5421
Walter Dörwald69652032004-09-07 20:24:22 +00005422 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005423 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005424
Guido van Rossumd57fd912000-03-10 22:53:23 +00005425 /* Adjust length */
Victor Stinnerfe226c02011-10-03 03:52:20 +02005426 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005427 goto onError;
5428
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005429 Py_XDECREF(errorHandler);
5430 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02005431#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005432 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005433 Py_DECREF(unicode);
5434 return NULL;
5435 }
Victor Stinner17efeed2011-10-04 20:05:46 +02005436#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02005437 assert(_PyUnicode_CheckConsistency(unicode, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005438 return (PyObject *)unicode;
5439
Benjamin Peterson29060642009-01-31 22:14:21 +00005440 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005441 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005442 Py_XDECREF(errorHandler);
5443 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005444 return NULL;
5445}
5446
Antoine Pitrouab868312009-01-10 15:40:25 +00005447#undef FAST_CHAR_MASK
5448#undef SWAPPED_FAST_CHAR_MASK
5449
Tim Peters772747b2001-08-09 22:21:55 +00005450PyObject *
5451PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005452 Py_ssize_t size,
5453 const char *errors,
5454 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005455{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005456 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00005457 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005458 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005459#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005460 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005461#else
5462 const int pairs = 0;
5463#endif
Tim Peters772747b2001-08-09 22:21:55 +00005464 /* Offsets from p for storing byte pairs in the right order. */
5465#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5466 int ihi = 1, ilo = 0;
5467#else
5468 int ihi = 0, ilo = 1;
5469#endif
5470
Benjamin Peterson29060642009-01-31 22:14:21 +00005471#define STORECHAR(CH) \
5472 do { \
5473 p[ihi] = ((CH) >> 8) & 0xff; \
5474 p[ilo] = (CH) & 0xff; \
5475 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00005476 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005477
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005478#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005479 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00005480 if (s[i] >= 0x10000)
5481 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005482#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005483 /* 2 * (size + pairs + (byteorder == 0)) */
5484 if (size > PY_SSIZE_T_MAX ||
5485 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005486 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005487 nsize = size + pairs + (byteorder == 0);
5488 bytesize = nsize * 2;
5489 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005490 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005491 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005492 if (v == NULL)
5493 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005494
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005495 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005496 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005497 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005498 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005499 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005500
5501 if (byteorder == -1) {
5502 /* force LE */
5503 ihi = 1;
5504 ilo = 0;
5505 }
5506 else if (byteorder == 1) {
5507 /* force BE */
5508 ihi = 0;
5509 ilo = 1;
5510 }
5511
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005512 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005513 Py_UNICODE ch = *s++;
5514 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005515#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005516 if (ch >= 0x10000) {
5517 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
5518 ch = 0xD800 | ((ch-0x10000) >> 10);
5519 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005520#endif
Tim Peters772747b2001-08-09 22:21:55 +00005521 STORECHAR(ch);
5522 if (ch2)
5523 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005524 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005525
5526 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005527 return v;
Tim Peters772747b2001-08-09 22:21:55 +00005528#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005529}
5530
Alexander Belopolsky40018472011-02-26 01:02:56 +00005531PyObject *
5532PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005533{
5534 if (!PyUnicode_Check(unicode)) {
5535 PyErr_BadArgument();
5536 return NULL;
5537 }
5538 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005539 PyUnicode_GET_SIZE(unicode),
5540 NULL,
5541 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005542}
5543
5544/* --- Unicode Escape Codec ----------------------------------------------- */
5545
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005546/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
5547 if all the escapes in the string make it still a valid ASCII string.
5548 Returns -1 if any escapes were found which cause the string to
5549 pop out of ASCII range. Otherwise returns the length of the
5550 required buffer to hold the string.
5551 */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02005552static Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005553length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
5554{
5555 const unsigned char *p = (const unsigned char *)s;
5556 const unsigned char *end = p + size;
5557 Py_ssize_t length = 0;
5558
5559 if (size < 0)
5560 return -1;
5561
5562 for (; p < end; ++p) {
5563 if (*p > 127) {
5564 /* Non-ASCII */
5565 return -1;
5566 }
5567 else if (*p != '\\') {
5568 /* Normal character */
5569 ++length;
5570 }
5571 else {
5572 /* Backslash-escape, check next char */
5573 ++p;
5574 /* Escape sequence reaches till end of string or
5575 non-ASCII follow-up. */
5576 if (p >= end || *p > 127)
5577 return -1;
5578 switch (*p) {
5579 case '\n':
5580 /* backslash + \n result in zero characters */
5581 break;
5582 case '\\': case '\'': case '\"':
5583 case 'b': case 'f': case 't':
5584 case 'n': case 'r': case 'v': case 'a':
5585 ++length;
5586 break;
5587 case '0': case '1': case '2': case '3':
5588 case '4': case '5': case '6': case '7':
5589 case 'x': case 'u': case 'U': case 'N':
5590 /* these do not guarantee ASCII characters */
5591 return -1;
5592 default:
5593 /* count the backslash + the other character */
5594 length += 2;
5595 }
5596 }
5597 }
5598 return length;
5599}
5600
5601/* Similar to PyUnicode_WRITE but either write into wstr field
5602 or treat string as ASCII. */
5603#define WRITE_ASCII_OR_WSTR(kind, buf, index, value) \
5604 do { \
5605 if ((kind) != PyUnicode_WCHAR_KIND) \
5606 ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
5607 else \
5608 ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \
5609 } while (0)
5610
5611#define WRITE_WSTR(buf, index, value) \
5612 assert(kind == PyUnicode_WCHAR_KIND), \
5613 ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value)
5614
5615
Fredrik Lundh06d12682001-01-24 07:59:11 +00005616static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005617
Alexander Belopolsky40018472011-02-26 01:02:56 +00005618PyObject *
5619PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005620 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005621 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005622{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005623 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005624 Py_ssize_t startinpos;
5625 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005626 int j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005627 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005628 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005629 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005630 char* message;
5631 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005632 PyObject *errorHandler = NULL;
5633 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005634 Py_ssize_t ascii_length;
5635 Py_ssize_t i;
5636 int kind;
5637 void *data;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005638
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005639 ascii_length = length_of_escaped_ascii_string(s, size);
5640
5641 /* After length_of_escaped_ascii_string() there are two alternatives,
5642 either the string is pure ASCII with named escapes like \n, etc.
5643 and we determined it's exact size (common case)
5644 or it contains \x, \u, ... escape sequences. then we create a
5645 legacy wchar string and resize it at the end of this function. */
5646 if (ascii_length >= 0) {
5647 v = (PyUnicodeObject *)PyUnicode_New(ascii_length, 127);
5648 if (!v)
5649 goto onError;
5650 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
5651 kind = PyUnicode_1BYTE_KIND;
5652 data = PyUnicode_DATA(v);
5653 }
5654 else {
5655 /* Escaped strings will always be longer than the resulting
5656 Unicode string, so we start with size here and then reduce the
5657 length after conversion to the true value.
5658 (but if the error callback returns a long replacement string
5659 we'll have to allocate more space) */
5660 v = _PyUnicode_New(size);
5661 if (!v)
5662 goto onError;
5663 kind = PyUnicode_WCHAR_KIND;
5664 data = PyUnicode_AS_UNICODE(v);
5665 }
5666
Guido van Rossumd57fd912000-03-10 22:53:23 +00005667 if (size == 0)
5668 return (PyObject *)v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005669 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005670 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005671
Guido van Rossumd57fd912000-03-10 22:53:23 +00005672 while (s < end) {
5673 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005674 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005675 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005676
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005677 if (kind == PyUnicode_WCHAR_KIND) {
5678 assert(i < _PyUnicode_WSTR_LENGTH(v));
5679 }
5680 else {
5681 /* The only case in which i == ascii_length is a backslash
5682 followed by a newline. */
5683 assert(i <= ascii_length);
5684 }
5685
Guido van Rossumd57fd912000-03-10 22:53:23 +00005686 /* Non-escape characters are interpreted as Unicode ordinals */
5687 if (*s != '\\') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005688 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char) *s++);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005689 continue;
5690 }
5691
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005692 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005693 /* \ - Escapes */
5694 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005695 c = *s++;
5696 if (s > end)
5697 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005698
5699 if (kind == PyUnicode_WCHAR_KIND) {
5700 assert(i < _PyUnicode_WSTR_LENGTH(v));
5701 }
5702 else {
5703 /* The only case in which i == ascii_length is a backslash
5704 followed by a newline. */
5705 assert(i < ascii_length || (i == ascii_length && c == '\n'));
5706 }
5707
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005708 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005709
Benjamin Peterson29060642009-01-31 22:14:21 +00005710 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005711 case '\n': break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005712 case '\\': WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); break;
5713 case '\'': WRITE_ASCII_OR_WSTR(kind, data, i++, '\''); break;
5714 case '\"': WRITE_ASCII_OR_WSTR(kind, data, i++, '\"'); break;
5715 case 'b': WRITE_ASCII_OR_WSTR(kind, data, i++, '\b'); break;
5716 /* FF */
5717 case 'f': WRITE_ASCII_OR_WSTR(kind, data, i++, '\014'); break;
5718 case 't': WRITE_ASCII_OR_WSTR(kind, data, i++, '\t'); break;
5719 case 'n': WRITE_ASCII_OR_WSTR(kind, data, i++, '\n'); break;
5720 case 'r': WRITE_ASCII_OR_WSTR(kind, data, i++, '\r'); break;
5721 /* VT */
5722 case 'v': WRITE_ASCII_OR_WSTR(kind, data, i++, '\013'); break;
5723 /* BEL, not classic C */
5724 case 'a': WRITE_ASCII_OR_WSTR(kind, data, i++, '\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005725
Benjamin Peterson29060642009-01-31 22:14:21 +00005726 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005727 case '0': case '1': case '2': case '3':
5728 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005729 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005730 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005731 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005732 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005733 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005734 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005735 WRITE_WSTR(data, i++, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005736 break;
5737
Benjamin Peterson29060642009-01-31 22:14:21 +00005738 /* hex escapes */
5739 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005740 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005741 digits = 2;
5742 message = "truncated \\xXX escape";
5743 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005744
Benjamin Peterson29060642009-01-31 22:14:21 +00005745 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005746 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005747 digits = 4;
5748 message = "truncated \\uXXXX escape";
5749 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005750
Benjamin Peterson29060642009-01-31 22:14:21 +00005751 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005752 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005753 digits = 8;
5754 message = "truncated \\UXXXXXXXX escape";
5755 hexescape:
5756 chr = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005757 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005758 if (s+digits>end) {
5759 endinpos = size;
5760 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005761 errors, &errorHandler,
5762 "unicodeescape", "end of string in escape sequence",
5763 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005764 &v, &i, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005765 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005766 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005767 goto nextByte;
5768 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005769 for (j = 0; j < digits; ++j) {
5770 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005771 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005772 endinpos = (s+j+1)-starts;
5773 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005774 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005775 errors, &errorHandler,
5776 "unicodeescape", message,
5777 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005778 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005779 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005780 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005781 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005782 }
5783 chr = (chr<<4) & ~0xF;
5784 if (c >= '0' && c <= '9')
5785 chr += c - '0';
5786 else if (c >= 'a' && c <= 'f')
5787 chr += 10 + c - 'a';
5788 else
5789 chr += 10 + c - 'A';
5790 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005791 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005792 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005793 /* _decoding_error will have already written into the
5794 target buffer. */
5795 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005796 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005797 /* when we get here, chr is a 32-bit unicode character */
5798 if (chr <= 0xffff)
5799 /* UCS-2 character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005800 WRITE_WSTR(data, i++, chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005801 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005802 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00005803 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00005804#ifdef Py_UNICODE_WIDE
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005805 WRITE_WSTR(data, i++, chr);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005806#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00005807 chr -= 0x10000L;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005808 WRITE_WSTR(data, i++, 0xD800 + (Py_UNICODE) (chr >> 10));
5809 WRITE_WSTR(data, i++, 0xDC00 + (Py_UNICODE) (chr & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005810#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00005811 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005812 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005813 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005814 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005815 errors, &errorHandler,
5816 "unicodeescape", "illegal Unicode character",
5817 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005818 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005819 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005820 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005821 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005822 break;
5823
Benjamin Peterson29060642009-01-31 22:14:21 +00005824 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005825 case 'N':
5826 message = "malformed \\N character escape";
5827 if (ucnhash_CAPI == NULL) {
5828 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005829 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5830 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005831 if (ucnhash_CAPI == NULL)
5832 goto ucnhashError;
5833 }
5834 if (*s == '{') {
5835 const char *start = s+1;
5836 /* look for the closing brace */
5837 while (*s != '}' && s < end)
5838 s++;
5839 if (s > start && s < end && *s == '}') {
5840 /* found a name. look it up in the unicode database */
5841 message = "unknown Unicode character name";
5842 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005843 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
5844 &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005845 goto store;
5846 }
5847 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005848 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005849 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005850 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005851 errors, &errorHandler,
5852 "unicodeescape", message,
5853 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005854 &v, &i, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005855 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005856 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005857 break;
5858
5859 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005860 if (s > end) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005861 assert(kind == PyUnicode_WCHAR_KIND);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005862 message = "\\ at end of string";
5863 s--;
5864 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005865 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005866 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005867 errors, &errorHandler,
5868 "unicodeescape", message,
5869 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005870 &v, &i, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00005871 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005872 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005873 }
5874 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005875 WRITE_ASCII_OR_WSTR(kind, data, i++, '\\');
5876 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005877 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005878 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005879 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005880 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005881 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005882 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005883 /* Ensure the length prediction worked in case of ASCII strings */
5884 assert(kind == PyUnicode_WCHAR_KIND || i == ascii_length);
5885
Victor Stinnerfe226c02011-10-03 03:52:20 +02005886 if (kind == PyUnicode_WCHAR_KIND)
5887 {
5888 if (PyUnicode_Resize((PyObject**)&v, i) < 0)
5889 goto onError;
Victor Stinnerfe226c02011-10-03 03:52:20 +02005890 }
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005891 Py_XDECREF(errorHandler);
5892 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02005893#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005894 if (_PyUnicode_READY_REPLACE(&v)) {
5895 Py_DECREF(v);
5896 return NULL;
5897 }
Victor Stinner17efeed2011-10-04 20:05:46 +02005898#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02005899 assert(_PyUnicode_CheckConsistency(v, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005900 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00005901
Benjamin Peterson29060642009-01-31 22:14:21 +00005902 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005903 PyErr_SetString(
5904 PyExc_UnicodeError,
5905 "\\N escapes not supported (can't load unicodedata module)"
5906 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005907 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005908 Py_XDECREF(errorHandler);
5909 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005910 return NULL;
5911
Benjamin Peterson29060642009-01-31 22:14:21 +00005912 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005913 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005914 Py_XDECREF(errorHandler);
5915 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005916 return NULL;
5917}
5918
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005919#undef WRITE_ASCII_OR_WSTR
5920#undef WRITE_WSTR
5921
/* Return a Unicode-Escape encoded bytes version of a Py_UNICODE buffer.

   The result holds only the escaped characters; no surrounding quotes
   are added.

*/
5928
/* Lowercase hexadecimal digits, indexed by nibble value (0-15).  The escape
   encoders in this file use it to emit the digits of \xhh, \uxxxx and
   \Uxxxxxxxx escapes. */
Walter Dörwald79e913e2007-05-12 11:08:06 +00005929static const char *hexdigits = "0123456789abcdef";
5930
Alexander Belopolsky40018472011-02-26 01:02:56 +00005931PyObject *
5932PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005933 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005934{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005935 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005936 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005937
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005938#ifdef Py_UNICODE_WIDE
5939 const Py_ssize_t expandsize = 10;
5940#else
5941 const Py_ssize_t expandsize = 6;
5942#endif
5943
Thomas Wouters89f507f2006-12-13 04:49:30 +00005944 /* XXX(nnorwitz): rather than over-allocating, it would be
5945 better to choose a different scheme. Perhaps scan the
5946 first N-chars of the string and allocate based on that size.
5947 */
5948 /* Initial allocation is based on the longest-possible unichr
5949 escape.
5950
5951 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5952 unichr, so in this case it's the longest unichr escape. In
5953 narrow (UTF-16) builds this is five chars per source unichr
5954 since there are two unichrs in the surrogate pair, so in narrow
5955 (UTF-16) builds it's not the longest unichr escape.
5956
5957 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5958 so in the narrow (UTF-16) build case it's the longest unichr
5959 escape.
5960 */
5961
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005962 if (size == 0)
5963 return PyBytes_FromStringAndSize(NULL, 0);
5964
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005965 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005966 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005967
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005968 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005969 2
5970 + expandsize*size
5971 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005972 if (repr == NULL)
5973 return NULL;
5974
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005975 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005976
Guido van Rossumd57fd912000-03-10 22:53:23 +00005977 while (size-- > 0) {
5978 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005979
Walter Dörwald79e913e2007-05-12 11:08:06 +00005980 /* Escape backslashes */
5981 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005982 *p++ = '\\';
5983 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005984 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005985 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005986
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00005987#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005988 /* Map 21-bit characters to '\U00xxxxxx' */
5989 else if (ch >= 0x10000) {
5990 *p++ = '\\';
5991 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005992 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
5993 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
5994 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
5995 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
5996 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
5997 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
5998 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
5999 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00006000 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006001 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00006002#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006003 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
6004 else if (ch >= 0xD800 && ch < 0xDC00) {
6005 Py_UNICODE ch2;
6006 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00006007
Benjamin Peterson29060642009-01-31 22:14:21 +00006008 ch2 = *s++;
6009 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00006010 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006011 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
6012 *p++ = '\\';
6013 *p++ = 'U';
6014 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
6015 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
6016 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
6017 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
6018 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
6019 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
6020 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
6021 *p++ = hexdigits[ucs & 0x0000000F];
6022 continue;
6023 }
6024 /* Fall through: isolated surrogates are copied as-is */
6025 s--;
6026 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006027 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00006028#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00006029
Guido van Rossumd57fd912000-03-10 22:53:23 +00006030 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00006031 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006032 *p++ = '\\';
6033 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00006034 *p++ = hexdigits[(ch >> 12) & 0x000F];
6035 *p++ = hexdigits[(ch >> 8) & 0x000F];
6036 *p++ = hexdigits[(ch >> 4) & 0x000F];
6037 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006038 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006039
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006040 /* Map special whitespace to '\t', \n', '\r' */
6041 else if (ch == '\t') {
6042 *p++ = '\\';
6043 *p++ = 't';
6044 }
6045 else if (ch == '\n') {
6046 *p++ = '\\';
6047 *p++ = 'n';
6048 }
6049 else if (ch == '\r') {
6050 *p++ = '\\';
6051 *p++ = 'r';
6052 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006053
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006054 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00006055 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006056 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006057 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00006058 *p++ = hexdigits[(ch >> 4) & 0x000F];
6059 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00006060 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006061
Guido van Rossumd57fd912000-03-10 22:53:23 +00006062 /* Copy everything else as-is */
6063 else
6064 *p++ = (char) ch;
6065 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006066
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006067 assert(p - PyBytes_AS_STRING(repr) > 0);
6068 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
6069 return NULL;
6070 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006071}
6072
Alexander Belopolsky40018472011-02-26 01:02:56 +00006073PyObject *
6074PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006075{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006076 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006077 if (!PyUnicode_Check(unicode)) {
6078 PyErr_BadArgument();
6079 return NULL;
6080 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00006081 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
6082 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006083 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006084}
6085
6086/* --- Raw Unicode Escape Codec ------------------------------------------- */
6087
Alexander Belopolsky40018472011-02-26 01:02:56 +00006088PyObject *
6089PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006090 Py_ssize_t size,
6091 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006092{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006093 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006094 Py_ssize_t startinpos;
6095 Py_ssize_t endinpos;
6096 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006097 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006098 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006099 const char *end;
6100 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006101 PyObject *errorHandler = NULL;
6102 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006103
Guido van Rossumd57fd912000-03-10 22:53:23 +00006104 /* Escaped strings will always be longer than the resulting
6105 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006106 length after conversion to the true value. (But decoding error
6107 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006108 v = _PyUnicode_New(size);
6109 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006110 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006111 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006112 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006113 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006114 end = s + size;
6115 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006116 unsigned char c;
6117 Py_UCS4 x;
6118 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006119 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006120
Benjamin Peterson29060642009-01-31 22:14:21 +00006121 /* Non-escape characters are interpreted as Unicode ordinals */
6122 if (*s != '\\') {
6123 *p++ = (unsigned char)*s++;
6124 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006125 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006126 startinpos = s-starts;
6127
6128 /* \u-escapes are only interpreted iff the number of leading
6129 backslashes if odd */
6130 bs = s;
6131 for (;s < end;) {
6132 if (*s != '\\')
6133 break;
6134 *p++ = (unsigned char)*s++;
6135 }
6136 if (((s - bs) & 1) == 0 ||
6137 s >= end ||
6138 (*s != 'u' && *s != 'U')) {
6139 continue;
6140 }
6141 p--;
6142 count = *s=='u' ? 4 : 8;
6143 s++;
6144
6145 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
6146 outpos = p-PyUnicode_AS_UNICODE(v);
6147 for (x = 0, i = 0; i < count; ++i, ++s) {
6148 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006149 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006150 endinpos = s-starts;
6151 if (unicode_decode_call_errorhandler(
6152 errors, &errorHandler,
6153 "rawunicodeescape", "truncated \\uXXXX",
6154 &starts, &end, &startinpos, &endinpos, &exc, &s,
6155 &v, &outpos, &p))
6156 goto onError;
6157 goto nextByte;
6158 }
6159 x = (x<<4) & ~0xF;
6160 if (c >= '0' && c <= '9')
6161 x += c - '0';
6162 else if (c >= 'a' && c <= 'f')
6163 x += 10 + c - 'a';
6164 else
6165 x += 10 + c - 'A';
6166 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00006167 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00006168 /* UCS-2 character */
6169 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006170 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006171 /* UCS-4 character. Either store directly, or as
6172 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00006173#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006174 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006175#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006176 x -= 0x10000L;
6177 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
6178 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00006179#endif
6180 } else {
6181 endinpos = s-starts;
6182 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006183 if (unicode_decode_call_errorhandler(
6184 errors, &errorHandler,
6185 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006186 &starts, &end, &startinpos, &endinpos, &exc, &s,
6187 &v, &outpos, &p))
6188 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006189 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006190 nextByte:
6191 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006192 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02006193 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006194 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006195 Py_XDECREF(errorHandler);
6196 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02006197#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006198 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006199 Py_DECREF(v);
6200 return NULL;
6201 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006202#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006203 assert(_PyUnicode_CheckConsistency(v, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006204 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006205
Benjamin Peterson29060642009-01-31 22:14:21 +00006206 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006207 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006208 Py_XDECREF(errorHandler);
6209 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006210 return NULL;
6211}
6212
Alexander Belopolsky40018472011-02-26 01:02:56 +00006213PyObject *
6214PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006215 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006216{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006217 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006218 char *p;
6219 char *q;
6220
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006221#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006222 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006223#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006224 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006225#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00006226
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006227 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006228 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006229
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006230 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006231 if (repr == NULL)
6232 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00006233 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006234 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006235
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006236 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006237 while (size-- > 0) {
6238 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006239#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006240 /* Map 32-bit characters to '\Uxxxxxxxx' */
6241 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006242 *p++ = '\\';
6243 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00006244 *p++ = hexdigits[(ch >> 28) & 0xf];
6245 *p++ = hexdigits[(ch >> 24) & 0xf];
6246 *p++ = hexdigits[(ch >> 20) & 0xf];
6247 *p++ = hexdigits[(ch >> 16) & 0xf];
6248 *p++ = hexdigits[(ch >> 12) & 0xf];
6249 *p++ = hexdigits[(ch >> 8) & 0xf];
6250 *p++ = hexdigits[(ch >> 4) & 0xf];
6251 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006252 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006253 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00006254#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006255 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
6256 if (ch >= 0xD800 && ch < 0xDC00) {
6257 Py_UNICODE ch2;
6258 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006259
Benjamin Peterson29060642009-01-31 22:14:21 +00006260 ch2 = *s++;
6261 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00006262 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006263 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
6264 *p++ = '\\';
6265 *p++ = 'U';
6266 *p++ = hexdigits[(ucs >> 28) & 0xf];
6267 *p++ = hexdigits[(ucs >> 24) & 0xf];
6268 *p++ = hexdigits[(ucs >> 20) & 0xf];
6269 *p++ = hexdigits[(ucs >> 16) & 0xf];
6270 *p++ = hexdigits[(ucs >> 12) & 0xf];
6271 *p++ = hexdigits[(ucs >> 8) & 0xf];
6272 *p++ = hexdigits[(ucs >> 4) & 0xf];
6273 *p++ = hexdigits[ucs & 0xf];
6274 continue;
6275 }
6276 /* Fall through: isolated surrogates are copied as-is */
6277 s--;
6278 size++;
6279 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006280#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00006281 /* Map 16-bit characters to '\uxxxx' */
6282 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006283 *p++ = '\\';
6284 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00006285 *p++ = hexdigits[(ch >> 12) & 0xf];
6286 *p++ = hexdigits[(ch >> 8) & 0xf];
6287 *p++ = hexdigits[(ch >> 4) & 0xf];
6288 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006289 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006290 /* Copy everything else as-is */
6291 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006292 *p++ = (char) ch;
6293 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006294 size = p - q;
6295
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006296 assert(size > 0);
6297 if (_PyBytes_Resize(&repr, size) < 0)
6298 return NULL;
6299 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006300}
6301
Alexander Belopolsky40018472011-02-26 01:02:56 +00006302PyObject *
6303PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006304{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006305 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006306 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00006307 PyErr_BadArgument();
6308 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006309 }
Walter Dörwald711005d2007-05-12 12:03:26 +00006310 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
6311 PyUnicode_GET_SIZE(unicode));
6312
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006313 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006314}
6315
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006316/* --- Unicode Internal Codec ------------------------------------------- */
6317
Alexander Belopolsky40018472011-02-26 01:02:56 +00006318PyObject *
6319_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006320 Py_ssize_t size,
6321 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006322{
6323 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006324 Py_ssize_t startinpos;
6325 Py_ssize_t endinpos;
6326 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006327 PyUnicodeObject *v;
6328 Py_UNICODE *p;
6329 const char *end;
6330 const char *reason;
6331 PyObject *errorHandler = NULL;
6332 PyObject *exc = NULL;
6333
Neal Norwitzd43069c2006-01-08 01:12:10 +00006334#ifdef Py_UNICODE_WIDE
6335 Py_UNICODE unimax = PyUnicode_GetMax();
6336#endif
6337
Thomas Wouters89f507f2006-12-13 04:49:30 +00006338 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006339 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
6340 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006341 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006342 /* Intentionally PyUnicode_GET_SIZE instead of PyUnicode_GET_LENGTH
6343 as string was created with the old API. */
6344 if (PyUnicode_GET_SIZE(v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006345 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006346 p = PyUnicode_AS_UNICODE(v);
6347 end = s + size;
6348
6349 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006350 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006351 /* We have to sanity check the raw data, otherwise doom looms for
6352 some malformed UCS-4 data. */
6353 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006354#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006355 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006356#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006357 end-s < Py_UNICODE_SIZE
6358 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006359 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006360 startinpos = s - starts;
6361 if (end-s < Py_UNICODE_SIZE) {
6362 endinpos = end-starts;
6363 reason = "truncated input";
6364 }
6365 else {
6366 endinpos = s - starts + Py_UNICODE_SIZE;
6367 reason = "illegal code point (> 0x10FFFF)";
6368 }
6369 outpos = p - PyUnicode_AS_UNICODE(v);
6370 if (unicode_decode_call_errorhandler(
6371 errors, &errorHandler,
6372 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006373 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00006374 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006375 goto onError;
6376 }
6377 }
6378 else {
6379 p++;
6380 s += Py_UNICODE_SIZE;
6381 }
6382 }
6383
Victor Stinnerfe226c02011-10-03 03:52:20 +02006384 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006385 goto onError;
6386 Py_XDECREF(errorHandler);
6387 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02006388#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006389 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006390 Py_DECREF(v);
6391 return NULL;
6392 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006393#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006394 assert(_PyUnicode_CheckConsistency(v, 1));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006395 return (PyObject *)v;
6396
Benjamin Peterson29060642009-01-31 22:14:21 +00006397 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006398 Py_XDECREF(v);
6399 Py_XDECREF(errorHandler);
6400 Py_XDECREF(exc);
6401 return NULL;
6402}
6403
Guido van Rossumd57fd912000-03-10 22:53:23 +00006404/* --- Latin-1 Codec ------------------------------------------------------ */
6405
Alexander Belopolsky40018472011-02-26 01:02:56 +00006406PyObject *
6407PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006408 Py_ssize_t size,
6409 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006410{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006411 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006412 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006413}
6414
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006415/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006416static void
6417make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006418 const char *encoding,
6419 const Py_UNICODE *unicode, Py_ssize_t size,
6420 Py_ssize_t startpos, Py_ssize_t endpos,
6421 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006422{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006423 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006424 *exceptionObject = PyUnicodeEncodeError_Create(
6425 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006426 }
6427 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006428 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6429 goto onError;
6430 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6431 goto onError;
6432 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6433 goto onError;
6434 return;
6435 onError:
6436 Py_DECREF(*exceptionObject);
6437 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006438 }
6439}
6440
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006441/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006442static void
6443raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006444 const char *encoding,
6445 const Py_UNICODE *unicode, Py_ssize_t size,
6446 Py_ssize_t startpos, Py_ssize_t endpos,
6447 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006448{
6449 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006450 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006451 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006452 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006453}
6454
6455/* error handling callback helper:
6456 build arguments, call the callback and check the arguments,
6457 put the result into newpos and return the replacement string, which
6458 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006459static PyObject *
6460unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006461 PyObject **errorHandler,
6462 const char *encoding, const char *reason,
6463 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
6464 Py_ssize_t startpos, Py_ssize_t endpos,
6465 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006466{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006467 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006468
6469 PyObject *restuple;
6470 PyObject *resunicode;
6471
6472 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006473 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006474 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006475 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006476 }
6477
6478 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006479 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006480 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006481 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006482
6483 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006484 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006485 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006486 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006487 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006488 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006489 Py_DECREF(restuple);
6490 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006491 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006492 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006493 &resunicode, newpos)) {
6494 Py_DECREF(restuple);
6495 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006496 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006497 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6498 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6499 Py_DECREF(restuple);
6500 return NULL;
6501 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006502 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006503 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006504 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006505 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6506 Py_DECREF(restuple);
6507 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006508 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006509 Py_INCREF(resunicode);
6510 Py_DECREF(restuple);
6511 return resunicode;
6512}
6513
Alexander Belopolsky40018472011-02-26 01:02:56 +00006514static PyObject *
6515unicode_encode_ucs1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006516 Py_ssize_t size,
6517 const char *errors,
6518 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006519{
6520 /* output object */
6521 PyObject *res;
6522 /* pointers to the beginning and end+1 of input */
6523 const Py_UNICODE *startp = p;
6524 const Py_UNICODE *endp = p + size;
6525 /* pointer to the beginning of the unencodable characters */
6526 /* const Py_UNICODE *badp = NULL; */
6527 /* pointer into the output */
6528 char *str;
6529 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006530 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006531 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6532 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006533 PyObject *errorHandler = NULL;
6534 PyObject *exc = NULL;
6535 /* the following variable is used for caching string comparisons
6536 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6537 int known_errorHandler = -1;
6538
6539 /* allocate enough for a simple encoding without
6540 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006541 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006542 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006543 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006544 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006545 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006546 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006547 ressize = size;
6548
6549 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006550 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006551
Benjamin Peterson29060642009-01-31 22:14:21 +00006552 /* can we encode this? */
6553 if (c<limit) {
6554 /* no overflow check, because we know that the space is enough */
6555 *str++ = (char)c;
6556 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006557 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006558 else {
6559 Py_ssize_t unicodepos = p-startp;
6560 Py_ssize_t requiredsize;
6561 PyObject *repunicode;
6562 Py_ssize_t repsize;
6563 Py_ssize_t newpos;
6564 Py_ssize_t respos;
6565 Py_UNICODE *uni2;
6566 /* startpos for collecting unencodable chars */
6567 const Py_UNICODE *collstart = p;
6568 const Py_UNICODE *collend = p;
6569 /* find all unecodable characters */
6570 while ((collend < endp) && ((*collend)>=limit))
6571 ++collend;
6572 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6573 if (known_errorHandler==-1) {
6574 if ((errors==NULL) || (!strcmp(errors, "strict")))
6575 known_errorHandler = 1;
6576 else if (!strcmp(errors, "replace"))
6577 known_errorHandler = 2;
6578 else if (!strcmp(errors, "ignore"))
6579 known_errorHandler = 3;
6580 else if (!strcmp(errors, "xmlcharrefreplace"))
6581 known_errorHandler = 4;
6582 else
6583 known_errorHandler = 0;
6584 }
6585 switch (known_errorHandler) {
6586 case 1: /* strict */
6587 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
6588 goto onError;
6589 case 2: /* replace */
6590 while (collstart++<collend)
6591 *str++ = '?'; /* fall through */
6592 case 3: /* ignore */
6593 p = collend;
6594 break;
6595 case 4: /* xmlcharrefreplace */
6596 respos = str - PyBytes_AS_STRING(res);
6597 /* determine replacement size (temporarily (mis)uses p) */
6598 for (p = collstart, repsize = 0; p < collend; ++p) {
6599 if (*p<10)
6600 repsize += 2+1+1;
6601 else if (*p<100)
6602 repsize += 2+2+1;
6603 else if (*p<1000)
6604 repsize += 2+3+1;
6605 else if (*p<10000)
6606 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006607#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006608 else
6609 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006610#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006611 else if (*p<100000)
6612 repsize += 2+5+1;
6613 else if (*p<1000000)
6614 repsize += 2+6+1;
6615 else
6616 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00006617#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00006618 }
6619 requiredsize = respos+repsize+(endp-collend);
6620 if (requiredsize > ressize) {
6621 if (requiredsize<2*ressize)
6622 requiredsize = 2*ressize;
6623 if (_PyBytes_Resize(&res, requiredsize))
6624 goto onError;
6625 str = PyBytes_AS_STRING(res) + respos;
6626 ressize = requiredsize;
6627 }
6628 /* generate replacement (temporarily (mis)uses p) */
6629 for (p = collstart; p < collend; ++p) {
6630 str += sprintf(str, "&#%d;", (int)*p);
6631 }
6632 p = collend;
6633 break;
6634 default:
6635 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6636 encoding, reason, startp, size, &exc,
6637 collstart-startp, collend-startp, &newpos);
6638 if (repunicode == NULL)
6639 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006640 if (PyBytes_Check(repunicode)) {
6641 /* Directly copy bytes result to output. */
6642 repsize = PyBytes_Size(repunicode);
6643 if (repsize > 1) {
6644 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006645 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006646 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6647 Py_DECREF(repunicode);
6648 goto onError;
6649 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006650 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006651 ressize += repsize-1;
6652 }
6653 memcpy(str, PyBytes_AsString(repunicode), repsize);
6654 str += repsize;
6655 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006656 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006657 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006658 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006659 /* need more space? (at least enough for what we
6660 have+the replacement+the rest of the string, so
6661 we won't have to check space for encodable characters) */
6662 respos = str - PyBytes_AS_STRING(res);
6663 repsize = PyUnicode_GET_SIZE(repunicode);
6664 requiredsize = respos+repsize+(endp-collend);
6665 if (requiredsize > ressize) {
6666 if (requiredsize<2*ressize)
6667 requiredsize = 2*ressize;
6668 if (_PyBytes_Resize(&res, requiredsize)) {
6669 Py_DECREF(repunicode);
6670 goto onError;
6671 }
6672 str = PyBytes_AS_STRING(res) + respos;
6673 ressize = requiredsize;
6674 }
6675 /* check if there is anything unencodable in the replacement
6676 and copy it to the output */
6677 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
6678 c = *uni2;
6679 if (c >= limit) {
6680 raise_encode_exception(&exc, encoding, startp, size,
6681 unicodepos, unicodepos+1, reason);
6682 Py_DECREF(repunicode);
6683 goto onError;
6684 }
6685 *str = (char)c;
6686 }
6687 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006688 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006689 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006690 }
6691 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006692 /* Resize if we allocated to much */
6693 size = str - PyBytes_AS_STRING(res);
6694 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006695 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006696 if (_PyBytes_Resize(&res, size) < 0)
6697 goto onError;
6698 }
6699
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006700 Py_XDECREF(errorHandler);
6701 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006702 return res;
6703
6704 onError:
6705 Py_XDECREF(res);
6706 Py_XDECREF(errorHandler);
6707 Py_XDECREF(exc);
6708 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006709}
6710
Alexander Belopolsky40018472011-02-26 01:02:56 +00006711PyObject *
6712PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006713 Py_ssize_t size,
6714 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006715{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006716 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006717}
6718
Alexander Belopolsky40018472011-02-26 01:02:56 +00006719PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006720_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006721{
6722 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006723 PyErr_BadArgument();
6724 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006725 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006726 if (PyUnicode_READY(unicode) == -1)
6727 return NULL;
6728 /* Fast path: if it is a one-byte string, construct
6729 bytes object directly. */
6730 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6731 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6732 PyUnicode_GET_LENGTH(unicode));
6733 /* Non-Latin-1 characters present. Defer to above function to
6734 raise the exception. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006735 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006736 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006737 errors);
6738}
6739
6740PyObject*
6741PyUnicode_AsLatin1String(PyObject *unicode)
6742{
6743 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006744}
6745
6746/* --- 7-bit ASCII Codec -------------------------------------------------- */
6747
Alexander Belopolsky40018472011-02-26 01:02:56 +00006748PyObject *
6749PyUnicode_DecodeASCII(const char *s,
6750 Py_ssize_t size,
6751 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006752{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006753 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006754 PyUnicodeObject *v;
Victor Stinner702c7342011-10-05 13:50:52 +02006755 Py_UNICODE *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006756 Py_ssize_t startinpos;
6757 Py_ssize_t endinpos;
6758 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006759 const char *e;
Victor Stinner702c7342011-10-05 13:50:52 +02006760 int has_error;
6761 const unsigned char *p = (const unsigned char *)s;
6762 const unsigned char *end = p + size;
6763 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006764 PyObject *errorHandler = NULL;
6765 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006766
Guido van Rossumd57fd912000-03-10 22:53:23 +00006767 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006768 if (size == 1 && (unsigned char)s[0] < 128)
6769 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006770
Victor Stinner702c7342011-10-05 13:50:52 +02006771 has_error = 0;
6772 while (p < end && !has_error) {
6773 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
6774 an explanation. */
6775 if (!((size_t) p & LONG_PTR_MASK)) {
6776 /* Help register allocation */
6777 register const unsigned char *_p = p;
6778 while (_p < aligned_end) {
6779 unsigned long value = *(unsigned long *) _p;
6780 if (value & ASCII_CHAR_MASK) {
6781 has_error = 1;
6782 break;
6783 }
6784 _p += SIZEOF_LONG;
6785 }
6786 if (_p == end)
6787 break;
6788 if (has_error)
6789 break;
6790 p = _p;
6791 }
6792 if (*p & 0x80) {
6793 has_error = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006794 break;
Victor Stinner702c7342011-10-05 13:50:52 +02006795 }
6796 else {
6797 ++p;
6798 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006799 }
Victor Stinner702c7342011-10-05 13:50:52 +02006800 if (!has_error)
6801 return unicode_fromascii((const unsigned char *)s, size);
Tim Petersced69f82003-09-16 20:30:58 +00006802
Guido van Rossumd57fd912000-03-10 22:53:23 +00006803 v = _PyUnicode_New(size);
6804 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006805 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006806 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006807 return (PyObject *)v;
Victor Stinner702c7342011-10-05 13:50:52 +02006808 u = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006809 e = s + size;
6810 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006811 register unsigned char c = (unsigned char)*s;
6812 if (c < 128) {
Victor Stinner702c7342011-10-05 13:50:52 +02006813 *u++ = c;
Benjamin Peterson29060642009-01-31 22:14:21 +00006814 ++s;
6815 }
6816 else {
6817 startinpos = s-starts;
6818 endinpos = startinpos + 1;
Victor Stinner702c7342011-10-05 13:50:52 +02006819 outpos = u - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006820 if (unicode_decode_call_errorhandler(
6821 errors, &errorHandler,
6822 "ascii", "ordinal not in range(128)",
6823 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinner702c7342011-10-05 13:50:52 +02006824 &v, &outpos, &u))
Benjamin Peterson29060642009-01-31 22:14:21 +00006825 goto onError;
6826 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006827 }
Victor Stinner702c7342011-10-05 13:50:52 +02006828 if (u - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
6829 if (PyUnicode_Resize((PyObject**)&v, u - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006830 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006831 Py_XDECREF(errorHandler);
6832 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02006833#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006834 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006835 Py_DECREF(v);
6836 return NULL;
6837 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006838#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006839 assert(_PyUnicode_CheckConsistency(v, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006840 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006841
Benjamin Peterson29060642009-01-31 22:14:21 +00006842 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006843 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006844 Py_XDECREF(errorHandler);
6845 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006846 return NULL;
6847}
6848
Alexander Belopolsky40018472011-02-26 01:02:56 +00006849PyObject *
6850PyUnicode_EncodeASCII(const Py_UNICODE *p,
6851 Py_ssize_t size,
6852 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006853{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006854 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006855}
6856
Alexander Belopolsky40018472011-02-26 01:02:56 +00006857PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006858_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006859{
6860 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006861 PyErr_BadArgument();
6862 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006863 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006864 if (PyUnicode_READY(unicode) == -1)
6865 return NULL;
6866 /* Fast path: if it is an ASCII-only string, construct bytes object
6867 directly. Else defer to above function to raise the exception. */
6868 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6869 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6870 PyUnicode_GET_LENGTH(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006871 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006872 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006873 errors);
6874}
6875
6876PyObject *
6877PyUnicode_AsASCIIString(PyObject *unicode)
6878{
6879 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006880}
6881
Victor Stinner99b95382011-07-04 14:23:54 +02006882#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006883
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006884/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006885
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006886#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006887#define NEED_RETRY
6888#endif
6889
6890/* XXX This code is limited to "true" double-byte encodings, as
6891 a) it assumes an incomplete character consists of a single byte, and
6892 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00006893 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006894
/* Return 1 if the byte at s[offset] should be treated as a DBCS lead byte,
   0 otherwise.

   A byte that IsDBCSLeadByte() accepts only counts when the character
   preceding it (as located by CharPrev()) does not already claim it: the
   byte is the first one in the string, the previous character does not
   start with a lead byte, or the previous character is two bytes long. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
6906
6907/*
6908 * Decode MBCS string into unicode object. If 'final' is set, converts
6909 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
6910 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006911static int
6912decode_mbcs(PyUnicodeObject **v,
6913 const char *s, /* MBCS string */
6914 int size, /* sizeof MBCS string */
6915 int final,
6916 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006917{
6918 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00006919 Py_ssize_t n;
6920 DWORD usize;
6921 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006922
6923 assert(size >= 0);
6924
Victor Stinner554f3f02010-06-16 23:33:54 +00006925 /* check and handle 'errors' arg */
6926 if (errors==NULL || strcmp(errors, "strict")==0)
6927 flags = MB_ERR_INVALID_CHARS;
6928 else if (strcmp(errors, "ignore")==0)
6929 flags = 0;
6930 else {
6931 PyErr_Format(PyExc_ValueError,
6932 "mbcs encoding does not support errors='%s'",
6933 errors);
6934 return -1;
6935 }
6936
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006937 /* Skip trailing lead-byte unless 'final' is set */
6938 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006939 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006940
6941 /* First get the size of the result */
6942 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006943 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
6944 if (usize==0)
6945 goto mbcs_decode_error;
6946 } else
6947 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006948
6949 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006950 /* Create unicode object */
6951 *v = _PyUnicode_New(usize);
6952 if (*v == NULL)
6953 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006954 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006955 }
6956 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006957 /* Extend unicode object */
6958 n = PyUnicode_GET_SIZE(*v);
Victor Stinner2fd82272011-10-03 04:06:05 +02006959 if (PyUnicode_Resize((PyObject**)v, n + usize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006960 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006961 }
6962
6963 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00006964 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006965 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006966 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
6967 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00006968 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006969 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006970 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00006971
6972mbcs_decode_error:
6973 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
6974 we raise a UnicodeDecodeError - else it is a 'generic'
6975 windows error
6976 */
6977 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
6978 /* Ideally, we should get reason from FormatMessage - this
6979 is the Windows 2000 English version of the message
6980 */
6981 PyObject *exc = NULL;
6982 const char *reason = "No mapping for the Unicode character exists "
6983 "in the target multi-byte code page.";
6984 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
6985 if (exc != NULL) {
6986 PyCodec_StrictErrors(exc);
6987 Py_DECREF(exc);
6988 }
6989 } else {
6990 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6991 }
6992 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006993}
6994
Alexander Belopolsky40018472011-02-26 01:02:56 +00006995PyObject *
6996PyUnicode_DecodeMBCSStateful(const char *s,
6997 Py_ssize_t size,
6998 const char *errors,
6999 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007000{
7001 PyUnicodeObject *v = NULL;
7002 int done;
7003
7004 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007005 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007006
7007#ifdef NEED_RETRY
7008 retry:
7009 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00007010 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007011 else
7012#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00007013 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007014
7015 if (done < 0) {
7016 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00007017 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007018 }
7019
7020 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007021 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007022
7023#ifdef NEED_RETRY
7024 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007025 s += done;
7026 size -= done;
7027 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007028 }
7029#endif
Victor Stinner17efeed2011-10-04 20:05:46 +02007030#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02007031 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007032 Py_DECREF(v);
7033 return NULL;
7034 }
Victor Stinner17efeed2011-10-04 20:05:46 +02007035#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02007036 assert(_PyUnicode_CheckConsistency(v, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007037 return (PyObject *)v;
7038}
7039
Alexander Belopolsky40018472011-02-26 01:02:56 +00007040PyObject *
7041PyUnicode_DecodeMBCS(const char *s,
7042 Py_ssize_t size,
7043 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007044{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007045 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7046}
7047
7048/*
7049 * Convert unicode into string object (MBCS).
7050 * Returns 0 if succeed, -1 otherwise.
7051 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007052static int
7053encode_mbcs(PyObject **repr,
7054 const Py_UNICODE *p, /* unicode */
7055 int size, /* size of unicode */
7056 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007057{
Victor Stinner554f3f02010-06-16 23:33:54 +00007058 BOOL usedDefaultChar = FALSE;
7059 BOOL *pusedDefaultChar;
7060 int mbcssize;
7061 Py_ssize_t n;
7062 PyObject *exc = NULL;
7063 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007064
7065 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007066
Victor Stinner554f3f02010-06-16 23:33:54 +00007067 /* check and handle 'errors' arg */
7068 if (errors==NULL || strcmp(errors, "strict")==0) {
7069 flags = WC_NO_BEST_FIT_CHARS;
7070 pusedDefaultChar = &usedDefaultChar;
7071 } else if (strcmp(errors, "replace")==0) {
7072 flags = 0;
7073 pusedDefaultChar = NULL;
7074 } else {
7075 PyErr_Format(PyExc_ValueError,
7076 "mbcs encoding does not support errors='%s'",
7077 errors);
7078 return -1;
7079 }
7080
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007081 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007082 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00007083 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
7084 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00007085 if (mbcssize == 0) {
7086 PyErr_SetFromWindowsErrWithFilename(0, NULL);
7087 return -1;
7088 }
Victor Stinner554f3f02010-06-16 23:33:54 +00007089 /* If we used a default char, then we failed! */
7090 if (pusedDefaultChar && *pusedDefaultChar)
7091 goto mbcs_encode_error;
7092 } else {
7093 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007094 }
7095
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007096 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007097 /* Create string object */
7098 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
7099 if (*repr == NULL)
7100 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00007101 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007102 }
7103 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007104 /* Extend string object */
7105 n = PyBytes_Size(*repr);
7106 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
7107 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007108 }
7109
7110 /* Do the conversion */
7111 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007112 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00007113 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
7114 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007115 PyErr_SetFromWindowsErrWithFilename(0, NULL);
7116 return -1;
7117 }
Victor Stinner554f3f02010-06-16 23:33:54 +00007118 if (pusedDefaultChar && *pusedDefaultChar)
7119 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007120 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007121 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007122
7123mbcs_encode_error:
7124 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
7125 Py_XDECREF(exc);
7126 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007127}
7128
Alexander Belopolsky40018472011-02-26 01:02:56 +00007129PyObject *
7130PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7131 Py_ssize_t size,
7132 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007133{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007134 PyObject *repr = NULL;
7135 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00007136
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007137#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00007138 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007139 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00007140 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007141 else
7142#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00007143 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007144
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007145 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007146 Py_XDECREF(repr);
7147 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007148 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007149
7150#ifdef NEED_RETRY
7151 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007152 p += INT_MAX;
7153 size -= INT_MAX;
7154 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007155 }
7156#endif
7157
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007158 return repr;
7159}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007160
Alexander Belopolsky40018472011-02-26 01:02:56 +00007161PyObject *
7162PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007163{
7164 if (!PyUnicode_Check(unicode)) {
7165 PyErr_BadArgument();
7166 return NULL;
7167 }
7168 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00007169 PyUnicode_GET_SIZE(unicode),
7170 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007171}
7172
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007173#undef NEED_RETRY
7174
Victor Stinner99b95382011-07-04 14:23:54 +02007175#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007176
Guido van Rossumd57fd912000-03-10 22:53:23 +00007177/* --- Character Mapping Codec -------------------------------------------- */
7178
Alexander Belopolsky40018472011-02-26 01:02:56 +00007179PyObject *
7180PyUnicode_DecodeCharmap(const char *s,
7181 Py_ssize_t size,
7182 PyObject *mapping,
7183 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007184{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007185 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007186 Py_ssize_t startinpos;
7187 Py_ssize_t endinpos;
7188 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007189 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007190 PyUnicodeObject *v;
7191 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007192 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007193 PyObject *errorHandler = NULL;
7194 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007195 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007196 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00007197
Guido van Rossumd57fd912000-03-10 22:53:23 +00007198 /* Default to Latin-1 */
7199 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007200 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007201
7202 v = _PyUnicode_New(size);
7203 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007204 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007205 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007206 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007207 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007208 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007209 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007210 mapstring = PyUnicode_AS_UNICODE(mapping);
7211 maplen = PyUnicode_GET_SIZE(mapping);
7212 while (s < e) {
7213 unsigned char ch = *s;
7214 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007215
Benjamin Peterson29060642009-01-31 22:14:21 +00007216 if (ch < maplen)
7217 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00007218
Benjamin Peterson29060642009-01-31 22:14:21 +00007219 if (x == 0xfffe) {
7220 /* undefined mapping */
7221 outpos = p-PyUnicode_AS_UNICODE(v);
7222 startinpos = s-starts;
7223 endinpos = startinpos+1;
7224 if (unicode_decode_call_errorhandler(
7225 errors, &errorHandler,
7226 "charmap", "character maps to <undefined>",
7227 &starts, &e, &startinpos, &endinpos, &exc, &s,
7228 &v, &outpos, &p)) {
7229 goto onError;
7230 }
7231 continue;
7232 }
7233 *p++ = x;
7234 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007235 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007236 }
7237 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007238 while (s < e) {
7239 unsigned char ch = *s;
7240 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007241
Benjamin Peterson29060642009-01-31 22:14:21 +00007242 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7243 w = PyLong_FromLong((long)ch);
7244 if (w == NULL)
7245 goto onError;
7246 x = PyObject_GetItem(mapping, w);
7247 Py_DECREF(w);
7248 if (x == NULL) {
7249 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7250 /* No mapping found means: mapping is undefined. */
7251 PyErr_Clear();
7252 x = Py_None;
7253 Py_INCREF(x);
7254 } else
7255 goto onError;
7256 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007257
Benjamin Peterson29060642009-01-31 22:14:21 +00007258 /* Apply mapping */
7259 if (PyLong_Check(x)) {
7260 long value = PyLong_AS_LONG(x);
7261 if (value < 0 || value > 65535) {
7262 PyErr_SetString(PyExc_TypeError,
7263 "character mapping must be in range(65536)");
7264 Py_DECREF(x);
7265 goto onError;
7266 }
7267 *p++ = (Py_UNICODE)value;
7268 }
7269 else if (x == Py_None) {
7270 /* undefined mapping */
7271 outpos = p-PyUnicode_AS_UNICODE(v);
7272 startinpos = s-starts;
7273 endinpos = startinpos+1;
7274 if (unicode_decode_call_errorhandler(
7275 errors, &errorHandler,
7276 "charmap", "character maps to <undefined>",
7277 &starts, &e, &startinpos, &endinpos, &exc, &s,
7278 &v, &outpos, &p)) {
7279 Py_DECREF(x);
7280 goto onError;
7281 }
7282 Py_DECREF(x);
7283 continue;
7284 }
7285 else if (PyUnicode_Check(x)) {
7286 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007287
Benjamin Peterson29060642009-01-31 22:14:21 +00007288 if (targetsize == 1)
7289 /* 1-1 mapping */
7290 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007291
Benjamin Peterson29060642009-01-31 22:14:21 +00007292 else if (targetsize > 1) {
7293 /* 1-n mapping */
7294 if (targetsize > extrachars) {
7295 /* resize first */
7296 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
7297 Py_ssize_t needed = (targetsize - extrachars) + \
7298 (targetsize << 2);
7299 extrachars += needed;
7300 /* XXX overflow detection missing */
Victor Stinnerfe226c02011-10-03 03:52:20 +02007301 if (PyUnicode_Resize((PyObject**)&v,
Benjamin Peterson29060642009-01-31 22:14:21 +00007302 PyUnicode_GET_SIZE(v) + needed) < 0) {
7303 Py_DECREF(x);
7304 goto onError;
7305 }
7306 p = PyUnicode_AS_UNICODE(v) + oldpos;
7307 }
7308 Py_UNICODE_COPY(p,
7309 PyUnicode_AS_UNICODE(x),
7310 targetsize);
7311 p += targetsize;
7312 extrachars -= targetsize;
7313 }
7314 /* 1-0 mapping: skip the character */
7315 }
7316 else {
7317 /* wrong return value */
7318 PyErr_SetString(PyExc_TypeError,
7319 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007320 Py_DECREF(x);
7321 goto onError;
7322 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007323 Py_DECREF(x);
7324 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007325 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007326 }
7327 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Victor Stinnerfe226c02011-10-03 03:52:20 +02007328 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007329 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007330 Py_XDECREF(errorHandler);
7331 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02007332#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02007333 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007334 Py_DECREF(v);
7335 return NULL;
7336 }
Victor Stinner17efeed2011-10-04 20:05:46 +02007337#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02007338 assert(_PyUnicode_CheckConsistency(v, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007339 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00007340
Benjamin Peterson29060642009-01-31 22:14:21 +00007341 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007342 Py_XDECREF(errorHandler);
7343 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007344 Py_XDECREF(v);
7345 return NULL;
7346}
7347
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007348/* Charmap encoding: the lookup table */
7349
Alexander Belopolsky40018472011-02-26 01:02:56 +00007350struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007351 PyObject_HEAD
7352 unsigned char level1[32];
7353 int count2, count3;
7354 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007355};
7356
7357static PyObject*
7358encoding_map_size(PyObject *obj, PyObject* args)
7359{
7360 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007361 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007362 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007363}
7364
7365static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007366 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007367 PyDoc_STR("Return the size (in bytes) of this object") },
7368 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007369};
7370
7371static void
7372encoding_map_dealloc(PyObject* o)
7373{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007374 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007375}
7376
7377static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007378 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007379 "EncodingMap", /*tp_name*/
7380 sizeof(struct encoding_map), /*tp_basicsize*/
7381 0, /*tp_itemsize*/
7382 /* methods */
7383 encoding_map_dealloc, /*tp_dealloc*/
7384 0, /*tp_print*/
7385 0, /*tp_getattr*/
7386 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007387 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007388 0, /*tp_repr*/
7389 0, /*tp_as_number*/
7390 0, /*tp_as_sequence*/
7391 0, /*tp_as_mapping*/
7392 0, /*tp_hash*/
7393 0, /*tp_call*/
7394 0, /*tp_str*/
7395 0, /*tp_getattro*/
7396 0, /*tp_setattro*/
7397 0, /*tp_as_buffer*/
7398 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7399 0, /*tp_doc*/
7400 0, /*tp_traverse*/
7401 0, /*tp_clear*/
7402 0, /*tp_richcompare*/
7403 0, /*tp_weaklistoffset*/
7404 0, /*tp_iter*/
7405 0, /*tp_iternext*/
7406 encoding_map_methods, /*tp_methods*/
7407 0, /*tp_members*/
7408 0, /*tp_getset*/
7409 0, /*tp_base*/
7410 0, /*tp_dict*/
7411 0, /*tp_descr_get*/
7412 0, /*tp_descr_set*/
7413 0, /*tp_dictoffset*/
7414 0, /*tp_init*/
7415 0, /*tp_alloc*/
7416 0, /*tp_new*/
7417 0, /*tp_free*/
7418 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007419};
7420
7421PyObject*
7422PyUnicode_BuildEncodingMap(PyObject* string)
7423{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007424 PyObject *result;
7425 struct encoding_map *mresult;
7426 int i;
7427 int need_dict = 0;
7428 unsigned char level1[32];
7429 unsigned char level2[512];
7430 unsigned char *mlevel1, *mlevel2, *mlevel3;
7431 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007432 int kind;
7433 void *data;
7434 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007435
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007436 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007437 PyErr_BadArgument();
7438 return NULL;
7439 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007440 kind = PyUnicode_KIND(string);
7441 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007442 memset(level1, 0xFF, sizeof level1);
7443 memset(level2, 0xFF, sizeof level2);
7444
7445 /* If there isn't a one-to-one mapping of NULL to \0,
7446 or if there are non-BMP characters, we need to use
7447 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007448 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007449 need_dict = 1;
7450 for (i = 1; i < 256; i++) {
7451 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007452 ch = PyUnicode_READ(kind, data, i);
7453 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007454 need_dict = 1;
7455 break;
7456 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007457 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007458 /* unmapped character */
7459 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007460 l1 = ch >> 11;
7461 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007462 if (level1[l1] == 0xFF)
7463 level1[l1] = count2++;
7464 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007465 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007466 }
7467
7468 if (count2 >= 0xFF || count3 >= 0xFF)
7469 need_dict = 1;
7470
7471 if (need_dict) {
7472 PyObject *result = PyDict_New();
7473 PyObject *key, *value;
7474 if (!result)
7475 return NULL;
7476 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007477 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007478 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007479 if (!key || !value)
7480 goto failed1;
7481 if (PyDict_SetItem(result, key, value) == -1)
7482 goto failed1;
7483 Py_DECREF(key);
7484 Py_DECREF(value);
7485 }
7486 return result;
7487 failed1:
7488 Py_XDECREF(key);
7489 Py_XDECREF(value);
7490 Py_DECREF(result);
7491 return NULL;
7492 }
7493
7494 /* Create a three-level trie */
7495 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7496 16*count2 + 128*count3 - 1);
7497 if (!result)
7498 return PyErr_NoMemory();
7499 PyObject_Init(result, &EncodingMapType);
7500 mresult = (struct encoding_map*)result;
7501 mresult->count2 = count2;
7502 mresult->count3 = count3;
7503 mlevel1 = mresult->level1;
7504 mlevel2 = mresult->level23;
7505 mlevel3 = mresult->level23 + 16*count2;
7506 memcpy(mlevel1, level1, 32);
7507 memset(mlevel2, 0xFF, 16*count2);
7508 memset(mlevel3, 0, 128*count3);
7509 count3 = 0;
7510 for (i = 1; i < 256; i++) {
7511 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007512 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007513 /* unmapped character */
7514 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007515 o1 = PyUnicode_READ(kind, data, i)>>11;
7516 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007517 i2 = 16*mlevel1[o1] + o2;
7518 if (mlevel2[i2] == 0xFF)
7519 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007520 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007521 i3 = 128*mlevel2[i2] + o3;
7522 mlevel3[i3] = i;
7523 }
7524 return result;
7525}
7526
7527static int
7528encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
7529{
7530 struct encoding_map *map = (struct encoding_map*)mapping;
7531 int l1 = c>>11;
7532 int l2 = (c>>7) & 0xF;
7533 int l3 = c & 0x7F;
7534 int i;
7535
7536#ifdef Py_UNICODE_WIDE
7537 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007538 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007539 }
7540#endif
7541 if (c == 0)
7542 return 0;
7543 /* level 1*/
7544 i = map->level1[l1];
7545 if (i == 0xFF) {
7546 return -1;
7547 }
7548 /* level 2*/
7549 i = map->level23[16*i+l2];
7550 if (i == 0xFF) {
7551 return -1;
7552 }
7553 /* level 3 */
7554 i = map->level23[16*map->count2 + 128*i + l3];
7555 if (i == 0) {
7556 return -1;
7557 }
7558 return i;
7559}
7560
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007561/* Lookup the character ch in the mapping. If the character
7562 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007563 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007564static PyObject *
7565charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007566{
Christian Heimes217cfd12007-12-02 14:31:20 +00007567 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007568 PyObject *x;
7569
7570 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007571 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007572 x = PyObject_GetItem(mapping, w);
7573 Py_DECREF(w);
7574 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007575 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7576 /* No mapping found means: mapping is undefined. */
7577 PyErr_Clear();
7578 x = Py_None;
7579 Py_INCREF(x);
7580 return x;
7581 } else
7582 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007583 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007584 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007585 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007586 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007587 long value = PyLong_AS_LONG(x);
7588 if (value < 0 || value > 255) {
7589 PyErr_SetString(PyExc_TypeError,
7590 "character mapping must be in range(256)");
7591 Py_DECREF(x);
7592 return NULL;
7593 }
7594 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007595 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007596 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007597 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007598 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007599 /* wrong return value */
7600 PyErr_Format(PyExc_TypeError,
7601 "character mapping must return integer, bytes or None, not %.400s",
7602 x->ob_type->tp_name);
7603 Py_DECREF(x);
7604 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007605 }
7606}
7607
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007608static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00007609charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007610{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007611 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7612 /* exponentially overallocate to minimize reallocations */
7613 if (requiredsize < 2*outsize)
7614 requiredsize = 2*outsize;
7615 if (_PyBytes_Resize(outobj, requiredsize))
7616 return -1;
7617 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007618}
7619
/* Outcome of encoding a single character with charmapencode_output. */
typedef enum charmapencode_result {
    enc_SUCCESS,        /* the character was written to the output */
    enc_FAILED,         /* the mapping is undefined for the character */
    enc_EXCEPTION       /* an error occurred; an exception is set */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007623/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00007624 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007625 space is available. Return a new reference to the object that
7626 was put in the output buffer, or Py_None, if the mapping was undefined
7627 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00007628 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007629static charmapencode_result
7630charmapencode_output(Py_UNICODE c, PyObject *mapping,
7631 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007632{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007633 PyObject *rep;
7634 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00007635 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007636
Christian Heimes90aa7642007-12-19 02:45:37 +00007637 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007638 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007639 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007640 if (res == -1)
7641 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00007642 if (outsize<requiredsize)
7643 if (charmapencode_resize(outobj, outpos, requiredsize))
7644 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00007645 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007646 outstart[(*outpos)++] = (char)res;
7647 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007648 }
7649
7650 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007651 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007652 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007653 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007654 Py_DECREF(rep);
7655 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007656 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007657 if (PyLong_Check(rep)) {
7658 Py_ssize_t requiredsize = *outpos+1;
7659 if (outsize<requiredsize)
7660 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7661 Py_DECREF(rep);
7662 return enc_EXCEPTION;
7663 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007664 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007665 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007666 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007667 else {
7668 const char *repchars = PyBytes_AS_STRING(rep);
7669 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
7670 Py_ssize_t requiredsize = *outpos+repsize;
7671 if (outsize<requiredsize)
7672 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7673 Py_DECREF(rep);
7674 return enc_EXCEPTION;
7675 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007676 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007677 memcpy(outstart + *outpos, repchars, repsize);
7678 *outpos += repsize;
7679 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007680 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007681 Py_DECREF(rep);
7682 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007683}
7684
7685/* handle an error in PyUnicode_EncodeCharmap
7686 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007687static int
7688charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00007689 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007690 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00007691 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00007692 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007693{
7694 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007695 Py_ssize_t repsize;
7696 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007697 Py_UNICODE *uni2;
7698 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007699 Py_ssize_t collstartpos = *inpos;
7700 Py_ssize_t collendpos = *inpos+1;
7701 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007702 char *encoding = "charmap";
7703 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007704 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007705
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007706 /* find all unencodable characters */
7707 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007708 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00007709 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007710 int res = encoding_map_lookup(p[collendpos], mapping);
7711 if (res != -1)
7712 break;
7713 ++collendpos;
7714 continue;
7715 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007716
Benjamin Peterson29060642009-01-31 22:14:21 +00007717 rep = charmapencode_lookup(p[collendpos], mapping);
7718 if (rep==NULL)
7719 return -1;
7720 else if (rep!=Py_None) {
7721 Py_DECREF(rep);
7722 break;
7723 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007724 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00007725 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007726 }
7727 /* cache callback name lookup
7728 * (if not done yet, i.e. it's the first error) */
7729 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007730 if ((errors==NULL) || (!strcmp(errors, "strict")))
7731 *known_errorHandler = 1;
7732 else if (!strcmp(errors, "replace"))
7733 *known_errorHandler = 2;
7734 else if (!strcmp(errors, "ignore"))
7735 *known_errorHandler = 3;
7736 else if (!strcmp(errors, "xmlcharrefreplace"))
7737 *known_errorHandler = 4;
7738 else
7739 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007740 }
7741 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007742 case 1: /* strict */
7743 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7744 return -1;
7745 case 2: /* replace */
7746 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007747 x = charmapencode_output('?', mapping, res, respos);
7748 if (x==enc_EXCEPTION) {
7749 return -1;
7750 }
7751 else if (x==enc_FAILED) {
7752 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7753 return -1;
7754 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007755 }
7756 /* fall through */
7757 case 3: /* ignore */
7758 *inpos = collendpos;
7759 break;
7760 case 4: /* xmlcharrefreplace */
7761 /* generate replacement (temporarily (mis)uses p) */
7762 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007763 char buffer[2+29+1+1];
7764 char *cp;
7765 sprintf(buffer, "&#%d;", (int)p[collpos]);
7766 for (cp = buffer; *cp; ++cp) {
7767 x = charmapencode_output(*cp, mapping, res, respos);
7768 if (x==enc_EXCEPTION)
7769 return -1;
7770 else if (x==enc_FAILED) {
7771 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7772 return -1;
7773 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007774 }
7775 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007776 *inpos = collendpos;
7777 break;
7778 default:
7779 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00007780 encoding, reason, p, size, exceptionObject,
7781 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007782 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007783 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00007784 if (PyBytes_Check(repunicode)) {
7785 /* Directly copy bytes result to output. */
7786 Py_ssize_t outsize = PyBytes_Size(*res);
7787 Py_ssize_t requiredsize;
7788 repsize = PyBytes_Size(repunicode);
7789 requiredsize = *respos + repsize;
7790 if (requiredsize > outsize)
7791 /* Make room for all additional bytes. */
7792 if (charmapencode_resize(res, respos, requiredsize)) {
7793 Py_DECREF(repunicode);
7794 return -1;
7795 }
7796 memcpy(PyBytes_AsString(*res) + *respos,
7797 PyBytes_AsString(repunicode), repsize);
7798 *respos += repsize;
7799 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007800 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00007801 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007802 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007803 /* generate replacement */
7804 repsize = PyUnicode_GET_SIZE(repunicode);
7805 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007806 x = charmapencode_output(*uni2, mapping, res, respos);
7807 if (x==enc_EXCEPTION) {
7808 return -1;
7809 }
7810 else if (x==enc_FAILED) {
7811 Py_DECREF(repunicode);
7812 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7813 return -1;
7814 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007815 }
7816 *inpos = newpos;
7817 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007818 }
7819 return 0;
7820}
7821
Alexander Belopolsky40018472011-02-26 01:02:56 +00007822PyObject *
7823PyUnicode_EncodeCharmap(const Py_UNICODE *p,
7824 Py_ssize_t size,
7825 PyObject *mapping,
7826 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007827{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007828 /* output object */
7829 PyObject *res = NULL;
7830 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007831 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007832 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007833 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007834 PyObject *errorHandler = NULL;
7835 PyObject *exc = NULL;
7836 /* the following variable is used for caching string comparisons
7837 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7838 * 3=ignore, 4=xmlcharrefreplace */
7839 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007840
7841 /* Default to Latin-1 */
7842 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007843 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007844
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007845 /* allocate enough for a simple encoding without
7846 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00007847 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007848 if (res == NULL)
7849 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00007850 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007851 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007852
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007853 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007854 /* try to encode it */
7855 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
7856 if (x==enc_EXCEPTION) /* error */
7857 goto onError;
7858 if (x==enc_FAILED) { /* unencodable character */
7859 if (charmap_encoding_error(p, size, &inpos, mapping,
7860 &exc,
7861 &known_errorHandler, &errorHandler, errors,
7862 &res, &respos)) {
7863 goto onError;
7864 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007865 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007866 else
7867 /* done with this character => adjust input position */
7868 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007869 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007870
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007871 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00007872 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007873 if (_PyBytes_Resize(&res, respos) < 0)
7874 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00007875
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007876 Py_XDECREF(exc);
7877 Py_XDECREF(errorHandler);
7878 return res;
7879
Benjamin Peterson29060642009-01-31 22:14:21 +00007880 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007881 Py_XDECREF(res);
7882 Py_XDECREF(exc);
7883 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007884 return NULL;
7885}
7886
Alexander Belopolsky40018472011-02-26 01:02:56 +00007887PyObject *
7888PyUnicode_AsCharmapString(PyObject *unicode,
7889 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007890{
7891 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007892 PyErr_BadArgument();
7893 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007894 }
7895 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00007896 PyUnicode_GET_SIZE(unicode),
7897 mapping,
7898 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007899}
7900
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007901/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007902static void
7903make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007904 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007905 Py_ssize_t startpos, Py_ssize_t endpos,
7906 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007907{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007908 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007909 *exceptionObject = _PyUnicodeTranslateError_Create(
7910 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007911 }
7912 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007913 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
7914 goto onError;
7915 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
7916 goto onError;
7917 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
7918 goto onError;
7919 return;
7920 onError:
7921 Py_DECREF(*exceptionObject);
7922 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007923 }
7924}
7925
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007926/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007927static void
7928raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007929 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007930 Py_ssize_t startpos, Py_ssize_t endpos,
7931 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007932{
7933 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007934 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007935 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007936 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007937}
7938
7939/* error handling callback helper:
7940 build arguments, call the callback and check the arguments,
7941 put the result into newpos and return the replacement string, which
7942 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007943static PyObject *
7944unicode_translate_call_errorhandler(const char *errors,
7945 PyObject **errorHandler,
7946 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007947 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007948 Py_ssize_t startpos, Py_ssize_t endpos,
7949 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007950{
Benjamin Peterson142957c2008-07-04 19:55:29 +00007951 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007952
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007953 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007954 PyObject *restuple;
7955 PyObject *resunicode;
7956
7957 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007958 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007959 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007960 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007961 }
7962
7963 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007964 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007965 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007966 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007967
7968 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00007969 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007970 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007971 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007972 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00007973 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00007974 Py_DECREF(restuple);
7975 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007976 }
7977 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00007978 &resunicode, &i_newpos)) {
7979 Py_DECREF(restuple);
7980 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007981 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007982 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007983 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007984 else
7985 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007986 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007987 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7988 Py_DECREF(restuple);
7989 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00007990 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007991 Py_INCREF(resunicode);
7992 Py_DECREF(restuple);
7993 return resunicode;
7994}
7995
7996/* Lookup the character ch in the mapping and put the result in result,
7997 which must be decrefed by the caller.
7998 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007999static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008000charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008001{
Christian Heimes217cfd12007-12-02 14:31:20 +00008002 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008003 PyObject *x;
8004
8005 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008006 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008007 x = PyObject_GetItem(mapping, w);
8008 Py_DECREF(w);
8009 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008010 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8011 /* No mapping found means: use 1:1 mapping. */
8012 PyErr_Clear();
8013 *result = NULL;
8014 return 0;
8015 } else
8016 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008017 }
8018 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008019 *result = x;
8020 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008021 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008022 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008023 long value = PyLong_AS_LONG(x);
8024 long max = PyUnicode_GetMax();
8025 if (value < 0 || value > max) {
8026 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008027 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008028 Py_DECREF(x);
8029 return -1;
8030 }
8031 *result = x;
8032 return 0;
8033 }
8034 else if (PyUnicode_Check(x)) {
8035 *result = x;
8036 return 0;
8037 }
8038 else {
8039 /* wrong return value */
8040 PyErr_SetString(PyExc_TypeError,
8041 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008042 Py_DECREF(x);
8043 return -1;
8044 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008045}
8046/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008047 if not reallocate and adjust various state variables.
8048 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008049static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008050charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008051 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008052{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008053 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00008054 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008055 /* exponentially overallocate to minimize reallocations */
8056 if (requiredsize < 2 * oldsize)
8057 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008058 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8059 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008060 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008061 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008062 }
8063 return 0;
8064}
8065/* lookup the character, put the result in the output string and adjust
8066 various state variables. Return a new reference to the object that
8067 was put in the output buffer in *result, or Py_None, if the mapping was
8068 undefined (in which case no character was written).
8069 The called must decref result.
8070 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008071static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008072charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8073 PyObject *mapping, Py_UCS4 **output,
8074 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008075 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008076{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008077 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8078 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008079 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008080 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008081 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008082 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008083 }
8084 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008085 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008086 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008087 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008088 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008089 }
8090 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008091 Py_ssize_t repsize;
8092 if (PyUnicode_READY(*res) == -1)
8093 return -1;
8094 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008095 if (repsize==1) {
8096 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008097 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008098 }
8099 else if (repsize!=0) {
8100 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008101 Py_ssize_t requiredsize = *opos +
8102 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008103 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008104 Py_ssize_t i;
8105 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008106 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008107 for(i = 0; i < repsize; i++)
8108 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008109 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008110 }
8111 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008112 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008113 return 0;
8114}
8115
Alexander Belopolsky40018472011-02-26 01:02:56 +00008116PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008117_PyUnicode_TranslateCharmap(PyObject *input,
8118 PyObject *mapping,
8119 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008120{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008121 /* input object */
8122 char *idata;
8123 Py_ssize_t size, i;
8124 int kind;
8125 /* output buffer */
8126 Py_UCS4 *output = NULL;
8127 Py_ssize_t osize;
8128 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008129 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008130 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008131 char *reason = "character maps to <undefined>";
8132 PyObject *errorHandler = NULL;
8133 PyObject *exc = NULL;
8134 /* the following variable is used for caching string comparisons
8135 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8136 * 3=ignore, 4=xmlcharrefreplace */
8137 int known_errorHandler = -1;
8138
Guido van Rossumd57fd912000-03-10 22:53:23 +00008139 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008140 PyErr_BadArgument();
8141 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008142 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008143
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008144 if (PyUnicode_READY(input) == -1)
8145 return NULL;
8146 idata = (char*)PyUnicode_DATA(input);
8147 kind = PyUnicode_KIND(input);
8148 size = PyUnicode_GET_LENGTH(input);
8149 i = 0;
8150
8151 if (size == 0) {
8152 Py_INCREF(input);
8153 return input;
8154 }
8155
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008156 /* allocate enough for a simple 1:1 translation without
8157 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008158 osize = size;
8159 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8160 opos = 0;
8161 if (output == NULL) {
8162 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008163 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008164 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008165
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008166 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008167 /* try to encode it */
8168 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008169 if (charmaptranslate_output(input, i, mapping,
8170 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008171 Py_XDECREF(x);
8172 goto onError;
8173 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008174 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008175 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008176 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008177 else { /* untranslatable character */
8178 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8179 Py_ssize_t repsize;
8180 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008181 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008182 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008183 Py_ssize_t collstart = i;
8184 Py_ssize_t collend = i+1;
8185 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008186
Benjamin Peterson29060642009-01-31 22:14:21 +00008187 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008188 while (collend < size) {
8189 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008190 goto onError;
8191 Py_XDECREF(x);
8192 if (x!=Py_None)
8193 break;
8194 ++collend;
8195 }
8196 /* cache callback name lookup
8197 * (if not done yet, i.e. it's the first error) */
8198 if (known_errorHandler==-1) {
8199 if ((errors==NULL) || (!strcmp(errors, "strict")))
8200 known_errorHandler = 1;
8201 else if (!strcmp(errors, "replace"))
8202 known_errorHandler = 2;
8203 else if (!strcmp(errors, "ignore"))
8204 known_errorHandler = 3;
8205 else if (!strcmp(errors, "xmlcharrefreplace"))
8206 known_errorHandler = 4;
8207 else
8208 known_errorHandler = 0;
8209 }
8210 switch (known_errorHandler) {
8211 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008212 raise_translate_exception(&exc, input, collstart,
8213 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008214 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008215 case 2: /* replace */
8216 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008217 for (coll = collstart; coll<collend; coll++)
8218 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008219 /* fall through */
8220 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008221 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008222 break;
8223 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008224 /* generate replacement (temporarily (mis)uses i) */
8225 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008226 char buffer[2+29+1+1];
8227 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008228 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8229 if (charmaptranslate_makespace(&output, &osize,
8230 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008231 goto onError;
8232 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008233 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008234 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008235 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008236 break;
8237 default:
8238 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008239 reason, input, &exc,
8240 collstart, collend, &newpos);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02008241 if (repunicode == NULL || _PyUnicode_READY_REPLACE(&repunicode))
Benjamin Peterson29060642009-01-31 22:14:21 +00008242 goto onError;
8243 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008244 repsize = PyUnicode_GET_LENGTH(repunicode);
8245 if (charmaptranslate_makespace(&output, &osize,
8246 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008247 Py_DECREF(repunicode);
8248 goto onError;
8249 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008250 for (uni2 = 0; repsize-->0; ++uni2)
8251 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8252 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008253 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008254 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008255 }
8256 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008257 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8258 if (!res)
8259 goto onError;
8260 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008261 Py_XDECREF(exc);
8262 Py_XDECREF(errorHandler);
8263 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008264
Benjamin Peterson29060642009-01-31 22:14:21 +00008265 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008266 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008267 Py_XDECREF(exc);
8268 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008269 return NULL;
8270}
8271
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008272/* Deprecated. Use PyUnicode_Translate instead. */
8273PyObject *
8274PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8275 Py_ssize_t size,
8276 PyObject *mapping,
8277 const char *errors)
8278{
8279 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8280 if (!unicode)
8281 return NULL;
8282 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8283}
8284
Alexander Belopolsky40018472011-02-26 01:02:56 +00008285PyObject *
8286PyUnicode_Translate(PyObject *str,
8287 PyObject *mapping,
8288 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008289{
8290 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008291
Guido van Rossumd57fd912000-03-10 22:53:23 +00008292 str = PyUnicode_FromObject(str);
8293 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008294 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008295 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008296 Py_DECREF(str);
8297 return result;
Tim Petersced69f82003-09-16 20:30:58 +00008298
Benjamin Peterson29060642009-01-31 22:14:21 +00008299 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008300 Py_XDECREF(str);
8301 return NULL;
8302}
Tim Petersced69f82003-09-16 20:30:58 +00008303
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008304static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008305fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008306{
8307 /* No need to call PyUnicode_READY(self) because this function is only
8308 called as a callback from fixup() which does it already. */
8309 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8310 const int kind = PyUnicode_KIND(self);
8311 void *data = PyUnicode_DATA(self);
8312 Py_UCS4 maxchar = 0, ch, fixed;
8313 Py_ssize_t i;
8314
8315 for (i = 0; i < len; ++i) {
8316 ch = PyUnicode_READ(kind, data, i);
8317 fixed = 0;
8318 if (ch > 127) {
8319 if (Py_UNICODE_ISSPACE(ch))
8320 fixed = ' ';
8321 else {
8322 const int decimal = Py_UNICODE_TODECIMAL(ch);
8323 if (decimal >= 0)
8324 fixed = '0' + decimal;
8325 }
8326 if (fixed != 0) {
8327 if (fixed > maxchar)
8328 maxchar = fixed;
8329 PyUnicode_WRITE(kind, data, i, fixed);
8330 }
8331 else if (ch > maxchar)
8332 maxchar = ch;
8333 }
8334 else if (ch > maxchar)
8335 maxchar = ch;
8336 }
8337
8338 return maxchar;
8339}
8340
8341PyObject *
8342_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8343{
8344 if (!PyUnicode_Check(unicode)) {
8345 PyErr_BadInternalCall();
8346 return NULL;
8347 }
8348 if (PyUnicode_READY(unicode) == -1)
8349 return NULL;
8350 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8351 /* If the string is already ASCII, just return the same string */
8352 Py_INCREF(unicode);
8353 return unicode;
8354 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008355 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008356}
8357
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008358PyObject *
8359PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8360 Py_ssize_t length)
8361{
8362 PyObject *result;
8363 Py_UNICODE *p; /* write pointer into result */
8364 Py_ssize_t i;
8365 /* Copy to a new string */
8366 result = (PyObject *)_PyUnicode_New(length);
8367 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
8368 if (result == NULL)
8369 return result;
8370 p = PyUnicode_AS_UNICODE(result);
8371 /* Iterate over code points */
8372 for (i = 0; i < length; i++) {
8373 Py_UNICODE ch =s[i];
8374 if (ch > 127) {
8375 int decimal = Py_UNICODE_TODECIMAL(ch);
8376 if (decimal >= 0)
8377 p[i] = '0' + decimal;
8378 }
8379 }
Victor Stinner17efeed2011-10-04 20:05:46 +02008380#ifndef DONT_MAKE_RESULT_READY
8381 if (_PyUnicode_READY_REPLACE(&result)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008382 Py_DECREF(result);
8383 return NULL;
8384 }
Victor Stinner17efeed2011-10-04 20:05:46 +02008385#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02008386 assert(_PyUnicode_CheckConsistency(result, 1));
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008387 return result;
8388}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008389/* --- Decimal Encoder ---------------------------------------------------- */
8390
Alexander Belopolsky40018472011-02-26 01:02:56 +00008391int
8392PyUnicode_EncodeDecimal(Py_UNICODE *s,
8393 Py_ssize_t length,
8394 char *output,
8395 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008396{
8397 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008398 PyObject *errorHandler = NULL;
8399 PyObject *exc = NULL;
8400 const char *encoding = "decimal";
8401 const char *reason = "invalid decimal Unicode string";
8402 /* the following variable is used for caching string comparisons
8403 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
8404 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008405
8406 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008407 PyErr_BadArgument();
8408 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008409 }
8410
8411 p = s;
8412 end = s + length;
8413 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008414 register Py_UNICODE ch = *p;
8415 int decimal;
8416 PyObject *repunicode;
8417 Py_ssize_t repsize;
8418 Py_ssize_t newpos;
8419 Py_UNICODE *uni2;
8420 Py_UNICODE *collstart;
8421 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00008422
Benjamin Peterson29060642009-01-31 22:14:21 +00008423 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008424 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00008425 ++p;
8426 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008427 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008428 decimal = Py_UNICODE_TODECIMAL(ch);
8429 if (decimal >= 0) {
8430 *output++ = '0' + decimal;
8431 ++p;
8432 continue;
8433 }
8434 if (0 < ch && ch < 256) {
8435 *output++ = (char)ch;
8436 ++p;
8437 continue;
8438 }
8439 /* All other characters are considered unencodable */
8440 collstart = p;
8441 collend = p+1;
8442 while (collend < end) {
8443 if ((0 < *collend && *collend < 256) ||
8444 !Py_UNICODE_ISSPACE(*collend) ||
8445 Py_UNICODE_TODECIMAL(*collend))
8446 break;
8447 }
8448 /* cache callback name lookup
8449 * (if not done yet, i.e. it's the first error) */
8450 if (known_errorHandler==-1) {
8451 if ((errors==NULL) || (!strcmp(errors, "strict")))
8452 known_errorHandler = 1;
8453 else if (!strcmp(errors, "replace"))
8454 known_errorHandler = 2;
8455 else if (!strcmp(errors, "ignore"))
8456 known_errorHandler = 3;
8457 else if (!strcmp(errors, "xmlcharrefreplace"))
8458 known_errorHandler = 4;
8459 else
8460 known_errorHandler = 0;
8461 }
8462 switch (known_errorHandler) {
8463 case 1: /* strict */
8464 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
8465 goto onError;
8466 case 2: /* replace */
8467 for (p = collstart; p < collend; ++p)
8468 *output++ = '?';
8469 /* fall through */
8470 case 3: /* ignore */
8471 p = collend;
8472 break;
8473 case 4: /* xmlcharrefreplace */
8474 /* generate replacement (temporarily (mis)uses p) */
8475 for (p = collstart; p < collend; ++p)
8476 output += sprintf(output, "&#%d;", (int)*p);
8477 p = collend;
8478 break;
8479 default:
8480 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
8481 encoding, reason, s, length, &exc,
8482 collstart-s, collend-s, &newpos);
8483 if (repunicode == NULL)
8484 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008485 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00008486 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008487 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
8488 Py_DECREF(repunicode);
8489 goto onError;
8490 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008491 /* generate replacement */
8492 repsize = PyUnicode_GET_SIZE(repunicode);
8493 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
8494 Py_UNICODE ch = *uni2;
8495 if (Py_UNICODE_ISSPACE(ch))
8496 *output++ = ' ';
8497 else {
8498 decimal = Py_UNICODE_TODECIMAL(ch);
8499 if (decimal >= 0)
8500 *output++ = '0' + decimal;
8501 else if (0 < ch && ch < 256)
8502 *output++ = (char)ch;
8503 else {
8504 Py_DECREF(repunicode);
8505 raise_encode_exception(&exc, encoding,
8506 s, length, collstart-s, collend-s, reason);
8507 goto onError;
8508 }
8509 }
8510 }
8511 p = s + newpos;
8512 Py_DECREF(repunicode);
8513 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00008514 }
8515 /* 0-terminate the output string */
8516 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008517 Py_XDECREF(exc);
8518 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008519 return 0;
8520
Benjamin Peterson29060642009-01-31 22:14:21 +00008521 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008522 Py_XDECREF(exc);
8523 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008524 return -1;
8525}
8526
Guido van Rossumd57fd912000-03-10 22:53:23 +00008527/* --- Helpers ------------------------------------------------------------ */
8528
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008529static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02008530any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008531 Py_ssize_t start,
8532 Py_ssize_t end)
8533{
8534 int kind1, kind2, kind;
8535 void *buf1, *buf2;
8536 Py_ssize_t len1, len2, result;
8537
8538 kind1 = PyUnicode_KIND(s1);
8539 kind2 = PyUnicode_KIND(s2);
8540 kind = kind1 > kind2 ? kind1 : kind2;
8541 buf1 = PyUnicode_DATA(s1);
8542 buf2 = PyUnicode_DATA(s2);
8543 if (kind1 != kind)
8544 buf1 = _PyUnicode_AsKind(s1, kind);
8545 if (!buf1)
8546 return -2;
8547 if (kind2 != kind)
8548 buf2 = _PyUnicode_AsKind(s2, kind);
8549 if (!buf2) {
8550 if (kind1 != kind) PyMem_Free(buf1);
8551 return -2;
8552 }
8553 len1 = PyUnicode_GET_LENGTH(s1);
8554 len2 = PyUnicode_GET_LENGTH(s2);
8555
Victor Stinner794d5672011-10-10 03:21:36 +02008556 if (direction > 0) {
8557 switch(kind) {
8558 case PyUnicode_1BYTE_KIND:
8559 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8560 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
8561 else
8562 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
8563 break;
8564 case PyUnicode_2BYTE_KIND:
8565 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
8566 break;
8567 case PyUnicode_4BYTE_KIND:
8568 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
8569 break;
8570 default:
8571 assert(0); result = -2;
8572 }
8573 }
8574 else {
8575 switch(kind) {
8576 case PyUnicode_1BYTE_KIND:
8577 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8578 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
8579 else
8580 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8581 break;
8582 case PyUnicode_2BYTE_KIND:
8583 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8584 break;
8585 case PyUnicode_4BYTE_KIND:
8586 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8587 break;
8588 default:
8589 assert(0); result = -2;
8590 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008591 }
8592
8593 if (kind1 != kind)
8594 PyMem_Free(buf1);
8595 if (kind2 != kind)
8596 PyMem_Free(buf2);
8597
8598 return result;
8599}
8600
8601Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02008602_PyUnicode_InsertThousandsGrouping(PyObject *unicode, int kind, void *data,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008603 Py_ssize_t n_buffer,
8604 void *digits, Py_ssize_t n_digits,
8605 Py_ssize_t min_width,
8606 const char *grouping,
8607 const char *thousands_sep)
8608{
8609 switch(kind) {
8610 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008611 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
8612 return _PyUnicode_ascii_InsertThousandsGrouping(
8613 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8614 min_width, grouping, thousands_sep);
8615 else
8616 return _PyUnicode_ucs1_InsertThousandsGrouping(
8617 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8618 min_width, grouping, thousands_sep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008619 case PyUnicode_2BYTE_KIND:
8620 return _PyUnicode_ucs2_InsertThousandsGrouping(
8621 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
8622 min_width, grouping, thousands_sep);
8623 case PyUnicode_4BYTE_KIND:
8624 return _PyUnicode_ucs4_InsertThousandsGrouping(
8625 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
8626 min_width, grouping, thousands_sep);
8627 }
8628 assert(0);
8629 return -1;
8630}
8631
8632
Eric Smith8c663262007-08-25 02:26:07 +00008633#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00008634#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008635
Thomas Wouters477c8d52006-05-27 19:21:47 +00008636#include "stringlib/count.h"
8637#include "stringlib/find.h"
Eric Smith5807c412008-05-11 21:00:57 +00008638
/* Helper macro to fix up start/end slice values against a length `len`:
   `end` is clipped to len, negative values are taken relative to the end
   and then clipped to 0.  `start` and `end` are modified in place and, like
   `len`, may be evaluated more than once.

   Wrapped in do { } while (0) so that the macro behaves as one statement
   (e.g. directly under an `if` that has an `else`); use it with a trailing
   semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00008653
Alexander Belopolsky40018472011-02-26 01:02:56 +00008654Py_ssize_t
8655PyUnicode_Count(PyObject *str,
8656 PyObject *substr,
8657 Py_ssize_t start,
8658 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008659{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008660 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008661 PyUnicodeObject* str_obj;
8662 PyUnicodeObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008663 int kind1, kind2, kind;
8664 void *buf1 = NULL, *buf2 = NULL;
8665 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00008666
Thomas Wouters477c8d52006-05-27 19:21:47 +00008667 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008668 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008669 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008670 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
Victor Stinnere9a29352011-10-01 02:14:59 +02008671 if (!sub_obj || PyUnicode_READY(sub_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008672 Py_DECREF(str_obj);
8673 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008674 }
Tim Petersced69f82003-09-16 20:30:58 +00008675
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008676 kind1 = PyUnicode_KIND(str_obj);
8677 kind2 = PyUnicode_KIND(sub_obj);
8678 kind = kind1 > kind2 ? kind1 : kind2;
8679 buf1 = PyUnicode_DATA(str_obj);
8680 if (kind1 != kind)
8681 buf1 = _PyUnicode_AsKind((PyObject*)str_obj, kind);
8682 if (!buf1)
8683 goto onError;
8684 buf2 = PyUnicode_DATA(sub_obj);
8685 if (kind2 != kind)
8686 buf2 = _PyUnicode_AsKind((PyObject*)sub_obj, kind);
8687 if (!buf2)
8688 goto onError;
8689 len1 = PyUnicode_GET_LENGTH(str_obj);
8690 len2 = PyUnicode_GET_LENGTH(sub_obj);
8691
8692 ADJUST_INDICES(start, end, len1);
8693 switch(kind) {
8694 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008695 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
8696 result = asciilib_count(
8697 ((Py_UCS1*)buf1) + start, end - start,
8698 buf2, len2, PY_SSIZE_T_MAX
8699 );
8700 else
8701 result = ucs1lib_count(
8702 ((Py_UCS1*)buf1) + start, end - start,
8703 buf2, len2, PY_SSIZE_T_MAX
8704 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008705 break;
8706 case PyUnicode_2BYTE_KIND:
8707 result = ucs2lib_count(
8708 ((Py_UCS2*)buf1) + start, end - start,
8709 buf2, len2, PY_SSIZE_T_MAX
8710 );
8711 break;
8712 case PyUnicode_4BYTE_KIND:
8713 result = ucs4lib_count(
8714 ((Py_UCS4*)buf1) + start, end - start,
8715 buf2, len2, PY_SSIZE_T_MAX
8716 );
8717 break;
8718 default:
8719 assert(0); result = 0;
8720 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00008721
8722 Py_DECREF(sub_obj);
8723 Py_DECREF(str_obj);
8724
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008725 if (kind1 != kind)
8726 PyMem_Free(buf1);
8727 if (kind2 != kind)
8728 PyMem_Free(buf2);
8729
Guido van Rossumd57fd912000-03-10 22:53:23 +00008730 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008731 onError:
8732 Py_DECREF(sub_obj);
8733 Py_DECREF(str_obj);
8734 if (kind1 != kind && buf1)
8735 PyMem_Free(buf1);
8736 if (kind2 != kind && buf2)
8737 PyMem_Free(buf2);
8738 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008739}
8740
Alexander Belopolsky40018472011-02-26 01:02:56 +00008741Py_ssize_t
8742PyUnicode_Find(PyObject *str,
8743 PyObject *sub,
8744 Py_ssize_t start,
8745 Py_ssize_t end,
8746 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008747{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008748 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008749
Guido van Rossumd57fd912000-03-10 22:53:23 +00008750 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008751 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008752 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008753 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008754 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008755 Py_DECREF(str);
8756 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008757 }
Tim Petersced69f82003-09-16 20:30:58 +00008758
Victor Stinner794d5672011-10-10 03:21:36 +02008759 result = any_find_slice(direction,
8760 str, sub, start, end
8761 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00008762
Guido van Rossumd57fd912000-03-10 22:53:23 +00008763 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008764 Py_DECREF(sub);
8765
Guido van Rossumd57fd912000-03-10 22:53:23 +00008766 return result;
8767}
8768
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008769Py_ssize_t
8770PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
8771 Py_ssize_t start, Py_ssize_t end,
8772 int direction)
8773{
8774 char *result;
8775 int kind;
8776 if (PyUnicode_READY(str) == -1)
8777 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02008778 if (start < 0 || end < 0) {
8779 PyErr_SetString(PyExc_IndexError, "string index out of range");
8780 return -2;
8781 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008782 if (end > PyUnicode_GET_LENGTH(str))
8783 end = PyUnicode_GET_LENGTH(str);
8784 kind = PyUnicode_KIND(str);
8785 result = findchar(PyUnicode_1BYTE_DATA(str)
Martin v. Löwisc47adb02011-10-07 20:55:35 +02008786 + kind*start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008787 kind,
8788 end-start, ch, direction);
8789 if (!result)
8790 return -1;
8791 return (result-(char*)PyUnicode_DATA(str)) >> (kind-1);
8792}
8793
Alexander Belopolsky40018472011-02-26 01:02:56 +00008794static int
8795tailmatch(PyUnicodeObject *self,
8796 PyUnicodeObject *substring,
8797 Py_ssize_t start,
8798 Py_ssize_t end,
8799 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008800{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008801 int kind_self;
8802 int kind_sub;
8803 void *data_self;
8804 void *data_sub;
8805 Py_ssize_t offset;
8806 Py_ssize_t i;
8807 Py_ssize_t end_sub;
8808
8809 if (PyUnicode_READY(self) == -1 ||
8810 PyUnicode_READY(substring) == -1)
8811 return 0;
8812
8813 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008814 return 1;
8815
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008816 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
8817 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008818 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00008819 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008820
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008821 kind_self = PyUnicode_KIND(self);
8822 data_self = PyUnicode_DATA(self);
8823 kind_sub = PyUnicode_KIND(substring);
8824 data_sub = PyUnicode_DATA(substring);
8825 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
8826
8827 if (direction > 0)
8828 offset = end;
8829 else
8830 offset = start;
8831
8832 if (PyUnicode_READ(kind_self, data_self, offset) ==
8833 PyUnicode_READ(kind_sub, data_sub, 0) &&
8834 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
8835 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
8836 /* If both are of the same kind, memcmp is sufficient */
8837 if (kind_self == kind_sub) {
8838 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02008839 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008840 data_sub,
8841 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02008842 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008843 }
8844 /* otherwise we have to compare each character by first accesing it */
8845 else {
8846 /* We do not need to compare 0 and len(substring)-1 because
8847 the if statement above ensured already that they are equal
8848 when we end up here. */
8849 // TODO: honor direction and do a forward or backwards search
8850 for (i = 1; i < end_sub; ++i) {
8851 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
8852 PyUnicode_READ(kind_sub, data_sub, i))
8853 return 0;
8854 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008855 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008856 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008857 }
8858
8859 return 0;
8860}
8861
Alexander Belopolsky40018472011-02-26 01:02:56 +00008862Py_ssize_t
8863PyUnicode_Tailmatch(PyObject *str,
8864 PyObject *substr,
8865 Py_ssize_t start,
8866 Py_ssize_t end,
8867 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008868{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008869 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008870
Guido van Rossumd57fd912000-03-10 22:53:23 +00008871 str = PyUnicode_FromObject(str);
8872 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008873 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008874 substr = PyUnicode_FromObject(substr);
8875 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008876 Py_DECREF(str);
8877 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008878 }
Tim Petersced69f82003-09-16 20:30:58 +00008879
Guido van Rossumd57fd912000-03-10 22:53:23 +00008880 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00008881 (PyUnicodeObject *)substr,
8882 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008883 Py_DECREF(str);
8884 Py_DECREF(substr);
8885 return result;
8886}
8887
Guido van Rossumd57fd912000-03-10 22:53:23 +00008888/* Apply fixfct filter to the Unicode object self and return a
8889 reference to the modified object */
8890
Alexander Belopolsky40018472011-02-26 01:02:56 +00008891static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02008892fixup(PyObject *self,
8893 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008894{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008895 PyObject *u;
8896 Py_UCS4 maxchar_old, maxchar_new = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008897
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008898 if (PyUnicode_READY(self) == -1)
8899 return NULL;
8900 maxchar_old = PyUnicode_MAX_CHAR_VALUE(self);
8901 u = PyUnicode_New(PyUnicode_GET_LENGTH(self),
8902 maxchar_old);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008903 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008904 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008905
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008906 Py_MEMCPY(PyUnicode_1BYTE_DATA(u), PyUnicode_1BYTE_DATA(self),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02008907 PyUnicode_GET_LENGTH(u) * PyUnicode_KIND(u));
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008908
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008909 /* fix functions return the new maximum character in a string,
8910 if the kind of the resulting unicode object does not change,
8911 everything is fine. Otherwise we need to change the string kind
8912 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02008913 maxchar_new = fixfct(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008914 if (maxchar_new == 0)
8915 /* do nothing, keep maxchar_new at 0 which means no changes. */;
8916 else if (maxchar_new <= 127)
8917 maxchar_new = 127;
8918 else if (maxchar_new <= 255)
8919 maxchar_new = 255;
8920 else if (maxchar_new <= 65535)
8921 maxchar_new = 65535;
8922 else
8923 maxchar_new = 1114111; /* 0x10ffff */
8924
8925 if (!maxchar_new && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008926 /* fixfct should return TRUE if it modified the buffer. If
8927 FALSE, return a reference to the original buffer instead
8928 (to save space, not time) */
8929 Py_INCREF(self);
8930 Py_DECREF(u);
8931 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008932 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008933 else if (maxchar_new == maxchar_old) {
8934 return u;
8935 }
8936 else {
8937 /* In case the maximum character changed, we need to
8938 convert the string to the new category. */
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008939 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008940 if (v == NULL) {
8941 Py_DECREF(u);
8942 return NULL;
8943 }
8944 if (maxchar_new > maxchar_old) {
8945 /* If the maxchar increased so that the kind changed, not all
8946 characters are representable anymore and we need to fix the
8947 string again. This only happens in very few cases. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02008948 copy_characters(v, 0, self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinner9310abb2011-10-05 00:59:23 +02008949 maxchar_old = fixfct(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008950 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
8951 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008952 else {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02008953 copy_characters(v, 0, u, 0, PyUnicode_GET_LENGTH(self));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008954 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008955
8956 Py_DECREF(u);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02008957 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008958 return v;
8959 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008960}
8961
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008962static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008963fixupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008964{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008965 /* No need to call PyUnicode_READY(self) because this function is only
8966 called as a callback from fixup() which does it already. */
8967 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8968 const int kind = PyUnicode_KIND(self);
8969 void *data = PyUnicode_DATA(self);
8970 int touched = 0;
8971 Py_UCS4 maxchar = 0;
8972 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008973
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008974 for (i = 0; i < len; ++i) {
8975 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8976 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
8977 if (up != ch) {
8978 if (up > maxchar)
8979 maxchar = up;
8980 PyUnicode_WRITE(kind, data, i, up);
8981 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008982 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008983 else if (ch > maxchar)
8984 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008985 }
8986
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008987 if (touched)
8988 return maxchar;
8989 else
8990 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008991}
8992
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008993static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008994fixlower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008995{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008996 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8997 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8998 const int kind = PyUnicode_KIND(self);
8999 void *data = PyUnicode_DATA(self);
9000 int touched = 0;
9001 Py_UCS4 maxchar = 0;
9002 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009003
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009004 for(i = 0; i < len; ++i) {
9005 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9006 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
9007 if (lo != ch) {
9008 if (lo > maxchar)
9009 maxchar = lo;
9010 PyUnicode_WRITE(kind, data, i, lo);
9011 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00009012 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009013 else if (ch > maxchar)
9014 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009015 }
9016
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009017 if (touched)
9018 return maxchar;
9019 else
9020 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009021}
9022
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009023static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009024fixswapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009025{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009026 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9027 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9028 const int kind = PyUnicode_KIND(self);
9029 void *data = PyUnicode_DATA(self);
9030 int touched = 0;
9031 Py_UCS4 maxchar = 0;
9032 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009033
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009034 for(i = 0; i < len; ++i) {
9035 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9036 Py_UCS4 nu = 0;
9037
9038 if (Py_UNICODE_ISUPPER(ch))
9039 nu = Py_UNICODE_TOLOWER(ch);
9040 else if (Py_UNICODE_ISLOWER(ch))
9041 nu = Py_UNICODE_TOUPPER(ch);
9042
9043 if (nu != 0) {
9044 if (nu > maxchar)
9045 maxchar = nu;
9046 PyUnicode_WRITE(kind, data, i, nu);
9047 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009048 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009049 else if (ch > maxchar)
9050 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009051 }
9052
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009053 if (touched)
9054 return maxchar;
9055 else
9056 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009057}
9058
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009059static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009060fixcapitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009061{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009062 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9063 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9064 const int kind = PyUnicode_KIND(self);
9065 void *data = PyUnicode_DATA(self);
9066 int touched = 0;
9067 Py_UCS4 maxchar = 0;
9068 Py_ssize_t i = 0;
9069 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00009070
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009071 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009072 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009073
9074 ch = PyUnicode_READ(kind, data, i);
9075 if (!Py_UNICODE_ISUPPER(ch)) {
9076 maxchar = Py_UNICODE_TOUPPER(ch);
9077 PyUnicode_WRITE(kind, data, i, maxchar);
9078 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009079 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009080 ++i;
9081 for(; i < len; ++i) {
9082 ch = PyUnicode_READ(kind, data, i);
9083 if (!Py_UNICODE_ISLOWER(ch)) {
9084 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
9085 if (lo > maxchar)
9086 maxchar = lo;
9087 PyUnicode_WRITE(kind, data, i, lo);
9088 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009089 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009090 else if (ch > maxchar)
9091 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009092 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009093
9094 if (touched)
9095 return maxchar;
9096 else
9097 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009098}
9099
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009100static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009101fixtitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009102{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009103 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9104 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9105 const int kind = PyUnicode_KIND(self);
9106 void *data = PyUnicode_DATA(self);
9107 Py_UCS4 maxchar = 0;
9108 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009109 int previous_is_cased;
9110
9111 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009112 if (len == 1) {
9113 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9114 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
9115 if (ti != ch) {
9116 PyUnicode_WRITE(kind, data, i, ti);
9117 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00009118 }
9119 else
9120 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009121 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009122 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009123 for(; i < len; ++i) {
9124 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9125 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00009126
Benjamin Peterson29060642009-01-31 22:14:21 +00009127 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009128 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00009129 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009130 nu = Py_UNICODE_TOTITLE(ch);
9131
9132 if (nu > maxchar)
9133 maxchar = nu;
9134 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00009135
Benjamin Peterson29060642009-01-31 22:14:21 +00009136 if (Py_UNICODE_ISLOWER(ch) ||
9137 Py_UNICODE_ISUPPER(ch) ||
9138 Py_UNICODE_ISTITLE(ch))
9139 previous_is_cased = 1;
9140 else
9141 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009142 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009143 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009144}
9145
Tim Peters8ce9f162004-08-27 01:49:32 +00009146PyObject *
9147PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009148{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009149 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009150 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009151 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009152 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009153 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9154 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009155 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009156 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009157 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009158 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009159 int use_memcpy;
9160 unsigned char *res_data = NULL, *sep_data = NULL;
9161 PyObject *last_obj;
9162 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009163
Tim Peters05eba1f2004-08-27 21:32:02 +00009164 fseq = PySequence_Fast(seq, "");
9165 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009166 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009167 }
9168
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009169 /* NOTE: the following code can't call back into Python code,
9170 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009171 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009172
Tim Peters05eba1f2004-08-27 21:32:02 +00009173 seqlen = PySequence_Fast_GET_SIZE(fseq);
9174 /* If empty sequence, return u"". */
9175 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009176 Py_DECREF(fseq);
9177 Py_INCREF(unicode_empty);
9178 res = unicode_empty;
9179 return res;
Tim Peters05eba1f2004-08-27 21:32:02 +00009180 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009181
Tim Peters05eba1f2004-08-27 21:32:02 +00009182 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009183 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009184 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009185 if (seqlen == 1) {
9186 if (PyUnicode_CheckExact(items[0])) {
9187 res = items[0];
9188 Py_INCREF(res);
9189 Py_DECREF(fseq);
9190 return res;
9191 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009192 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009193 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009194 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009195 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009196 /* Set up sep and seplen */
9197 if (separator == NULL) {
9198 /* fall back to a blank space separator */
9199 sep = PyUnicode_FromOrdinal(' ');
9200 if (!sep)
9201 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009202 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009203 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009204 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009205 else {
9206 if (!PyUnicode_Check(separator)) {
9207 PyErr_Format(PyExc_TypeError,
9208 "separator: expected str instance,"
9209 " %.80s found",
9210 Py_TYPE(separator)->tp_name);
9211 goto onError;
9212 }
9213 if (PyUnicode_READY(separator))
9214 goto onError;
9215 sep = separator;
9216 seplen = PyUnicode_GET_LENGTH(separator);
9217 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9218 /* inc refcount to keep this code path symmetric with the
9219 above case of a blank separator */
9220 Py_INCREF(sep);
9221 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009222 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009223 }
9224
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009225 /* There are at least two things to join, or else we have a subclass
9226 * of str in the sequence.
9227 * Do a pre-pass to figure out the total amount of space we'll
9228 * need (sz), and see whether all argument are strings.
9229 */
9230 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009231#ifdef Py_DEBUG
9232 use_memcpy = 0;
9233#else
9234 use_memcpy = 1;
9235#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009236 for (i = 0; i < seqlen; i++) {
9237 const Py_ssize_t old_sz = sz;
9238 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009239 if (!PyUnicode_Check(item)) {
9240 PyErr_Format(PyExc_TypeError,
9241 "sequence item %zd: expected str instance,"
9242 " %.80s found",
9243 i, Py_TYPE(item)->tp_name);
9244 goto onError;
9245 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009246 if (PyUnicode_READY(item) == -1)
9247 goto onError;
9248 sz += PyUnicode_GET_LENGTH(item);
9249 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009250 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009251 if (i != 0)
9252 sz += seplen;
9253 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9254 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009255 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009256 goto onError;
9257 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009258 if (use_memcpy && last_obj != NULL) {
9259 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9260 use_memcpy = 0;
9261 }
9262 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009263 }
Tim Petersced69f82003-09-16 20:30:58 +00009264
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009265 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009266 if (res == NULL)
9267 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009268
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009269 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009270#ifdef Py_DEBUG
9271 use_memcpy = 0;
9272#else
9273 if (use_memcpy) {
9274 res_data = PyUnicode_1BYTE_DATA(res);
9275 kind = PyUnicode_KIND(res);
9276 if (seplen != 0)
9277 sep_data = PyUnicode_1BYTE_DATA(sep);
9278 }
9279#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009280 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009281 Py_ssize_t itemlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009282 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009283 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02009284 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009285 if (use_memcpy) {
9286 Py_MEMCPY(res_data,
9287 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009288 kind * seplen);
9289 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009290 }
9291 else {
9292 copy_characters(res, res_offset, sep, 0, seplen);
9293 res_offset += seplen;
9294 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009295 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009296 itemlen = PyUnicode_GET_LENGTH(item);
9297 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009298 if (use_memcpy) {
9299 Py_MEMCPY(res_data,
9300 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009301 kind * itemlen);
9302 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009303 }
9304 else {
9305 copy_characters(res, res_offset, item, 0, itemlen);
9306 res_offset += itemlen;
9307 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009308 }
Tim Peters05eba1f2004-08-27 21:32:02 +00009309 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009310 if (use_memcpy)
9311 assert(res_data == PyUnicode_1BYTE_DATA(res)
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009312 + kind * PyUnicode_GET_LENGTH(res));
Victor Stinnerdd077322011-10-07 17:02:31 +02009313 else
9314 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00009315
Tim Peters05eba1f2004-08-27 21:32:02 +00009316 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009317 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009318 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009319 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009320
Benjamin Peterson29060642009-01-31 22:14:21 +00009321 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009322 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009323 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009324 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009325 return NULL;
9326}
9327
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009328#define FILL(kind, data, value, start, length) \
9329 do { \
9330 Py_ssize_t i_ = 0; \
9331 assert(kind != PyUnicode_WCHAR_KIND); \
9332 switch ((kind)) { \
9333 case PyUnicode_1BYTE_KIND: { \
9334 unsigned char * to_ = (unsigned char *)((data)) + (start); \
9335 memset(to_, (unsigned char)value, length); \
9336 break; \
9337 } \
9338 case PyUnicode_2BYTE_KIND: { \
9339 Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
9340 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
9341 break; \
9342 } \
9343 default: { \
9344 Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
9345 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
9346 break; \
9347 } \
9348 } \
9349 } while (0)
9350
Victor Stinner9310abb2011-10-05 00:59:23 +02009351static PyObject *
9352pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009353 Py_ssize_t left,
9354 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009355 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009356{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009357 PyObject *u;
9358 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009359 int kind;
9360 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009361
9362 if (left < 0)
9363 left = 0;
9364 if (right < 0)
9365 right = 0;
9366
Tim Peters7a29bd52001-09-12 03:03:31 +00009367 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009368 Py_INCREF(self);
9369 return self;
9370 }
9371
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009372 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9373 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009374 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9375 return NULL;
9376 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009377 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9378 if (fill > maxchar)
9379 maxchar = fill;
9380 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009381 if (!u)
9382 return NULL;
9383
9384 kind = PyUnicode_KIND(u);
9385 data = PyUnicode_DATA(u);
9386 if (left)
9387 FILL(kind, data, fill, 0, left);
9388 if (right)
9389 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009390 copy_characters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009391 assert(_PyUnicode_CheckConsistency(u, 1));
9392 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009393}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009394#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00009395
Alexander Belopolsky40018472011-02-26 01:02:56 +00009396PyObject *
9397PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009398{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009399 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009400
9401 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009402 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009403 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009404
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009405 switch(PyUnicode_KIND(string)) {
9406 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009407 if (PyUnicode_IS_ASCII(string))
9408 list = asciilib_splitlines(
9409 (PyObject*) string, PyUnicode_1BYTE_DATA(string),
9410 PyUnicode_GET_LENGTH(string), keepends);
9411 else
9412 list = ucs1lib_splitlines(
9413 (PyObject*) string, PyUnicode_1BYTE_DATA(string),
9414 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009415 break;
9416 case PyUnicode_2BYTE_KIND:
9417 list = ucs2lib_splitlines(
9418 (PyObject*) string, PyUnicode_2BYTE_DATA(string),
9419 PyUnicode_GET_LENGTH(string), keepends);
9420 break;
9421 case PyUnicode_4BYTE_KIND:
9422 list = ucs4lib_splitlines(
9423 (PyObject*) string, PyUnicode_4BYTE_DATA(string),
9424 PyUnicode_GET_LENGTH(string), keepends);
9425 break;
9426 default:
9427 assert(0);
9428 list = 0;
9429 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009430 Py_DECREF(string);
9431 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009432}
9433
Alexander Belopolsky40018472011-02-26 01:02:56 +00009434static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009435split(PyObject *self,
9436 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009437 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009438{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009439 int kind1, kind2, kind;
9440 void *buf1, *buf2;
9441 Py_ssize_t len1, len2;
9442 PyObject* out;
9443
Guido van Rossumd57fd912000-03-10 22:53:23 +00009444 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009445 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009446
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009447 if (PyUnicode_READY(self) == -1)
9448 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009449
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009450 if (substring == NULL)
9451 switch(PyUnicode_KIND(self)) {
9452 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009453 if (PyUnicode_IS_ASCII(self))
9454 return asciilib_split_whitespace(
9455 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9456 PyUnicode_GET_LENGTH(self), maxcount
9457 );
9458 else
9459 return ucs1lib_split_whitespace(
9460 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9461 PyUnicode_GET_LENGTH(self), maxcount
9462 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009463 case PyUnicode_2BYTE_KIND:
9464 return ucs2lib_split_whitespace(
9465 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
9466 PyUnicode_GET_LENGTH(self), maxcount
9467 );
9468 case PyUnicode_4BYTE_KIND:
9469 return ucs4lib_split_whitespace(
9470 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
9471 PyUnicode_GET_LENGTH(self), maxcount
9472 );
9473 default:
9474 assert(0);
9475 return NULL;
9476 }
9477
9478 if (PyUnicode_READY(substring) == -1)
9479 return NULL;
9480
9481 kind1 = PyUnicode_KIND(self);
9482 kind2 = PyUnicode_KIND(substring);
9483 kind = kind1 > kind2 ? kind1 : kind2;
9484 buf1 = PyUnicode_DATA(self);
9485 buf2 = PyUnicode_DATA(substring);
9486 if (kind1 != kind)
9487 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9488 if (!buf1)
9489 return NULL;
9490 if (kind2 != kind)
9491 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9492 if (!buf2) {
9493 if (kind1 != kind) PyMem_Free(buf1);
9494 return NULL;
9495 }
9496 len1 = PyUnicode_GET_LENGTH(self);
9497 len2 = PyUnicode_GET_LENGTH(substring);
9498
9499 switch(kind) {
9500 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009501 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9502 out = asciilib_split(
9503 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9504 else
9505 out = ucs1lib_split(
9506 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009507 break;
9508 case PyUnicode_2BYTE_KIND:
9509 out = ucs2lib_split(
9510 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9511 break;
9512 case PyUnicode_4BYTE_KIND:
9513 out = ucs4lib_split(
9514 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9515 break;
9516 default:
9517 out = NULL;
9518 }
9519 if (kind1 != kind)
9520 PyMem_Free(buf1);
9521 if (kind2 != kind)
9522 PyMem_Free(buf2);
9523 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009524}
9525
Alexander Belopolsky40018472011-02-26 01:02:56 +00009526static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009527rsplit(PyObject *self,
9528 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009529 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009530{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009531 int kind1, kind2, kind;
9532 void *buf1, *buf2;
9533 Py_ssize_t len1, len2;
9534 PyObject* out;
9535
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009536 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009537 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009538
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009539 if (PyUnicode_READY(self) == -1)
9540 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009541
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009542 if (substring == NULL)
9543 switch(PyUnicode_KIND(self)) {
9544 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009545 if (PyUnicode_IS_ASCII(self))
9546 return asciilib_rsplit_whitespace(
9547 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9548 PyUnicode_GET_LENGTH(self), maxcount
9549 );
9550 else
9551 return ucs1lib_rsplit_whitespace(
9552 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9553 PyUnicode_GET_LENGTH(self), maxcount
9554 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009555 case PyUnicode_2BYTE_KIND:
9556 return ucs2lib_rsplit_whitespace(
9557 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
9558 PyUnicode_GET_LENGTH(self), maxcount
9559 );
9560 case PyUnicode_4BYTE_KIND:
9561 return ucs4lib_rsplit_whitespace(
9562 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
9563 PyUnicode_GET_LENGTH(self), maxcount
9564 );
9565 default:
9566 assert(0);
9567 return NULL;
9568 }
9569
9570 if (PyUnicode_READY(substring) == -1)
9571 return NULL;
9572
9573 kind1 = PyUnicode_KIND(self);
9574 kind2 = PyUnicode_KIND(substring);
9575 kind = kind1 > kind2 ? kind1 : kind2;
9576 buf1 = PyUnicode_DATA(self);
9577 buf2 = PyUnicode_DATA(substring);
9578 if (kind1 != kind)
9579 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9580 if (!buf1)
9581 return NULL;
9582 if (kind2 != kind)
9583 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9584 if (!buf2) {
9585 if (kind1 != kind) PyMem_Free(buf1);
9586 return NULL;
9587 }
9588 len1 = PyUnicode_GET_LENGTH(self);
9589 len2 = PyUnicode_GET_LENGTH(substring);
9590
9591 switch(kind) {
9592 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009593 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9594 out = asciilib_rsplit(
9595 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9596 else
9597 out = ucs1lib_rsplit(
9598 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009599 break;
9600 case PyUnicode_2BYTE_KIND:
9601 out = ucs2lib_rsplit(
9602 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9603 break;
9604 case PyUnicode_4BYTE_KIND:
9605 out = ucs4lib_rsplit(
9606 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9607 break;
9608 default:
9609 out = NULL;
9610 }
9611 if (kind1 != kind)
9612 PyMem_Free(buf1);
9613 if (kind2 != kind)
9614 PyMem_Free(buf2);
9615 return out;
9616}
9617
9618static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009619anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
9620 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009621{
9622 switch(kind) {
9623 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009624 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
9625 return asciilib_find(buf1, len1, buf2, len2, offset);
9626 else
9627 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009628 case PyUnicode_2BYTE_KIND:
9629 return ucs2lib_find(buf1, len1, buf2, len2, offset);
9630 case PyUnicode_4BYTE_KIND:
9631 return ucs4lib_find(buf1, len1, buf2, len2, offset);
9632 }
9633 assert(0);
9634 return -1;
9635}
9636
9637static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009638anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
9639 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009640{
9641 switch(kind) {
9642 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009643 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
9644 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
9645 else
9646 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009647 case PyUnicode_2BYTE_KIND:
9648 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
9649 case PyUnicode_4BYTE_KIND:
9650 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
9651 }
9652 assert(0);
9653 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009654}
9655
Alexander Belopolsky40018472011-02-26 01:02:56 +00009656static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009657replace(PyObject *self, PyObject *str1,
9658 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009659{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009660 PyObject *u;
9661 char *sbuf = PyUnicode_DATA(self);
9662 char *buf1 = PyUnicode_DATA(str1);
9663 char *buf2 = PyUnicode_DATA(str2);
9664 int srelease = 0, release1 = 0, release2 = 0;
9665 int skind = PyUnicode_KIND(self);
9666 int kind1 = PyUnicode_KIND(str1);
9667 int kind2 = PyUnicode_KIND(str2);
9668 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
9669 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
9670 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +02009671 int mayshrink;
9672 Py_UCS4 maxchar, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009673
9674 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009675 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009676 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009677 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009678
Victor Stinner59de0ee2011-10-07 10:01:28 +02009679 if (str1 == str2)
9680 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009681 if (skind < kind1)
9682 /* substring too wide to be present */
9683 goto nothing;
9684
Victor Stinner49a0a212011-10-12 23:46:10 +02009685 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9686 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
9687 /* Replacing str1 with str2 may cause a maxchar reduction in the
9688 result string. */
9689 mayshrink = (maxchar_str2 < maxchar);
9690 maxchar = Py_MAX(maxchar, maxchar_str2);
9691
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009692 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00009693 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009694 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009695 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009696 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009697 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009698 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +02009699 Py_UCS4 u1, u2;
9700 int rkind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009701 u1 = PyUnicode_READ_CHAR(str1, 0);
9702 if (!findchar(sbuf, PyUnicode_KIND(self),
9703 slen, u1, 1))
Thomas Wouters477c8d52006-05-27 19:21:47 +00009704 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009705 u2 = PyUnicode_READ_CHAR(str2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009706 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009707 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009708 goto error;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009709 copy_characters(u, 0, self, 0, slen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009710 rkind = PyUnicode_KIND(u);
9711 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
9712 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009713 if (--maxcount < 0)
9714 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009715 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009716 }
Victor Stinner49a0a212011-10-12 23:46:10 +02009717 }
9718 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009719 int rkind = skind;
9720 char *res;
Victor Stinner25a4b292011-10-06 12:31:55 +02009721
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009722 if (kind1 < rkind) {
9723 /* widen substring */
9724 buf1 = _PyUnicode_AsKind(str1, rkind);
9725 if (!buf1) goto error;
9726 release1 = 1;
9727 }
Victor Stinnerc3cec782011-10-05 21:24:08 +02009728 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009729 if (i < 0)
9730 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009731 if (rkind > kind2) {
9732 /* widen replacement */
9733 buf2 = _PyUnicode_AsKind(str2, rkind);
9734 if (!buf2) goto error;
9735 release2 = 1;
9736 }
9737 else if (rkind < kind2) {
9738 /* widen self and buf1 */
9739 rkind = kind2;
9740 if (release1) PyMem_Free(buf1);
9741 sbuf = _PyUnicode_AsKind(self, rkind);
9742 if (!sbuf) goto error;
9743 srelease = 1;
9744 buf1 = _PyUnicode_AsKind(str1, rkind);
9745 if (!buf1) goto error;
9746 release1 = 1;
9747 }
Victor Stinner49a0a212011-10-12 23:46:10 +02009748 u = PyUnicode_New(slen, maxchar);
9749 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009750 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +02009751 assert(PyUnicode_KIND(u) == rkind);
9752 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +02009753
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009754 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009755 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009756 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009757 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009758 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009759 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009760
9761 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +02009762 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009763 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +02009764 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009765 if (i == -1)
9766 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009767 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009768 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009769 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009770 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009771 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009772 }
Victor Stinner49a0a212011-10-12 23:46:10 +02009773 }
9774 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009775 Py_ssize_t n, i, j, ires;
9776 Py_ssize_t product, new_size;
9777 int rkind = skind;
9778 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009779
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009780 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +02009781 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009782 buf1 = _PyUnicode_AsKind(str1, rkind);
9783 if (!buf1) goto error;
9784 release1 = 1;
9785 }
Victor Stinnerc3cec782011-10-05 21:24:08 +02009786 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009787 if (n == 0)
9788 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009789 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +02009790 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009791 buf2 = _PyUnicode_AsKind(str2, rkind);
9792 if (!buf2) goto error;
9793 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009794 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009795 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +02009796 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009797 rkind = kind2;
9798 sbuf = _PyUnicode_AsKind(self, rkind);
9799 if (!sbuf) goto error;
9800 srelease = 1;
9801 if (release1) PyMem_Free(buf1);
9802 buf1 = _PyUnicode_AsKind(str1, rkind);
9803 if (!buf1) goto error;
9804 release1 = 1;
9805 }
9806 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
9807 PyUnicode_GET_LENGTH(str1))); */
9808 product = n * (len2-len1);
9809 if ((product / (len2-len1)) != n) {
9810 PyErr_SetString(PyExc_OverflowError,
9811 "replace string is too long");
9812 goto error;
9813 }
9814 new_size = slen + product;
Victor Stinner49a0a212011-10-12 23:46:10 +02009815 if (new_size == 0) {
9816 Py_INCREF(unicode_empty);
9817 u = unicode_empty;
9818 goto done;
9819 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009820 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
9821 PyErr_SetString(PyExc_OverflowError,
9822 "replace string is too long");
9823 goto error;
9824 }
Victor Stinner49a0a212011-10-12 23:46:10 +02009825 u = PyUnicode_New(new_size, maxchar);
9826 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009827 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +02009828 assert(PyUnicode_KIND(u) == rkind);
9829 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009830 ires = i = 0;
9831 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009832 while (n-- > 0) {
9833 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +02009834 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009835 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +02009836 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009837 if (j == -1)
9838 break;
9839 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009840 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009841 memcpy(res + rkind * ires,
9842 sbuf + rkind * i,
9843 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009844 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009845 }
9846 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009847 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009848 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009849 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009850 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009851 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009852 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009853 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009854 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009855 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009856 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009857 memcpy(res + rkind * ires,
9858 sbuf + rkind * i,
9859 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +02009860 }
9861 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009862 /* interleave */
9863 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009864 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009865 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009866 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009867 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009868 if (--n <= 0)
9869 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009870 memcpy(res + rkind * ires,
9871 sbuf + rkind * i,
9872 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009873 ires++;
9874 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009875 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009876 memcpy(res + rkind * ires,
9877 sbuf + rkind * i,
9878 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009879 }
Victor Stinner49a0a212011-10-12 23:46:10 +02009880 }
9881
9882 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +02009883 unicode_adjust_maxchar(&u);
9884 if (u == NULL)
9885 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009886 }
Victor Stinner49a0a212011-10-12 23:46:10 +02009887
9888 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009889 if (srelease)
9890 PyMem_FREE(sbuf);
9891 if (release1)
9892 PyMem_FREE(buf1);
9893 if (release2)
9894 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009895 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009896 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009897
Benjamin Peterson29060642009-01-31 22:14:21 +00009898 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00009899 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009900 if (srelease)
9901 PyMem_FREE(sbuf);
9902 if (release1)
9903 PyMem_FREE(buf1);
9904 if (release2)
9905 PyMem_FREE(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009906 if (PyUnicode_CheckExact(self)) {
9907 Py_INCREF(self);
9908 return (PyObject *) self;
9909 }
Victor Stinner034f6cf2011-09-30 02:26:44 +02009910 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009911 error:
9912 if (srelease && sbuf)
9913 PyMem_FREE(sbuf);
9914 if (release1 && buf1)
9915 PyMem_FREE(buf1);
9916 if (release2 && buf2)
9917 PyMem_FREE(buf2);
9918 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009919}
9920
9921/* --- Unicode Object Methods --------------------------------------------- */
9922
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009923PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009924 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009925\n\
9926Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009927characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009928
9929static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +02009930unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009931{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009932 return fixup(self, fixtitle);
9933}
9934
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009935PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009936 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009937\n\
9938Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00009939have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009940
9941static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +02009942unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009943{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009944 return fixup(self, fixcapitalize);
9945}
9946
#if 0
/* Disabled code: the whole definition is compiled out by the #if 0. */
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Split `self` on whitespace, capitalize every word in place inside the
   resulting list, then join the words again with PyUnicode_Join(NULL, ...).
   Returns NULL if splitting or capitalizing any word fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL) {
        return NULL;
    }

    /* Capitalize each word */
    idx = 0;
    while (idx < PyList_GET_SIZE(words)) {
        PyObject *plain = PyList_GET_ITEM(words, idx);
        PyObject *capitalized = fixup((PyUnicodeObject *)plain,
                                      fixcapitalize);
        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        /* Swap the list slot over to the capitalized word: release the old
           entry first, then store the new one with the SET_ITEM macro. */
        Py_DECREF(plain);
        PyList_SET_ITEM(words, idx, capitalized);
        idx++;
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return joined;
}
#endif
9984
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009985/* Argument converter. Coerces to a single unicode character */
9986
9987static int
9988convert_uc(PyObject *obj, void *addr)
9989{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009990 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009991 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009992
Benjamin Peterson14339b62009-01-31 16:36:08 +00009993 uniobj = PyUnicode_FromObject(obj);
9994 if (uniobj == NULL) {
9995 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009996 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009997 return 0;
9998 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009999 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010000 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010001 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010002 Py_DECREF(uniobj);
10003 return 0;
10004 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010005 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010006 Py_DECREF(uniobj);
10007 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010008}
10009
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010010PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010011 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010012\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010013Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010014done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010015
10016static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010017unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010018{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010019 Py_ssize_t marg, left;
10020 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010021 Py_UCS4 fillchar = ' ';
10022
Victor Stinnere9a29352011-10-01 02:14:59 +020010023 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010024 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010025
Victor Stinnere9a29352011-10-01 02:14:59 +020010026 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010027 return NULL;
10028
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010029 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010030 Py_INCREF(self);
10031 return (PyObject*) self;
10032 }
10033
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010034 marg = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010035 left = marg / 2 + (marg & width & 1);
10036
Victor Stinner9310abb2011-10-05 00:59:23 +020010037 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010038}
10039
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010040/* This function assumes that str1 and str2 are readied by the caller. */
10041
Marc-André Lemburge5034372000-08-08 08:04:29 +000010042static int
10043unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
10044{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010045 int kind1, kind2;
10046 void *data1, *data2;
10047 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010048
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010049 kind1 = PyUnicode_KIND(str1);
10050 kind2 = PyUnicode_KIND(str2);
10051 data1 = PyUnicode_DATA(str1);
10052 data2 = PyUnicode_DATA(str2);
10053 len1 = PyUnicode_GET_LENGTH(str1);
10054 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010055
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010056 for (i = 0; i < len1 && i < len2; ++i) {
10057 Py_UCS4 c1, c2;
10058 c1 = PyUnicode_READ(kind1, data1, i);
10059 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +000010060
10061 if (c1 != c2)
10062 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010063 }
10064
10065 return (len1 < len2) ? -1 : (len1 != len2);
10066}
10067
Alexander Belopolsky40018472011-02-26 01:02:56 +000010068int
10069PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010070{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010071 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10072 if (PyUnicode_READY(left) == -1 ||
10073 PyUnicode_READY(right) == -1)
10074 return -1;
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010075 return unicode_compare((PyUnicodeObject *)left,
10076 (PyUnicodeObject *)right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010077 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010078 PyErr_Format(PyExc_TypeError,
10079 "Can't compare %.100s and %.100s",
10080 left->ob_type->tp_name,
10081 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010082 return -1;
10083}
10084
Martin v. Löwis5b222132007-06-10 09:51:05 +000010085int
10086PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10087{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010088 Py_ssize_t i;
10089 int kind;
10090 void *data;
10091 Py_UCS4 chr;
10092
Victor Stinner910337b2011-10-03 03:20:16 +020010093 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010094 if (PyUnicode_READY(uni) == -1)
10095 return -1;
10096 kind = PyUnicode_KIND(uni);
10097 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +000010098 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010099 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10100 if (chr != str[i])
10101 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +000010102 /* This check keeps Python strings that end in '\0' from comparing equal
10103 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010104 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +000010105 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010106 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +000010107 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010108 return 0;
10109}
10110
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010111
Benjamin Peterson29060642009-01-31 22:14:21 +000010112#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010113 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010114
Alexander Belopolsky40018472011-02-26 01:02:56 +000010115PyObject *
10116PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010117{
10118 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010119
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010120 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10121 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010122 if (PyUnicode_READY(left) == -1 ||
10123 PyUnicode_READY(right) == -1)
10124 return NULL;
10125 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
10126 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010127 if (op == Py_EQ) {
10128 Py_INCREF(Py_False);
10129 return Py_False;
10130 }
10131 if (op == Py_NE) {
10132 Py_INCREF(Py_True);
10133 return Py_True;
10134 }
10135 }
10136 if (left == right)
10137 result = 0;
10138 else
10139 result = unicode_compare((PyUnicodeObject *)left,
10140 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010141
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010142 /* Convert the return value to a Boolean */
10143 switch (op) {
10144 case Py_EQ:
10145 v = TEST_COND(result == 0);
10146 break;
10147 case Py_NE:
10148 v = TEST_COND(result != 0);
10149 break;
10150 case Py_LE:
10151 v = TEST_COND(result <= 0);
10152 break;
10153 case Py_GE:
10154 v = TEST_COND(result >= 0);
10155 break;
10156 case Py_LT:
10157 v = TEST_COND(result == -1);
10158 break;
10159 case Py_GT:
10160 v = TEST_COND(result == 1);
10161 break;
10162 default:
10163 PyErr_BadArgument();
10164 return NULL;
10165 }
10166 Py_INCREF(v);
10167 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010168 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010169
Brian Curtindfc80e32011-08-10 20:28:54 -050010170 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010171}
10172
Alexander Belopolsky40018472011-02-26 01:02:56 +000010173int
10174PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010175{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010176 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010177 int kind1, kind2, kind;
10178 void *buf1, *buf2;
10179 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010180 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010181
10182 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010183 sub = PyUnicode_FromObject(element);
10184 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010185 PyErr_Format(PyExc_TypeError,
10186 "'in <string>' requires string as left operand, not %s",
10187 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010188 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010189 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010190 if (PyUnicode_READY(sub) == -1)
10191 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010192
Thomas Wouters477c8d52006-05-27 19:21:47 +000010193 str = PyUnicode_FromObject(container);
Victor Stinnere9a29352011-10-01 02:14:59 +020010194 if (!str || PyUnicode_READY(str) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010195 Py_DECREF(sub);
10196 return -1;
10197 }
10198
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010199 kind1 = PyUnicode_KIND(str);
10200 kind2 = PyUnicode_KIND(sub);
10201 kind = kind1 > kind2 ? kind1 : kind2;
10202 buf1 = PyUnicode_DATA(str);
10203 buf2 = PyUnicode_DATA(sub);
10204 if (kind1 != kind)
10205 buf1 = _PyUnicode_AsKind((PyObject*)str, kind);
10206 if (!buf1) {
10207 Py_DECREF(sub);
10208 return -1;
10209 }
10210 if (kind2 != kind)
10211 buf2 = _PyUnicode_AsKind((PyObject*)sub, kind);
10212 if (!buf2) {
10213 Py_DECREF(sub);
10214 if (kind1 != kind) PyMem_Free(buf1);
10215 return -1;
10216 }
10217 len1 = PyUnicode_GET_LENGTH(str);
10218 len2 = PyUnicode_GET_LENGTH(sub);
10219
10220 switch(kind) {
10221 case PyUnicode_1BYTE_KIND:
10222 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10223 break;
10224 case PyUnicode_2BYTE_KIND:
10225 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10226 break;
10227 case PyUnicode_4BYTE_KIND:
10228 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10229 break;
10230 default:
10231 result = -1;
10232 assert(0);
10233 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010234
10235 Py_DECREF(str);
10236 Py_DECREF(sub);
10237
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010238 if (kind1 != kind)
10239 PyMem_Free(buf1);
10240 if (kind2 != kind)
10241 PyMem_Free(buf2);
10242
Guido van Rossum403d68b2000-03-13 15:55:09 +000010243 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010244}
10245
Guido van Rossumd57fd912000-03-10 22:53:23 +000010246/* Concat to string or Unicode object giving a new Unicode object. */
10247
Alexander Belopolsky40018472011-02-26 01:02:56 +000010248PyObject *
10249PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010250{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010251 PyObject *u = NULL, *v = NULL, *w;
10252 Py_UCS4 maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010253
10254 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010255 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010256 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010257 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010258 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010259 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010260 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010261
10262 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010263 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010264 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010265 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010266 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010267 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010268 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010269 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010270 }
10271
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010272 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinnerff9e50f2011-09-28 22:17:19 +020010273 maxchar = Py_MAX(maxchar, PyUnicode_MAX_CHAR_VALUE(v));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010274
Guido van Rossumd57fd912000-03-10 22:53:23 +000010275 /* Concat the two Unicode strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010276 w = PyUnicode_New(
10277 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
10278 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010279 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010280 goto onError;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010281 copy_characters(w, 0, u, 0, PyUnicode_GET_LENGTH(u));
10282 copy_characters(w, PyUnicode_GET_LENGTH(u), v, 0, PyUnicode_GET_LENGTH(v));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010283 Py_DECREF(u);
10284 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010285 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010286 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010287
Benjamin Peterson29060642009-01-31 22:14:21 +000010288 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010289 Py_XDECREF(u);
10290 Py_XDECREF(v);
10291 return NULL;
10292}
10293
Victor Stinnerb0923652011-10-04 01:17:31 +020010294static void
10295unicode_append_inplace(PyObject **p_left, PyObject *right)
10296{
10297 Py_ssize_t left_len, right_len, new_len;
Victor Stinnerb0923652011-10-04 01:17:31 +020010298
10299 assert(PyUnicode_IS_READY(*p_left));
10300 assert(PyUnicode_IS_READY(right));
10301
10302 left_len = PyUnicode_GET_LENGTH(*p_left);
10303 right_len = PyUnicode_GET_LENGTH(right);
10304 if (left_len > PY_SSIZE_T_MAX - right_len) {
10305 PyErr_SetString(PyExc_OverflowError,
10306 "strings are too large to concat");
10307 goto error;
10308 }
10309 new_len = left_len + right_len;
10310
10311 /* Now we own the last reference to 'left', so we can resize it
10312 * in-place.
10313 */
10314 if (unicode_resize(p_left, new_len) != 0) {
10315 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10316 * deallocated so it cannot be put back into
10317 * 'variable'. The MemoryError is raised when there
10318 * is no value in 'variable', which might (very
10319 * remotely) be a cause of incompatibilities.
10320 */
10321 goto error;
10322 }
10323 /* copy 'right' into the newly allocated area of 'left' */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010324 copy_characters(*p_left, left_len, right, 0, right_len);
10325 _PyUnicode_DIRTY(*p_left);
Victor Stinnerb0923652011-10-04 01:17:31 +020010326 return;
10327
10328error:
10329 Py_DECREF(*p_left);
10330 *p_left = NULL;
10331}
10332
Walter Dörwald1ab83302007-05-18 17:15:44 +000010333void
Victor Stinner23e56682011-10-03 03:54:37 +020010334PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010335{
Victor Stinner23e56682011-10-03 03:54:37 +020010336 PyObject *left, *res;
10337
10338 if (p_left == NULL) {
10339 if (!PyErr_Occurred())
10340 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010341 return;
10342 }
Victor Stinner23e56682011-10-03 03:54:37 +020010343 left = *p_left;
10344 if (right == NULL || !PyUnicode_Check(left)) {
10345 if (!PyErr_Occurred())
10346 PyErr_BadInternalCall();
10347 goto error;
10348 }
10349
Victor Stinnere1335c72011-10-04 20:53:03 +020010350 if (PyUnicode_READY(left))
10351 goto error;
10352 if (PyUnicode_READY(right))
10353 goto error;
10354
Victor Stinner23e56682011-10-03 03:54:37 +020010355 if (PyUnicode_CheckExact(left) && left != unicode_empty
10356 && PyUnicode_CheckExact(right) && right != unicode_empty
10357 && unicode_resizable(left)
10358 && (_PyUnicode_KIND(right) <= _PyUnicode_KIND(left)
10359 || _PyUnicode_WSTR(left) != NULL))
10360 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010361 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10362 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020010363 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020010364 not so different than duplicating the string. */
10365 if (!(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
Victor Stinner23e56682011-10-03 03:54:37 +020010366 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010367 unicode_append_inplace(p_left, right);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010368 if (p_left != NULL)
10369 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020010370 return;
10371 }
10372 }
10373
10374 res = PyUnicode_Concat(left, right);
10375 if (res == NULL)
10376 goto error;
10377 Py_DECREF(left);
10378 *p_left = res;
10379 return;
10380
10381error:
10382 Py_DECREF(*p_left);
10383 *p_left = NULL;
Walter Dörwald1ab83302007-05-18 17:15:44 +000010384}
10385
10386void
10387PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10388{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010389 PyUnicode_Append(pleft, right);
10390 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010391}
10392
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010393PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010394 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010395\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010396Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010397string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010398interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010399
10400static PyObject *
10401unicode_count(PyUnicodeObject *self, PyObject *args)
10402{
10403 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010404 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010405 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010406 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010407 int kind1, kind2, kind;
10408 void *buf1, *buf2;
10409 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010410
Jesus Ceaac451502011-04-20 17:09:23 +020010411 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10412 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010413 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010414
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010415 kind1 = PyUnicode_KIND(self);
10416 kind2 = PyUnicode_KIND(substring);
10417 kind = kind1 > kind2 ? kind1 : kind2;
10418 buf1 = PyUnicode_DATA(self);
10419 buf2 = PyUnicode_DATA(substring);
10420 if (kind1 != kind)
10421 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
10422 if (!buf1) {
10423 Py_DECREF(substring);
10424 return NULL;
10425 }
10426 if (kind2 != kind)
10427 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
10428 if (!buf2) {
10429 Py_DECREF(substring);
10430 if (kind1 != kind) PyMem_Free(buf1);
10431 return NULL;
10432 }
10433 len1 = PyUnicode_GET_LENGTH(self);
10434 len2 = PyUnicode_GET_LENGTH(substring);
10435
10436 ADJUST_INDICES(start, end, len1);
10437 switch(kind) {
10438 case PyUnicode_1BYTE_KIND:
10439 iresult = ucs1lib_count(
10440 ((Py_UCS1*)buf1) + start, end - start,
10441 buf2, len2, PY_SSIZE_T_MAX
10442 );
10443 break;
10444 case PyUnicode_2BYTE_KIND:
10445 iresult = ucs2lib_count(
10446 ((Py_UCS2*)buf1) + start, end - start,
10447 buf2, len2, PY_SSIZE_T_MAX
10448 );
10449 break;
10450 case PyUnicode_4BYTE_KIND:
10451 iresult = ucs4lib_count(
10452 ((Py_UCS4*)buf1) + start, end - start,
10453 buf2, len2, PY_SSIZE_T_MAX
10454 );
10455 break;
10456 default:
10457 assert(0); iresult = 0;
10458 }
10459
10460 result = PyLong_FromSsize_t(iresult);
10461
10462 if (kind1 != kind)
10463 PyMem_Free(buf1);
10464 if (kind2 != kind)
10465 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010466
10467 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010468
Guido van Rossumd57fd912000-03-10 22:53:23 +000010469 return result;
10470}
10471
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010472PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010473 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010474\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010475Encode S using the codec registered for encoding. Default encoding\n\
10476is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010477handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010478a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10479'xmlcharrefreplace' as well as any other name registered with\n\
10480codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010481
10482static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +000010483unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010484{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010485 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010486 char *encoding = NULL;
10487 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010488
Benjamin Peterson308d6372009-09-18 21:42:35 +000010489 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10490 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010491 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +000010492 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010493}
10494
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010495PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010496 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010497\n\
10498Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010499If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010500
10501static PyObject*
10502unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
10503{
Antoine Pitroue71d5742011-10-04 15:55:09 +020010504 Py_ssize_t i, j, line_pos, src_len, incr;
10505 Py_UCS4 ch;
10506 PyObject *u;
10507 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010508 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010509 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010510 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010511
10512 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010513 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010514
Antoine Pitrou22425222011-10-04 19:10:51 +020010515 if (PyUnicode_READY(self) == -1)
10516 return NULL;
10517
Thomas Wouters7e474022000-07-16 12:04:32 +000010518 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010519 src_len = PyUnicode_GET_LENGTH(self);
10520 i = j = line_pos = 0;
10521 kind = PyUnicode_KIND(self);
10522 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020010523 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010524 for (; i < src_len; i++) {
10525 ch = PyUnicode_READ(kind, src_data, i);
10526 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020010527 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000010528 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010529 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000010530 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010531 goto overflow;
10532 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010533 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010534 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010535 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010536 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000010537 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010538 goto overflow;
10539 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010540 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010541 if (ch == '\n' || ch == '\r')
10542 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010543 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010544 }
Antoine Pitroue19aa382011-10-04 16:04:01 +020010545 if (!found && PyUnicode_CheckExact(self)) {
10546 Py_INCREF((PyObject *) self);
10547 return (PyObject *) self;
10548 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +000010549
Guido van Rossumd57fd912000-03-10 22:53:23 +000010550 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010551 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010552 if (!u)
10553 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010554 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010555
Antoine Pitroue71d5742011-10-04 15:55:09 +020010556 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010557
Antoine Pitroue71d5742011-10-04 15:55:09 +020010558 for (; i < src_len; i++) {
10559 ch = PyUnicode_READ(kind, src_data, i);
10560 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010561 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010562 incr = tabsize - (line_pos % tabsize);
10563 line_pos += incr;
10564 while (incr--) {
10565 PyUnicode_WRITE(kind, dest_data, j, ' ');
10566 j++;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010567 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010568 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010569 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010570 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010571 line_pos++;
10572 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010573 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010574 if (ch == '\n' || ch == '\r')
10575 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010576 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010577 }
10578 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinner17efeed2011-10-04 20:05:46 +020010579#ifndef DONT_MAKE_RESULT_READY
10580 if (_PyUnicode_READY_REPLACE(&u)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010581 Py_DECREF(u);
10582 return NULL;
10583 }
Victor Stinner17efeed2011-10-04 20:05:46 +020010584#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010585 assert(_PyUnicode_CheckConsistency(u, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010586 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010587
Antoine Pitroue71d5742011-10-04 15:55:09 +020010588 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010589 PyErr_SetString(PyExc_OverflowError, "new string is too long");
10590 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010591}
10592
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010593PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010594 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010595\n\
10596Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010597such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010598arguments start and end are interpreted as in slice notation.\n\
10599\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010600Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010601
10602static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010603unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010604{
Jesus Ceaac451502011-04-20 17:09:23 +020010605 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010606 Py_ssize_t start;
10607 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010608 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010609
Jesus Ceaac451502011-04-20 17:09:23 +020010610 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
10611 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010612 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010613
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010614 if (PyUnicode_READY(self) == -1)
10615 return NULL;
10616 if (PyUnicode_READY(substring) == -1)
10617 return NULL;
10618
Victor Stinner794d5672011-10-10 03:21:36 +020010619 result = any_find_slice(1,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010620 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010621 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010622
10623 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010624
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010625 if (result == -2)
10626 return NULL;
10627
Christian Heimes217cfd12007-12-02 14:31:20 +000010628 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010629}
10630
10631static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010632unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010633{
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010634 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
10635 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010636 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010637 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010638}
10639
Guido van Rossumc2504932007-09-18 19:42:40 +000010640/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010010641 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000010642static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +000010643unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010644{
Guido van Rossumc2504932007-09-18 19:42:40 +000010645 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010010646 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010647
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010648 if (_PyUnicode_HASH(self) != -1)
10649 return _PyUnicode_HASH(self);
10650 if (PyUnicode_READY(self) == -1)
10651 return -1;
10652 len = PyUnicode_GET_LENGTH(self);
10653
10654 /* The hash function as a macro, gets expanded three times below. */
10655#define HASH(P) \
10656 x = (Py_uhash_t)*P << 7; \
10657 while (--len >= 0) \
10658 x = (1000003*x) ^ (Py_uhash_t)*P++;
10659
10660 switch (PyUnicode_KIND(self)) {
10661 case PyUnicode_1BYTE_KIND: {
10662 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
10663 HASH(c);
10664 break;
10665 }
10666 case PyUnicode_2BYTE_KIND: {
10667 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
10668 HASH(s);
10669 break;
10670 }
10671 default: {
10672 Py_UCS4 *l;
10673 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
10674 "Impossible switch case in unicode_hash");
10675 l = PyUnicode_4BYTE_DATA(self);
10676 HASH(l);
10677 break;
10678 }
10679 }
10680 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
10681
Guido van Rossumc2504932007-09-18 19:42:40 +000010682 if (x == -1)
10683 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010684 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010685 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010686}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010687#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000010688
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010689PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010690 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010691\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010692Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010693
10694static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010695unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010696{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010697 Py_ssize_t result;
Jesus Ceaac451502011-04-20 17:09:23 +020010698 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010699 Py_ssize_t start;
10700 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010701
Jesus Ceaac451502011-04-20 17:09:23 +020010702 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
10703 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010704 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010705
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010706 if (PyUnicode_READY(self) == -1)
10707 return NULL;
10708 if (PyUnicode_READY(substring) == -1)
10709 return NULL;
10710
Victor Stinner794d5672011-10-10 03:21:36 +020010711 result = any_find_slice(1,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010712 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010713 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010714
10715 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010716
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010717 if (result == -2)
10718 return NULL;
10719
Guido van Rossumd57fd912000-03-10 22:53:23 +000010720 if (result < 0) {
10721 PyErr_SetString(PyExc_ValueError, "substring not found");
10722 return NULL;
10723 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010724
Christian Heimes217cfd12007-12-02 14:31:20 +000010725 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010726}
10727
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010728PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010729 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010730\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010731Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010732at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010733
10734static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010735unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010736{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010737 Py_ssize_t i, length;
10738 int kind;
10739 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010740 int cased;
10741
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010742 if (PyUnicode_READY(self) == -1)
10743 return NULL;
10744 length = PyUnicode_GET_LENGTH(self);
10745 kind = PyUnicode_KIND(self);
10746 data = PyUnicode_DATA(self);
10747
Guido van Rossumd57fd912000-03-10 22:53:23 +000010748 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010749 if (length == 1)
10750 return PyBool_FromLong(
10751 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010752
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010753 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010754 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010755 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010756
Guido van Rossumd57fd912000-03-10 22:53:23 +000010757 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010758 for (i = 0; i < length; i++) {
10759 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010760
Benjamin Peterson29060642009-01-31 22:14:21 +000010761 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
10762 return PyBool_FromLong(0);
10763 else if (!cased && Py_UNICODE_ISLOWER(ch))
10764 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010765 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010766 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010767}
10768
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010769PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010770 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010771\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010772Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010773at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010774
10775static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010776unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010777{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010778 Py_ssize_t i, length;
10779 int kind;
10780 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010781 int cased;
10782
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010783 if (PyUnicode_READY(self) == -1)
10784 return NULL;
10785 length = PyUnicode_GET_LENGTH(self);
10786 kind = PyUnicode_KIND(self);
10787 data = PyUnicode_DATA(self);
10788
Guido van Rossumd57fd912000-03-10 22:53:23 +000010789 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010790 if (length == 1)
10791 return PyBool_FromLong(
10792 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010793
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010794 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010795 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010796 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010797
Guido van Rossumd57fd912000-03-10 22:53:23 +000010798 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010799 for (i = 0; i < length; i++) {
10800 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010801
Benjamin Peterson29060642009-01-31 22:14:21 +000010802 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
10803 return PyBool_FromLong(0);
10804 else if (!cased && Py_UNICODE_ISUPPER(ch))
10805 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010806 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010807 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010808}
10809
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010810PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010811 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010812\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010813Return True if S is a titlecased string and there is at least one\n\
10814character in S, i.e. upper- and titlecase characters may only\n\
10815follow uncased characters and lowercase characters only cased ones.\n\
10816Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010817
10818static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010819unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010820{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010821 Py_ssize_t i, length;
10822 int kind;
10823 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010824 int cased, previous_is_cased;
10825
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010826 if (PyUnicode_READY(self) == -1)
10827 return NULL;
10828 length = PyUnicode_GET_LENGTH(self);
10829 kind = PyUnicode_KIND(self);
10830 data = PyUnicode_DATA(self);
10831
Guido van Rossumd57fd912000-03-10 22:53:23 +000010832 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010833 if (length == 1) {
10834 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10835 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
10836 (Py_UNICODE_ISUPPER(ch) != 0));
10837 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010838
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010839 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010840 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010841 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010842
Guido van Rossumd57fd912000-03-10 22:53:23 +000010843 cased = 0;
10844 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010845 for (i = 0; i < length; i++) {
10846 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010847
Benjamin Peterson29060642009-01-31 22:14:21 +000010848 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
10849 if (previous_is_cased)
10850 return PyBool_FromLong(0);
10851 previous_is_cased = 1;
10852 cased = 1;
10853 }
10854 else if (Py_UNICODE_ISLOWER(ch)) {
10855 if (!previous_is_cased)
10856 return PyBool_FromLong(0);
10857 previous_is_cased = 1;
10858 cased = 1;
10859 }
10860 else
10861 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010862 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010863 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010864}
10865
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010866PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010867 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010868\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010869Return True if all characters in S are whitespace\n\
10870and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010871
10872static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010873unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010874{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010875 Py_ssize_t i, length;
10876 int kind;
10877 void *data;
10878
10879 if (PyUnicode_READY(self) == -1)
10880 return NULL;
10881 length = PyUnicode_GET_LENGTH(self);
10882 kind = PyUnicode_KIND(self);
10883 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010884
Guido van Rossumd57fd912000-03-10 22:53:23 +000010885 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010886 if (length == 1)
10887 return PyBool_FromLong(
10888 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010889
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010890 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010891 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010892 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010893
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010894 for (i = 0; i < length; i++) {
10895 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010896 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010897 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010898 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010899 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010900}
10901
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010902PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010903 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010904\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010905Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010906and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010907
10908static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010909unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010910{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010911 Py_ssize_t i, length;
10912 int kind;
10913 void *data;
10914
10915 if (PyUnicode_READY(self) == -1)
10916 return NULL;
10917 length = PyUnicode_GET_LENGTH(self);
10918 kind = PyUnicode_KIND(self);
10919 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010920
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010921 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010922 if (length == 1)
10923 return PyBool_FromLong(
10924 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010925
10926 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010927 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010928 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010929
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010930 for (i = 0; i < length; i++) {
10931 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010932 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010933 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010934 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010935}
10936
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010937PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010938 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010939\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010940Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010941and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010942
10943static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010944unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010945{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010946 int kind;
10947 void *data;
10948 Py_ssize_t len, i;
10949
10950 if (PyUnicode_READY(self) == -1)
10951 return NULL;
10952
10953 kind = PyUnicode_KIND(self);
10954 data = PyUnicode_DATA(self);
10955 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010956
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010957 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010958 if (len == 1) {
10959 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10960 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
10961 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010962
10963 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010964 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010965 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010966
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010967 for (i = 0; i < len; i++) {
10968 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010969 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010970 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010971 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010972 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010973}
10974
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010975PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010976 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010977\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010978Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010979False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010980
10981static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010982unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010983{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010984 Py_ssize_t i, length;
10985 int kind;
10986 void *data;
10987
10988 if (PyUnicode_READY(self) == -1)
10989 return NULL;
10990 length = PyUnicode_GET_LENGTH(self);
10991 kind = PyUnicode_KIND(self);
10992 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010993
Guido van Rossumd57fd912000-03-10 22:53:23 +000010994 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010995 if (length == 1)
10996 return PyBool_FromLong(
10997 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010998
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010999 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011000 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011001 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011002
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011003 for (i = 0; i < length; i++) {
11004 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011005 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011006 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011007 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011008}
11009
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011010PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011011 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011012\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011013Return True if all characters in S are digits\n\
11014and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011015
11016static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011017unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011018{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011019 Py_ssize_t i, length;
11020 int kind;
11021 void *data;
11022
11023 if (PyUnicode_READY(self) == -1)
11024 return NULL;
11025 length = PyUnicode_GET_LENGTH(self);
11026 kind = PyUnicode_KIND(self);
11027 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011028
Guido van Rossumd57fd912000-03-10 22:53:23 +000011029 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011030 if (length == 1) {
11031 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11032 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11033 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011034
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011035 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011036 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011037 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011038
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011039 for (i = 0; i < length; i++) {
11040 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011041 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011042 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011043 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011044}
11045
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011046PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011047 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011048\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011049Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011050False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011051
11052static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011053unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011054{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011055 Py_ssize_t i, length;
11056 int kind;
11057 void *data;
11058
11059 if (PyUnicode_READY(self) == -1)
11060 return NULL;
11061 length = PyUnicode_GET_LENGTH(self);
11062 kind = PyUnicode_KIND(self);
11063 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011064
Guido van Rossumd57fd912000-03-10 22:53:23 +000011065 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011066 if (length == 1)
11067 return PyBool_FromLong(
11068 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011069
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011070 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011071 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011072 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011073
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011074 for (i = 0; i < length; i++) {
11075 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011076 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011077 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011078 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011079}
11080
Martin v. Löwis47383402007-08-15 07:32:56 +000011081int
11082PyUnicode_IsIdentifier(PyObject *self)
11083{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011084 int kind;
11085 void *data;
11086 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011087 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011088
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011089 if (PyUnicode_READY(self) == -1) {
11090 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011091 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011092 }
11093
11094 /* Special case for empty strings */
11095 if (PyUnicode_GET_LENGTH(self) == 0)
11096 return 0;
11097 kind = PyUnicode_KIND(self);
11098 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011099
11100 /* PEP 3131 says that the first character must be in
11101 XID_Start and subsequent characters in XID_Continue,
11102 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011103 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011104 letters, digits, underscore). However, given the current
11105 definition of XID_Start and XID_Continue, it is sufficient
11106 to check just for these, except that _ must be allowed
11107 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011108 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011109 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011110 return 0;
11111
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011112 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011113 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011114 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011115 return 1;
11116}
11117
11118PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011119 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011120\n\
11121Return True if S is a valid identifier according\n\
11122to the language definition.");
11123
11124static PyObject*
11125unicode_isidentifier(PyObject *self)
11126{
11127 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11128}
11129
Georg Brandl559e5d72008-06-11 18:37:52 +000011130PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011131 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011132\n\
11133Return True if all characters in S are considered\n\
11134printable in repr() or S is empty, False otherwise.");
11135
11136static PyObject*
11137unicode_isprintable(PyObject *self)
11138{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011139 Py_ssize_t i, length;
11140 int kind;
11141 void *data;
11142
11143 if (PyUnicode_READY(self) == -1)
11144 return NULL;
11145 length = PyUnicode_GET_LENGTH(self);
11146 kind = PyUnicode_KIND(self);
11147 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011148
11149 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011150 if (length == 1)
11151 return PyBool_FromLong(
11152 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011153
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011154 for (i = 0; i < length; i++) {
11155 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011156 Py_RETURN_FALSE;
11157 }
11158 }
11159 Py_RETURN_TRUE;
11160}
11161
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011162PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011163 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011164\n\
11165Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011166iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011167
11168static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011169unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011170{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011171 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011172}
11173
Martin v. Löwis18e16552006-02-15 17:27:45 +000011174static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +000011175unicode_length(PyUnicodeObject *self)
11176{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011177 if (PyUnicode_READY(self) == -1)
11178 return -1;
11179 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011180}
11181
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011182PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011183 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011184\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011185Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011186done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011187
11188static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011189unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011190{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011191 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011192 Py_UCS4 fillchar = ' ';
11193
11194 if (PyUnicode_READY(self) == -1)
11195 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011196
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011197 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011198 return NULL;
11199
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011200 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011201 Py_INCREF(self);
11202 return (PyObject*) self;
11203 }
11204
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011205 return (PyObject*) pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011206}
11207
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011208PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011209 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011210\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011211Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011212
11213static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011214unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011215{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011216 return fixup(self, fixlower);
11217}
11218
/* Selects which end(s) of the string a strip operation works on. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format strings, indexed by the strip type above.
   Both the strings and the table itself are read-only. */
static const char *const stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for strip type `i`: the format string with its leading
   three-character "|O:" prefix skipped. */
#define STRIPNAME(i) (stripformat[(i)] + 3)
11227
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011228/* externally visible for str.strip(unicode) */
11229PyObject *
11230_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
11231{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011232 void *data;
11233 int kind;
11234 Py_ssize_t i, j, len;
11235 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011236
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011237 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11238 return NULL;
11239
11240 kind = PyUnicode_KIND(self);
11241 data = PyUnicode_DATA(self);
11242 len = PyUnicode_GET_LENGTH(self);
11243 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11244 PyUnicode_DATA(sepobj),
11245 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011246
Benjamin Peterson14339b62009-01-31 16:36:08 +000011247 i = 0;
11248 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011249 while (i < len &&
11250 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011251 i++;
11252 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011253 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011254
Benjamin Peterson14339b62009-01-31 16:36:08 +000011255 j = len;
11256 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011257 do {
11258 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011259 } while (j >= i &&
11260 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000011261 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011262 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011263
Victor Stinner12bab6d2011-10-01 01:53:49 +020011264 return PyUnicode_Substring((PyObject*)self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011265}
11266
11267PyObject*
11268PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11269{
11270 unsigned char *data;
11271 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011272 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011273
Victor Stinnerde636f32011-10-01 03:55:54 +020011274 if (PyUnicode_READY(self) == -1)
11275 return NULL;
11276
11277 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
11278
Victor Stinner12bab6d2011-10-01 01:53:49 +020011279 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011280 {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011281 if (PyUnicode_CheckExact(self)) {
11282 Py_INCREF(self);
11283 return self;
11284 }
11285 else
11286 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011287 }
11288
Victor Stinner12bab6d2011-10-01 01:53:49 +020011289 length = end - start;
11290 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011291 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011292
Victor Stinnerde636f32011-10-01 03:55:54 +020011293 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011294 PyErr_SetString(PyExc_IndexError, "string index out of range");
11295 return NULL;
11296 }
11297
Victor Stinnerb9275c12011-10-05 14:01:42 +020011298 if (PyUnicode_IS_ASCII(self)) {
11299 kind = PyUnicode_KIND(self);
11300 data = PyUnicode_1BYTE_DATA(self);
11301 return unicode_fromascii(data + start, length);
11302 }
11303 else {
11304 kind = PyUnicode_KIND(self);
11305 data = PyUnicode_1BYTE_DATA(self);
11306 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011307 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020011308 length);
11309 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011310}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011311
11312static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011313do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011314{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011315 int kind;
11316 void *data;
11317 Py_ssize_t len, i, j;
11318
11319 if (PyUnicode_READY(self) == -1)
11320 return NULL;
11321
11322 kind = PyUnicode_KIND(self);
11323 data = PyUnicode_DATA(self);
11324 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011325
Benjamin Peterson14339b62009-01-31 16:36:08 +000011326 i = 0;
11327 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011328 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011329 i++;
11330 }
11331 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011332
Benjamin Peterson14339b62009-01-31 16:36:08 +000011333 j = len;
11334 if (striptype != LEFTSTRIP) {
11335 do {
11336 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011337 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011338 j++;
11339 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011340
Victor Stinner12bab6d2011-10-01 01:53:49 +020011341 return PyUnicode_Substring((PyObject*)self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011342}
11343
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011344
11345static PyObject *
11346do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
11347{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011348 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011349
Benjamin Peterson14339b62009-01-31 16:36:08 +000011350 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11351 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011352
Benjamin Peterson14339b62009-01-31 16:36:08 +000011353 if (sep != NULL && sep != Py_None) {
11354 if (PyUnicode_Check(sep))
11355 return _PyUnicode_XStrip(self, striptype, sep);
11356 else {
11357 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011358 "%s arg must be None or str",
11359 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011360 return NULL;
11361 }
11362 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011363
Benjamin Peterson14339b62009-01-31 16:36:08 +000011364 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011365}
11366
11367
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011368PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011369 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011370\n\
11371Return a copy of the string S with leading and trailing\n\
11372whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011373If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011374
11375static PyObject *
11376unicode_strip(PyUnicodeObject *self, PyObject *args)
11377{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011378 if (PyTuple_GET_SIZE(args) == 0)
11379 return do_strip(self, BOTHSTRIP); /* Common case */
11380 else
11381 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011382}
11383
11384
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011385PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011386 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011387\n\
11388Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011389If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011390
11391static PyObject *
11392unicode_lstrip(PyUnicodeObject *self, PyObject *args)
11393{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011394 if (PyTuple_GET_SIZE(args) == 0)
11395 return do_strip(self, LEFTSTRIP); /* Common case */
11396 else
11397 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011398}
11399
11400
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011401PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011402 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011403\n\
11404Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011405If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011406
11407static PyObject *
11408unicode_rstrip(PyUnicodeObject *self, PyObject *args)
11409{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011410 if (PyTuple_GET_SIZE(args) == 0)
11411 return do_strip(self, RIGHTSTRIP); /* Common case */
11412 else
11413 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011414}
11415
11416
Guido van Rossumd57fd912000-03-10 22:53:23 +000011417static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +000011418unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011419{
11420 PyUnicodeObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011421 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011422
Georg Brandl222de0f2009-04-12 12:01:50 +000011423 if (len < 1) {
11424 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020011425 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000011426 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011427
Tim Peters7a29bd52001-09-12 03:03:31 +000011428 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011429 /* no repeat, return original string */
11430 Py_INCREF(str);
11431 return (PyObject*) str;
11432 }
Tim Peters8f422462000-09-09 06:13:41 +000011433
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011434 if (PyUnicode_READY(str) == -1)
11435 return NULL;
11436
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011437 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011438 PyErr_SetString(PyExc_OverflowError,
11439 "repeated string is too long");
11440 return NULL;
11441 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011442 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011443
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011444 u = (PyUnicodeObject *)PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011445 if (!u)
11446 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011447 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011448
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011449 if (PyUnicode_GET_LENGTH(str) == 1) {
11450 const int kind = PyUnicode_KIND(str);
11451 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
11452 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011453 if (kind == PyUnicode_1BYTE_KIND)
11454 memset(to, (unsigned char)fill_char, len);
11455 else {
11456 for (n = 0; n < len; ++n)
11457 PyUnicode_WRITE(kind, to, n, fill_char);
11458 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011459 }
11460 else {
11461 /* number of characters copied this far */
11462 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011463 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011464 char *to = (char *) PyUnicode_DATA(u);
11465 Py_MEMCPY(to, PyUnicode_DATA(str),
11466 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011467 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011468 n = (done <= nchars-done) ? done : nchars-done;
11469 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011470 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011471 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011472 }
11473
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011474 assert(_PyUnicode_CheckConsistency(u, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011475 return (PyObject*) u;
11476}
11477
Alexander Belopolsky40018472011-02-26 01:02:56 +000011478PyObject *
11479PyUnicode_Replace(PyObject *obj,
11480 PyObject *subobj,
11481 PyObject *replobj,
11482 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011483{
11484 PyObject *self;
11485 PyObject *str1;
11486 PyObject *str2;
11487 PyObject *result;
11488
11489 self = PyUnicode_FromObject(obj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011490 if (self == NULL || PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011491 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011492 str1 = PyUnicode_FromObject(subobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011493 if (str1 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011494 Py_DECREF(self);
11495 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011496 }
11497 str2 = PyUnicode_FromObject(replobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011498 if (str2 == NULL || PyUnicode_READY(str2)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011499 Py_DECREF(self);
11500 Py_DECREF(str1);
11501 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011502 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011503 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011504 Py_DECREF(self);
11505 Py_DECREF(str1);
11506 Py_DECREF(str2);
11507 return result;
11508}
11509
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011510PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011511 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011512\n\
11513Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011514old replaced by new. If the optional argument count is\n\
11515given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011516
11517static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011518unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011519{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011520 PyObject *str1;
11521 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011522 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011523 PyObject *result;
11524
Martin v. Löwis18e16552006-02-15 17:27:45 +000011525 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011526 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011527 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011528 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011529 str1 = PyUnicode_FromObject(str1);
11530 if (str1 == NULL || PyUnicode_READY(str1) == -1)
11531 return NULL;
11532 str2 = PyUnicode_FromObject(str2);
Victor Stinnere9a29352011-10-01 02:14:59 +020011533 if (str2 == NULL || PyUnicode_READY(str2) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011534 Py_DECREF(str1);
11535 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011536 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011537
11538 result = replace(self, str1, str2, maxcount);
11539
11540 Py_DECREF(str1);
11541 Py_DECREF(str2);
11542 return result;
11543}
11544
Alexander Belopolsky40018472011-02-26 01:02:56 +000011545static PyObject *
11546unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011547{
Walter Dörwald79e913e2007-05-12 11:08:06 +000011548 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011549 Py_ssize_t isize;
11550 Py_ssize_t osize, squote, dquote, i, o;
11551 Py_UCS4 max, quote;
11552 int ikind, okind;
11553 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000011554
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011555 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000011556 return NULL;
11557
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011558 isize = PyUnicode_GET_LENGTH(unicode);
11559 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011560
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011561 /* Compute length of output, quote characters, and
11562 maximum character */
11563 osize = 2; /* quotes */
11564 max = 127;
11565 squote = dquote = 0;
11566 ikind = PyUnicode_KIND(unicode);
11567 for (i = 0; i < isize; i++) {
11568 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11569 switch (ch) {
11570 case '\'': squote++; osize++; break;
11571 case '"': dquote++; osize++; break;
11572 case '\\': case '\t': case '\r': case '\n':
11573 osize += 2; break;
11574 default:
11575 /* Fast-path ASCII */
11576 if (ch < ' ' || ch == 0x7f)
11577 osize += 4; /* \xHH */
11578 else if (ch < 0x7f)
11579 osize++;
11580 else if (Py_UNICODE_ISPRINTABLE(ch)) {
11581 osize++;
11582 max = ch > max ? ch : max;
11583 }
11584 else if (ch < 0x100)
11585 osize += 4; /* \xHH */
11586 else if (ch < 0x10000)
11587 osize += 6; /* \uHHHH */
11588 else
11589 osize += 10; /* \uHHHHHHHH */
11590 }
11591 }
11592
11593 quote = '\'';
11594 if (squote) {
11595 if (dquote)
11596 /* Both squote and dquote present. Use squote,
11597 and escape them */
11598 osize += squote;
11599 else
11600 quote = '"';
11601 }
11602
11603 repr = PyUnicode_New(osize, max);
11604 if (repr == NULL)
11605 return NULL;
11606 okind = PyUnicode_KIND(repr);
11607 odata = PyUnicode_DATA(repr);
11608
11609 PyUnicode_WRITE(okind, odata, 0, quote);
11610 PyUnicode_WRITE(okind, odata, osize-1, quote);
11611
11612 for (i = 0, o = 1; i < isize; i++) {
11613 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011614
11615 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011616 if ((ch == quote) || (ch == '\\')) {
11617 PyUnicode_WRITE(okind, odata, o++, '\\');
11618 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011619 continue;
11620 }
11621
Benjamin Peterson29060642009-01-31 22:14:21 +000011622 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011623 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011624 PyUnicode_WRITE(okind, odata, o++, '\\');
11625 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011626 }
11627 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011628 PyUnicode_WRITE(okind, odata, o++, '\\');
11629 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011630 }
11631 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011632 PyUnicode_WRITE(okind, odata, o++, '\\');
11633 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011634 }
11635
11636 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011637 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011638 PyUnicode_WRITE(okind, odata, o++, '\\');
11639 PyUnicode_WRITE(okind, odata, o++, 'x');
11640 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
11641 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011642 }
11643
Georg Brandl559e5d72008-06-11 18:37:52 +000011644 /* Copy ASCII characters as-is */
11645 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011646 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011647 }
11648
Benjamin Peterson29060642009-01-31 22:14:21 +000011649 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000011650 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011651 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000011652 (categories Z* and C* except ASCII space)
11653 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011654 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011655 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011656 if (ch <= 0xff) {
11657 PyUnicode_WRITE(okind, odata, o++, '\\');
11658 PyUnicode_WRITE(okind, odata, o++, 'x');
11659 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
11660 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011661 }
11662 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011663 else if (ch >= 0x10000) {
11664 PyUnicode_WRITE(okind, odata, o++, '\\');
11665 PyUnicode_WRITE(okind, odata, o++, 'U');
11666 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 28) & 0xF]);
11667 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 24) & 0xF]);
11668 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 20) & 0xF]);
11669 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 16) & 0xF]);
11670 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
11671 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
11672 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
11673 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011674 }
11675 /* Map 16-bit characters to '\uxxxx' */
11676 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011677 PyUnicode_WRITE(okind, odata, o++, '\\');
11678 PyUnicode_WRITE(okind, odata, o++, 'u');
11679 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
11680 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
11681 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
11682 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011683 }
11684 }
11685 /* Copy characters as-is */
11686 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011687 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011688 }
11689 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000011690 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011691 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020011692 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000011693 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011694}
11695
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011696PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011697 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011698\n\
11699Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011700such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011701arguments start and end are interpreted as in slice notation.\n\
11702\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011703Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011704
11705static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011706unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011707{
Jesus Ceaac451502011-04-20 17:09:23 +020011708 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011709 Py_ssize_t start;
11710 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011711 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011712
Jesus Ceaac451502011-04-20 17:09:23 +020011713 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
11714 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011715 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011716
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011717 if (PyUnicode_READY(self) == -1)
11718 return NULL;
11719 if (PyUnicode_READY(substring) == -1)
11720 return NULL;
11721
Victor Stinner794d5672011-10-10 03:21:36 +020011722 result = any_find_slice(-1,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011723 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011724 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011725
11726 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011727
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011728 if (result == -2)
11729 return NULL;
11730
Christian Heimes217cfd12007-12-02 14:31:20 +000011731 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011732}
11733
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011734PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011735 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011736\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011737Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011738
11739static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011740unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011741{
Jesus Ceaac451502011-04-20 17:09:23 +020011742 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011743 Py_ssize_t start;
11744 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011745 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011746
Jesus Ceaac451502011-04-20 17:09:23 +020011747 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
11748 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011749 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011750
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011751 if (PyUnicode_READY(self) == -1)
11752 return NULL;
11753 if (PyUnicode_READY(substring) == -1)
11754 return NULL;
11755
Victor Stinner794d5672011-10-10 03:21:36 +020011756 result = any_find_slice(-1,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011757 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011758 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011759
11760 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011761
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011762 if (result == -2)
11763 return NULL;
11764
Guido van Rossumd57fd912000-03-10 22:53:23 +000011765 if (result < 0) {
11766 PyErr_SetString(PyExc_ValueError, "substring not found");
11767 return NULL;
11768 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011769
Christian Heimes217cfd12007-12-02 14:31:20 +000011770 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011771}
11772
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011773PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011774 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011775\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011776Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011777done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011778
11779static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011780unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011781{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011782 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011783 Py_UCS4 fillchar = ' ';
11784
Victor Stinnere9a29352011-10-01 02:14:59 +020011785 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011786 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011787
Victor Stinnere9a29352011-10-01 02:14:59 +020011788 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011789 return NULL;
11790
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011791 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011792 Py_INCREF(self);
11793 return (PyObject*) self;
11794 }
11795
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011796 return (PyObject*) pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011797}
11798
Alexander Belopolsky40018472011-02-26 01:02:56 +000011799PyObject *
11800PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011801{
11802 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000011803
Guido van Rossumd57fd912000-03-10 22:53:23 +000011804 s = PyUnicode_FromObject(s);
11805 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011806 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011807 if (sep != NULL) {
11808 sep = PyUnicode_FromObject(sep);
11809 if (sep == NULL) {
11810 Py_DECREF(s);
11811 return NULL;
11812 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011813 }
11814
Victor Stinner9310abb2011-10-05 00:59:23 +020011815 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011816
11817 Py_DECREF(s);
11818 Py_XDECREF(sep);
11819 return result;
11820}
11821
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011822PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011823 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011824\n\
11825Return a list of the words in S, using sep as the\n\
11826delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000011827splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000011828whitespace string is a separator and empty strings are\n\
11829removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011830
11831static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011832unicode_split(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011833{
11834 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011835 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011836
Martin v. Löwis18e16552006-02-15 17:27:45 +000011837 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011838 return NULL;
11839
11840 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011841 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011842 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020011843 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011844 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011845 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011846}
11847
Thomas Wouters477c8d52006-05-27 19:21:47 +000011848PyObject *
11849PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
11850{
11851 PyObject* str_obj;
11852 PyObject* sep_obj;
11853 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011854 int kind1, kind2, kind;
11855 void *buf1 = NULL, *buf2 = NULL;
11856 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011857
11858 str_obj = PyUnicode_FromObject(str_in);
Victor Stinnere9a29352011-10-01 02:14:59 +020011859 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011860 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011861 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011862 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000011863 Py_DECREF(str_obj);
11864 return NULL;
11865 }
11866
Victor Stinner14f8f022011-10-05 20:58:25 +020011867 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011868 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020011869 kind = Py_MAX(kind1, kind2);
11870 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011871 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020011872 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011873 if (!buf1)
11874 goto onError;
11875 buf2 = PyUnicode_DATA(sep_obj);
11876 if (kind2 != kind)
11877 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11878 if (!buf2)
11879 goto onError;
11880 len1 = PyUnicode_GET_LENGTH(str_obj);
11881 len2 = PyUnicode_GET_LENGTH(sep_obj);
11882
Victor Stinner14f8f022011-10-05 20:58:25 +020011883 switch(PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011884 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020011885 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
11886 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11887 else
11888 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011889 break;
11890 case PyUnicode_2BYTE_KIND:
11891 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11892 break;
11893 case PyUnicode_4BYTE_KIND:
11894 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11895 break;
11896 default:
11897 assert(0);
11898 out = 0;
11899 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011900
11901 Py_DECREF(sep_obj);
11902 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011903 if (kind1 != kind)
11904 PyMem_Free(buf1);
11905 if (kind2 != kind)
11906 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011907
11908 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011909 onError:
11910 Py_DECREF(sep_obj);
11911 Py_DECREF(str_obj);
11912 if (kind1 != kind && buf1)
11913 PyMem_Free(buf1);
11914 if (kind2 != kind && buf2)
11915 PyMem_Free(buf2);
11916 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011917}
11918
11919
11920PyObject *
11921PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
11922{
11923 PyObject* str_obj;
11924 PyObject* sep_obj;
11925 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011926 int kind1, kind2, kind;
11927 void *buf1 = NULL, *buf2 = NULL;
11928 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011929
11930 str_obj = PyUnicode_FromObject(str_in);
11931 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000011932 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011933 sep_obj = PyUnicode_FromObject(sep_in);
11934 if (!sep_obj) {
11935 Py_DECREF(str_obj);
11936 return NULL;
11937 }
11938
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011939 kind1 = PyUnicode_KIND(str_in);
11940 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020011941 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011942 buf1 = PyUnicode_DATA(str_in);
11943 if (kind1 != kind)
11944 buf1 = _PyUnicode_AsKind(str_in, kind);
11945 if (!buf1)
11946 goto onError;
11947 buf2 = PyUnicode_DATA(sep_obj);
11948 if (kind2 != kind)
11949 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11950 if (!buf2)
11951 goto onError;
11952 len1 = PyUnicode_GET_LENGTH(str_obj);
11953 len2 = PyUnicode_GET_LENGTH(sep_obj);
11954
11955 switch(PyUnicode_KIND(str_in)) {
11956 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020011957 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
11958 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11959 else
11960 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011961 break;
11962 case PyUnicode_2BYTE_KIND:
11963 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11964 break;
11965 case PyUnicode_4BYTE_KIND:
11966 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11967 break;
11968 default:
11969 assert(0);
11970 out = 0;
11971 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011972
11973 Py_DECREF(sep_obj);
11974 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011975 if (kind1 != kind)
11976 PyMem_Free(buf1);
11977 if (kind2 != kind)
11978 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011979
11980 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011981 onError:
11982 Py_DECREF(sep_obj);
11983 Py_DECREF(str_obj);
11984 if (kind1 != kind && buf1)
11985 PyMem_Free(buf1);
11986 if (kind2 != kind && buf2)
11987 PyMem_Free(buf2);
11988 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011989}
11990
11991PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011992 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011993\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011994Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011995the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011996found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011997
11998static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011999unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012000{
Victor Stinner9310abb2011-10-05 00:59:23 +020012001 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012002}
12003
12004PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012005 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012006\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012007Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012008the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012009separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012010
12011static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012012unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012013{
Victor Stinner9310abb2011-10-05 00:59:23 +020012014 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012015}
12016
Alexander Belopolsky40018472011-02-26 01:02:56 +000012017PyObject *
12018PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012019{
12020 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012021
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012022 s = PyUnicode_FromObject(s);
12023 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012024 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012025 if (sep != NULL) {
12026 sep = PyUnicode_FromObject(sep);
12027 if (sep == NULL) {
12028 Py_DECREF(s);
12029 return NULL;
12030 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012031 }
12032
Victor Stinner9310abb2011-10-05 00:59:23 +020012033 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012034
12035 Py_DECREF(s);
12036 Py_XDECREF(sep);
12037 return result;
12038}
12039
12040PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012041 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012042\n\
12043Return a list of the words in S, using sep as the\n\
12044delimiter string, starting at the end of the string and\n\
12045working to the front. If maxsplit is given, at most maxsplit\n\
12046splits are done. If sep is not specified, any whitespace string\n\
12047is a separator.");
12048
12049static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012050unicode_rsplit(PyObject *self, PyObject *args)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012051{
12052 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012053 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012054
Martin v. Löwis18e16552006-02-15 17:27:45 +000012055 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012056 return NULL;
12057
12058 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012059 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012060 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012061 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012062 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012063 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012064}
12065
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012066PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012067 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012068\n\
12069Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012070Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012071is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012072
12073static PyObject*
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012074unicode_splitlines(PyUnicodeObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012075{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012076 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012077 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012078
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012079 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12080 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012081 return NULL;
12082
Guido van Rossum86662912000-04-11 15:38:46 +000012083 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012084}
12085
12086static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012087PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012088{
Walter Dörwald346737f2007-05-31 10:44:43 +000012089 if (PyUnicode_CheckExact(self)) {
12090 Py_INCREF(self);
12091 return self;
12092 } else
12093 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinner034f6cf2011-09-30 02:26:44 +020012094 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012095}
12096
/* Docstring for the "swapcase" entry of unicode_methods. */
PyDoc_STRVAR(swapcase__doc__,
             "S.swapcase() -> str\n\
\n\
Return a copy of S with uppercase characters converted to lowercase\n\
and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012102
12103static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012104unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012105{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012106 return fixup(self, fixswapcase);
12107}
12108
/* Docstring for the static "maketrans" entry of unicode_methods. */
PyDoc_STRVAR(maketrans__doc__,
             "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
\n\
Return a translation table usable for str.translate().\n\
If there is only one argument, it must be a dictionary mapping Unicode\n\
ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Character keys will be then converted to ordinals.\n\
If there are two arguments, they must be strings of equal length, and\n\
in the resulting dictionary, each character in x will be mapped to the\n\
character at the same position in y. If there is a third argument, it\n\
must be a string, whose characters will be mapped to None in the result.");
12120
12121static PyObject*
12122unicode_maketrans(PyUnicodeObject *null, PyObject *args)
12123{
12124 PyObject *x, *y = NULL, *z = NULL;
12125 PyObject *new = NULL, *key, *value;
12126 Py_ssize_t i = 0;
12127 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012128
Georg Brandlceee0772007-11-27 23:48:05 +000012129 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12130 return NULL;
12131 new = PyDict_New();
12132 if (!new)
12133 return NULL;
12134 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012135 int x_kind, y_kind, z_kind;
12136 void *x_data, *y_data, *z_data;
12137
Georg Brandlceee0772007-11-27 23:48:05 +000012138 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012139 if (!PyUnicode_Check(x)) {
12140 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12141 "be a string if there is a second argument");
12142 goto err;
12143 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012144 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012145 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12146 "arguments must have equal length");
12147 goto err;
12148 }
12149 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012150 x_kind = PyUnicode_KIND(x);
12151 y_kind = PyUnicode_KIND(y);
12152 x_data = PyUnicode_DATA(x);
12153 y_data = PyUnicode_DATA(y);
12154 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12155 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
12156 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012157 if (!key || !value)
12158 goto err;
12159 res = PyDict_SetItem(new, key, value);
12160 Py_DECREF(key);
12161 Py_DECREF(value);
12162 if (res < 0)
12163 goto err;
12164 }
12165 /* create entries for deleting chars in z */
12166 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012167 z_kind = PyUnicode_KIND(z);
12168 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012169 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012170 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012171 if (!key)
12172 goto err;
12173 res = PyDict_SetItem(new, key, Py_None);
12174 Py_DECREF(key);
12175 if (res < 0)
12176 goto err;
12177 }
12178 }
12179 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012180 int kind;
12181 void *data;
12182
Georg Brandlceee0772007-11-27 23:48:05 +000012183 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012184 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012185 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12186 "to maketrans it must be a dict");
12187 goto err;
12188 }
12189 /* copy entries into the new dict, converting string keys to int keys */
12190 while (PyDict_Next(x, &i, &key, &value)) {
12191 if (PyUnicode_Check(key)) {
12192 /* convert string keys to integer keys */
12193 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012194 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000012195 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12196 "table must be of length 1");
12197 goto err;
12198 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012199 kind = PyUnicode_KIND(key);
12200 data = PyUnicode_DATA(key);
12201 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012202 if (!newkey)
12203 goto err;
12204 res = PyDict_SetItem(new, newkey, value);
12205 Py_DECREF(newkey);
12206 if (res < 0)
12207 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012208 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012209 /* just keep integer keys */
12210 if (PyDict_SetItem(new, key, value) < 0)
12211 goto err;
12212 } else {
12213 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12214 "be strings or integers");
12215 goto err;
12216 }
12217 }
12218 }
12219 return new;
12220 err:
12221 Py_DECREF(new);
12222 return NULL;
12223}
12224
/* Docstring for the "translate" entry of unicode_methods. */
PyDoc_STRVAR(translate__doc__,
             "S.translate(table) -> str\n\
\n\
Return a copy of the string S, where all characters have been mapped\n\
through the given translation table, which must be a mapping of\n\
Unicode ordinals to Unicode ordinals, strings, or None.\n\
Unmapped characters are left untouched. Characters mapped to None\n\
are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012233
12234static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012235unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012236{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012237 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012238}
12239
/* Docstring for the "upper" entry of unicode_methods. */
PyDoc_STRVAR(upper__doc__,
             "S.upper() -> str\n\
\n\
Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012244
12245static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012246unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012247{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012248 return fixup(self, fixupper);
12249}
12250
/* Docstring for the "zfill" entry of unicode_methods. */
PyDoc_STRVAR(zfill__doc__,
             "S.zfill(width) -> str\n\
\n\
Pad a numeric string S with zeros on the left, to fill a field\n\
of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012256
12257static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012258unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012259{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012260 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020012261 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012262 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012263 int kind;
12264 void *data;
12265 Py_UCS4 chr;
12266
12267 if (PyUnicode_READY(self) == -1)
12268 return NULL;
12269
Martin v. Löwis18e16552006-02-15 17:27:45 +000012270 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012271 return NULL;
12272
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012273 if (PyUnicode_GET_LENGTH(self) >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +000012274 if (PyUnicode_CheckExact(self)) {
12275 Py_INCREF(self);
12276 return (PyObject*) self;
12277 }
12278 else
Victor Stinner2219e0a2011-10-01 01:16:59 +020012279 return PyUnicode_Copy((PyObject*)self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012280 }
12281
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012282 fill = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012283
12284 u = pad(self, fill, 0, '0');
12285
Walter Dörwald068325e2002-04-15 13:36:47 +000012286 if (u == NULL)
12287 return NULL;
12288
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012289 kind = PyUnicode_KIND(u);
12290 data = PyUnicode_DATA(u);
12291 chr = PyUnicode_READ(kind, data, fill);
12292
12293 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012294 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012295 PyUnicode_WRITE(kind, data, 0, chr);
12296 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012297 }
12298
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012299 assert(_PyUnicode_CheckConsistency(u, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012300 return (PyObject*) u;
12301}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012302
#if 0
/* Compiled out.  Thin wrapper that forwards self to
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
12310
/* Docstring for the "startswith" entry of unicode_methods. */
PyDoc_STRVAR(startswith__doc__,
             "S.startswith(prefix[, start[, end]]) -> bool\n\
\n\
Return True if S starts with the specified prefix, False otherwise.\n\
With optional start, test S beginning at that position.\n\
With optional end, stop comparing S at that position.\n\
prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012318
12319static PyObject *
12320unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012321 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012322{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012323 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012324 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012325 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012326 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012327 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012328
Jesus Ceaac451502011-04-20 17:09:23 +020012329 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012330 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012331 if (PyTuple_Check(subobj)) {
12332 Py_ssize_t i;
12333 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12334 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012335 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012336 if (substring == NULL)
12337 return NULL;
12338 result = tailmatch(self, substring, start, end, -1);
12339 Py_DECREF(substring);
12340 if (result) {
12341 Py_RETURN_TRUE;
12342 }
12343 }
12344 /* nothing matched */
12345 Py_RETURN_FALSE;
12346 }
12347 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012348 if (substring == NULL) {
12349 if (PyErr_ExceptionMatches(PyExc_TypeError))
12350 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12351 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012352 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012353 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012354 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012355 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012356 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012357}
12358
12359
/* Docstring for the "endswith" entry of unicode_methods. */
PyDoc_STRVAR(endswith__doc__,
             "S.endswith(suffix[, start[, end]]) -> bool\n\
\n\
Return True if S ends with the specified suffix, False otherwise.\n\
With optional start, test S beginning at that position.\n\
With optional end, stop comparing S at that position.\n\
suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012367
12368static PyObject *
12369unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012370 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012371{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012372 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012373 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012374 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012375 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012376 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012377
Jesus Ceaac451502011-04-20 17:09:23 +020012378 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012379 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012380 if (PyTuple_Check(subobj)) {
12381 Py_ssize_t i;
12382 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12383 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012384 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012385 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012386 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012387 result = tailmatch(self, substring, start, end, +1);
12388 Py_DECREF(substring);
12389 if (result) {
12390 Py_RETURN_TRUE;
12391 }
12392 }
12393 Py_RETURN_FALSE;
12394 }
12395 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012396 if (substring == NULL) {
12397 if (PyErr_ExceptionMatches(PyExc_TypeError))
12398 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12399 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012400 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012401 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012402 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012403 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012404 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012405}
12406
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012407#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000012408
/* Docstring for the "format" entry of unicode_methods. */
PyDoc_STRVAR(format__doc__,
             "S.format(*args, **kwargs) -> str\n\
\n\
Return a formatted version of S, using substitutions from args and kwargs.\n\
The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000012414
/* Docstring for the "format_map" entry of unicode_methods. */
PyDoc_STRVAR(format_map__doc__,
             "S.format_map(mapping) -> str\n\
\n\
Return a formatted version of S, using substitutions from mapping.\n\
The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000012420
Eric Smith4a7d76d2008-05-30 18:10:19 +000012421static PyObject *
12422unicode__format__(PyObject* self, PyObject* args)
12423{
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012424 PyObject *format_spec, *out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012425
12426 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12427 return NULL;
12428
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012429 out = _PyUnicode_FormatAdvanced(self, format_spec, 0,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012430 PyUnicode_GET_LENGTH(format_spec));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012431 return out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012432}
12433
/* Docstring for the "__format__" entry of unicode_methods. */
PyDoc_STRVAR(p_format__doc__,
             "S.__format__(format_spec) -> str\n\
\n\
Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000012438
12439static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012440unicode__sizeof__(PyUnicodeObject *v)
12441{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012442 Py_ssize_t size;
12443
12444 /* If it's a compact object, account for base structure +
12445 character data. */
12446 if (PyUnicode_IS_COMPACT_ASCII(v))
12447 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12448 else if (PyUnicode_IS_COMPACT(v))
12449 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012450 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012451 else {
12452 /* If it is a two-block object, account for base object, and
12453 for character block if present. */
12454 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020012455 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012456 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012457 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012458 }
12459 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020012460 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020012461 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012462 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020012463 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020012464 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012465
12466 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012467}
12468
/* Docstring for the "__sizeof__" entry of unicode_methods. */
PyDoc_STRVAR(sizeof__doc__,
             "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012471
12472static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020012473unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012474{
Victor Stinner034f6cf2011-09-30 02:26:44 +020012475 PyObject *copy = PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012476 if (!copy)
12477 return NULL;
12478 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012479}
12480
/* Method table: one entry per Python-level method, giving its name, the
   C function implementing it (cast to PyCFunction), the calling
   convention flags, and the docstring.  Entries that omit the docstring
   field leave it zero-initialized.  The {NULL, NULL} entry closes the
   table. */
static PyMethodDef unicode_methods[] = {

    /* Order is according to common usage: often used methods should
       appear first, since lookup is done sequentially. */

    {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
    {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
    {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
    {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
    {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
    {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
    {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
    /* maketrans takes no instance: it is registered as a static method */
    {"maketrans", (PyCFunction) unicode_maketrans,
     METH_VARARGS | METH_STATIC, maketrans__doc__},
    {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
#if 0
    {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
#endif

#if 0
    /* These methods are just used for debugging the implementation. */
    {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
#endif

    {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
    {NULL, NULL}
};
12544
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012545static PyObject *
12546unicode_mod(PyObject *v, PyObject *w)
12547{
Brian Curtindfc80e32011-08-10 20:28:54 -050012548 if (!PyUnicode_Check(v))
12549 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000012550 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012551}
12552
/* Number protocol slots.  The initializer is positional: only the first
   four slots are listed and only nb_remainder is non-zero (unicode_mod);
   the slots that are not listed are zero-initialized. */
static PyNumberMethods unicode_as_number = {
    0,              /*nb_add*/
    0,              /*nb_subtract*/
    0,              /*nb_multiply*/
    unicode_mod,            /*nb_remainder*/
};
12559
/* Sequence protocol slots (positional initializer; a 0 marks a slot
   without an implementation). */
static PySequenceMethods unicode_as_sequence = {
    (lenfunc) unicode_length,       /* sq_length */
    PyUnicode_Concat,           /* sq_concat */
    (ssizeargfunc) unicode_repeat,  /* sq_repeat */
    (ssizeargfunc) unicode_getitem,     /* sq_item */
    0,                  /* sq_slice */
    0,                  /* sq_ass_item */
    0,                  /* sq_ass_slice */
    PyUnicode_Contains,         /* sq_contains */
};
12570
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012571static PyObject*
12572unicode_subscript(PyUnicodeObject* self, PyObject* item)
12573{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012574 if (PyUnicode_READY(self) == -1)
12575 return NULL;
12576
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000012577 if (PyIndex_Check(item)) {
12578 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012579 if (i == -1 && PyErr_Occurred())
12580 return NULL;
12581 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012582 i += PyUnicode_GET_LENGTH(self);
Victor Stinner2fe5ced2011-10-02 00:25:40 +020012583 return unicode_getitem((PyObject*)self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012584 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000012585 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012586 PyObject *result;
12587 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012588 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012589 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012590
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012591 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000012592 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012593 return NULL;
12594 }
12595
12596 if (slicelength <= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012597 return PyUnicode_New(0, 0);
12598 } else if (start == 0 && step == 1 &&
12599 slicelength == PyUnicode_GET_LENGTH(self) &&
Thomas Woutersed03b412007-08-28 21:37:11 +000012600 PyUnicode_CheckExact(self)) {
12601 Py_INCREF(self);
12602 return (PyObject *)self;
12603 } else if (step == 1) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012604 return PyUnicode_Substring((PyObject*)self,
12605 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012606 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012607 /* General case */
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012608 max_char = 0;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012609 src_kind = PyUnicode_KIND(self);
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012610 kind_limit = kind_maxchar_limit(src_kind);
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012611 src_data = PyUnicode_DATA(self);
12612 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
12613 ch = PyUnicode_READ(src_kind, src_data, cur);
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012614 if (ch > max_char) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012615 max_char = ch;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012616 if (max_char >= kind_limit)
12617 break;
12618 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012619 }
12620 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012621 if (result == NULL)
12622 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012623 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012624 dest_data = PyUnicode_DATA(result);
12625
12626 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012627 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
12628 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012629 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012630 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012631 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012632 } else {
12633 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
12634 return NULL;
12635 }
12636}
12637
12638static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012639 (lenfunc)unicode_length, /* mp_length */
12640 (binaryfunc)unicode_subscript, /* mp_subscript */
12641 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012642};
12643
Guido van Rossumd57fd912000-03-10 22:53:23 +000012644
Guido van Rossumd57fd912000-03-10 22:53:23 +000012645/* Helpers for PyUnicode_Format() */
12646
12647static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000012648getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012649{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012650 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012651 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012652 (*p_argidx)++;
12653 if (arglen < 0)
12654 return args;
12655 else
12656 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012657 }
12658 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012659 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012660 return NULL;
12661}
12662
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012663/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012664
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012665static PyObject *
12666formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012667{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012668 char *p;
12669 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012670 double x;
Tim Petersced69f82003-09-16 20:30:58 +000012671
Guido van Rossumd57fd912000-03-10 22:53:23 +000012672 x = PyFloat_AsDouble(v);
12673 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012674 return NULL;
12675
Guido van Rossumd57fd912000-03-10 22:53:23 +000012676 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012677 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000012678
Eric Smith0923d1d2009-04-16 20:16:10 +000012679 p = PyOS_double_to_string(x, type, prec,
12680 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012681 if (p == NULL)
12682 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012683 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000012684 PyMem_Free(p);
12685 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012686}
12687
Tim Peters38fd5b62000-09-21 05:43:11 +000012688static PyObject*
12689formatlong(PyObject *val, int flags, int prec, int type)
12690{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012691 char *buf;
12692 int len;
12693 PyObject *str; /* temporary string object. */
12694 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000012695
Benjamin Peterson14339b62009-01-31 16:36:08 +000012696 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
12697 if (!str)
12698 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012699 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012700 Py_DECREF(str);
12701 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000012702}
12703
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012704static Py_UCS4
12705formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012706{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000012707 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012708 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012709 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012710 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000012711 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012712 goto onError;
12713 }
12714 else {
12715 /* Integer input truncated to a character */
12716 long x;
12717 x = PyLong_AsLong(v);
12718 if (x == -1 && PyErr_Occurred())
12719 goto onError;
12720
12721 if (x < 0 || x > 0x10ffff) {
12722 PyErr_SetString(PyExc_OverflowError,
12723 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012724 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000012725 }
12726
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012727 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012728 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000012729
Benjamin Peterson29060642009-01-31 22:14:21 +000012730 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012731 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012732 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012733 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012734}
12735
Antoine Pitrou978b9d22011-10-07 12:35:48 +020012736static int
12737repeat_accumulate(_PyAccu *acc, PyObject *obj, Py_ssize_t count)
12738{
12739 int r;
12740 assert(count > 0);
12741 assert(PyUnicode_Check(obj));
12742 if (count > 5) {
12743 PyObject *repeated = unicode_repeat((PyUnicodeObject *) obj, count);
12744 if (repeated == NULL)
12745 return -1;
12746 r = _PyAccu_Accumulate(acc, repeated);
12747 Py_DECREF(repeated);
12748 return r;
12749 }
12750 else {
12751 do {
12752 if (_PyAccu_Accumulate(acc, obj))
12753 return -1;
12754 } while (--count);
12755 return 0;
12756 }
12757}
12758
Alexander Belopolsky40018472011-02-26 01:02:56 +000012759PyObject *
12760PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012761{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012762 void *fmt;
12763 int fmtkind;
12764 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012765 int kind;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012766 int r;
12767 Py_ssize_t fmtcnt, fmtpos, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012768 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012769 PyObject *dict = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012770 PyObject *temp = NULL;
12771 PyObject *second = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012772 PyUnicodeObject *uformat;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012773 _PyAccu acc;
12774 static PyObject *plus, *minus, *blank, *zero, *percent;
12775
12776 if (!plus && !(plus = get_latin1_char('+')))
12777 return NULL;
12778 if (!minus && !(minus = get_latin1_char('-')))
12779 return NULL;
12780 if (!blank && !(blank = get_latin1_char(' ')))
12781 return NULL;
12782 if (!zero && !(zero = get_latin1_char('0')))
12783 return NULL;
12784 if (!percent && !(percent = get_latin1_char('%')))
12785 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000012786
Guido van Rossumd57fd912000-03-10 22:53:23 +000012787 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012788 PyErr_BadInternalCall();
12789 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012790 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012791 uformat = (PyUnicodeObject*)PyUnicode_FromObject(format);
12792 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012793 return NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012794 if (_PyAccu_Init(&acc))
12795 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012796 fmt = PyUnicode_DATA(uformat);
12797 fmtkind = PyUnicode_KIND(uformat);
12798 fmtcnt = PyUnicode_GET_LENGTH(uformat);
12799 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012800
Guido van Rossumd57fd912000-03-10 22:53:23 +000012801 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012802 arglen = PyTuple_Size(args);
12803 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012804 }
12805 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012806 arglen = -1;
12807 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012808 }
Christian Heimes90aa7642007-12-19 02:45:37 +000012809 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000012810 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000012811 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012812
12813 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012814 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012815 PyObject *nonfmt;
12816 Py_ssize_t nonfmtpos;
12817 nonfmtpos = fmtpos++;
12818 while (fmtcnt >= 0 &&
12819 PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
12820 fmtpos++;
12821 fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012822 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012823 nonfmt = PyUnicode_Substring((PyObject *) uformat, nonfmtpos, fmtpos);
12824 if (nonfmt == NULL)
12825 goto onError;
12826 r = _PyAccu_Accumulate(&acc, nonfmt);
12827 Py_DECREF(nonfmt);
12828 if (r)
12829 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012830 }
12831 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012832 /* Got a format specifier */
12833 int flags = 0;
12834 Py_ssize_t width = -1;
12835 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012836 Py_UCS4 c = '\0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012837 Py_UCS4 fill, sign;
Benjamin Peterson29060642009-01-31 22:14:21 +000012838 int isnumok;
12839 PyObject *v = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012840 void *pbuf = NULL;
12841 Py_ssize_t pindex, len;
12842 PyObject *signobj = NULL, *fillobj = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012843
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012844 fmtpos++;
12845 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
12846 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000012847 Py_ssize_t keylen;
12848 PyObject *key;
12849 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000012850
Benjamin Peterson29060642009-01-31 22:14:21 +000012851 if (dict == NULL) {
12852 PyErr_SetString(PyExc_TypeError,
12853 "format requires a mapping");
12854 goto onError;
12855 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012856 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012857 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012858 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012859 /* Skip over balanced parentheses */
12860 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012861 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000012862 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012863 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000012864 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012865 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000012866 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012867 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000012868 if (fmtcnt < 0 || pcount > 0) {
12869 PyErr_SetString(PyExc_ValueError,
12870 "incomplete format key");
12871 goto onError;
12872 }
Victor Stinner12bab6d2011-10-01 01:53:49 +020012873 key = PyUnicode_Substring((PyObject*)uformat,
12874 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000012875 if (key == NULL)
12876 goto onError;
12877 if (args_owned) {
12878 Py_DECREF(args);
12879 args_owned = 0;
12880 }
12881 args = PyObject_GetItem(dict, key);
12882 Py_DECREF(key);
12883 if (args == NULL) {
12884 goto onError;
12885 }
12886 args_owned = 1;
12887 arglen = -1;
12888 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012889 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012890 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012891 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012892 case '-': flags |= F_LJUST; continue;
12893 case '+': flags |= F_SIGN; continue;
12894 case ' ': flags |= F_BLANK; continue;
12895 case '#': flags |= F_ALT; continue;
12896 case '0': flags |= F_ZERO; continue;
12897 }
12898 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012899 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012900 if (c == '*') {
12901 v = getnextarg(args, arglen, &argidx);
12902 if (v == NULL)
12903 goto onError;
12904 if (!PyLong_Check(v)) {
12905 PyErr_SetString(PyExc_TypeError,
12906 "* wants int");
12907 goto onError;
12908 }
12909 width = PyLong_AsLong(v);
12910 if (width == -1 && PyErr_Occurred())
12911 goto onError;
12912 if (width < 0) {
12913 flags |= F_LJUST;
12914 width = -width;
12915 }
12916 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012917 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012918 }
12919 else if (c >= '0' && c <= '9') {
12920 width = c - '0';
12921 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012922 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012923 if (c < '0' || c > '9')
12924 break;
12925 if ((width*10) / 10 != width) {
12926 PyErr_SetString(PyExc_ValueError,
12927 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000012928 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000012929 }
12930 width = width*10 + (c - '0');
12931 }
12932 }
12933 if (c == '.') {
12934 prec = 0;
12935 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012936 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012937 if (c == '*') {
12938 v = getnextarg(args, arglen, &argidx);
12939 if (v == NULL)
12940 goto onError;
12941 if (!PyLong_Check(v)) {
12942 PyErr_SetString(PyExc_TypeError,
12943 "* wants int");
12944 goto onError;
12945 }
12946 prec = PyLong_AsLong(v);
12947 if (prec == -1 && PyErr_Occurred())
12948 goto onError;
12949 if (prec < 0)
12950 prec = 0;
12951 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012952 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012953 }
12954 else if (c >= '0' && c <= '9') {
12955 prec = c - '0';
12956 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012957 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012958 if (c < '0' || c > '9')
12959 break;
12960 if ((prec*10) / 10 != prec) {
12961 PyErr_SetString(PyExc_ValueError,
12962 "prec too big");
12963 goto onError;
12964 }
12965 prec = prec*10 + (c - '0');
12966 }
12967 }
12968 } /* prec */
12969 if (fmtcnt >= 0) {
12970 if (c == 'h' || c == 'l' || c == 'L') {
12971 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012972 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012973 }
12974 }
12975 if (fmtcnt < 0) {
12976 PyErr_SetString(PyExc_ValueError,
12977 "incomplete format");
12978 goto onError;
12979 }
12980 if (c != '%') {
12981 v = getnextarg(args, arglen, &argidx);
12982 if (v == NULL)
12983 goto onError;
12984 }
12985 sign = 0;
12986 fill = ' ';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012987 fillobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000012988 switch (c) {
12989
12990 case '%':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012991 _PyAccu_Accumulate(&acc, percent);
12992 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +000012993
12994 case 's':
12995 case 'r':
12996 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000012997 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012998 temp = v;
12999 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013000 }
13001 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013002 if (c == 's')
13003 temp = PyObject_Str(v);
13004 else if (c == 'r')
13005 temp = PyObject_Repr(v);
13006 else
13007 temp = PyObject_ASCII(v);
13008 if (temp == NULL)
13009 goto onError;
13010 if (PyUnicode_Check(temp))
13011 /* nothing to do */;
13012 else {
13013 Py_DECREF(temp);
13014 PyErr_SetString(PyExc_TypeError,
13015 "%s argument has non-string str()");
13016 goto onError;
13017 }
13018 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013019 if (PyUnicode_READY(temp) == -1) {
13020 Py_CLEAR(temp);
13021 goto onError;
13022 }
13023 pbuf = PyUnicode_DATA(temp);
13024 kind = PyUnicode_KIND(temp);
13025 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013026 if (prec >= 0 && len > prec)
13027 len = prec;
13028 break;
13029
13030 case 'i':
13031 case 'd':
13032 case 'u':
13033 case 'o':
13034 case 'x':
13035 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000013036 isnumok = 0;
13037 if (PyNumber_Check(v)) {
13038 PyObject *iobj=NULL;
13039
13040 if (PyLong_Check(v)) {
13041 iobj = v;
13042 Py_INCREF(iobj);
13043 }
13044 else {
13045 iobj = PyNumber_Long(v);
13046 }
13047 if (iobj!=NULL) {
13048 if (PyLong_Check(iobj)) {
13049 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070013050 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000013051 Py_DECREF(iobj);
13052 if (!temp)
13053 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013054 if (PyUnicode_READY(temp) == -1) {
13055 Py_CLEAR(temp);
13056 goto onError;
13057 }
13058 pbuf = PyUnicode_DATA(temp);
13059 kind = PyUnicode_KIND(temp);
13060 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013061 sign = 1;
13062 }
13063 else {
13064 Py_DECREF(iobj);
13065 }
13066 }
13067 }
13068 if (!isnumok) {
13069 PyErr_Format(PyExc_TypeError,
13070 "%%%c format: a number is required, "
13071 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
13072 goto onError;
13073 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013074 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013075 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013076 fillobj = zero;
13077 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013078 break;
13079
13080 case 'e':
13081 case 'E':
13082 case 'f':
13083 case 'F':
13084 case 'g':
13085 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013086 temp = formatfloat(v, flags, prec, c);
13087 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000013088 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013089 if (PyUnicode_READY(temp) == -1) {
13090 Py_CLEAR(temp);
13091 goto onError;
13092 }
13093 pbuf = PyUnicode_DATA(temp);
13094 kind = PyUnicode_KIND(temp);
13095 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013096 sign = 1;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013097 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013098 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013099 fillobj = zero;
13100 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013101 break;
13102
13103 case 'c':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013104 {
13105 Py_UCS4 ch = formatchar(v);
13106 if (ch == (Py_UCS4) -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013107 goto onError;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013108 temp = _PyUnicode_FromUCS4(&ch, 1);
13109 if (temp == NULL)
13110 goto onError;
13111 pbuf = PyUnicode_DATA(temp);
13112 kind = PyUnicode_KIND(temp);
13113 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013114 break;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013115 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013116
13117 default:
13118 PyErr_Format(PyExc_ValueError,
13119 "unsupported format character '%c' (0x%x) "
13120 "at index %zd",
13121 (31<=c && c<=126) ? (char)c : '?',
13122 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013123 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000013124 goto onError;
13125 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013126 /* pbuf is initialized here. */
13127 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000013128 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013129 if (PyUnicode_READ(kind, pbuf, pindex) == '-') {
13130 signobj = minus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013131 len--;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013132 pindex++;
13133 }
13134 else if (PyUnicode_READ(kind, pbuf, pindex) == '+') {
13135 signobj = plus;
13136 len--;
13137 pindex++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013138 }
13139 else if (flags & F_SIGN)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013140 signobj = plus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013141 else if (flags & F_BLANK)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013142 signobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013143 else
13144 sign = 0;
13145 }
13146 if (width < len)
13147 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013148 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013149 if (fill != ' ') {
13150 assert(signobj != NULL);
13151 if (_PyAccu_Accumulate(&acc, signobj))
13152 goto onError;
13153 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013154 if (width > len)
13155 width--;
13156 }
13157 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013158 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013159 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000013160 if (fill != ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013161 second = get_latin1_char(
13162 PyUnicode_READ(kind, pbuf, pindex + 1));
13163 pindex += 2;
13164 if (second == NULL ||
13165 _PyAccu_Accumulate(&acc, zero) ||
13166 _PyAccu_Accumulate(&acc, second))
13167 goto onError;
13168 Py_CLEAR(second);
Benjamin Peterson29060642009-01-31 22:14:21 +000013169 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013170 width -= 2;
13171 if (width < 0)
13172 width = 0;
13173 len -= 2;
13174 }
13175 if (width > len && !(flags & F_LJUST)) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013176 assert(fillobj != NULL);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013177 if (repeat_accumulate(&acc, fillobj, width - len))
13178 goto onError;
13179 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013180 }
13181 if (fill == ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013182 if (sign) {
13183 assert(signobj != NULL);
13184 if (_PyAccu_Accumulate(&acc, signobj))
13185 goto onError;
13186 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013187 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013188 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13189 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013190 second = get_latin1_char(
13191 PyUnicode_READ(kind, pbuf, pindex + 1));
13192 pindex += 2;
13193 if (second == NULL ||
13194 _PyAccu_Accumulate(&acc, zero) ||
13195 _PyAccu_Accumulate(&acc, second))
13196 goto onError;
13197 Py_CLEAR(second);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013198 }
13199 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013200 /* Copy all characters, preserving len */
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013201 if (temp != NULL) {
13202 assert(pbuf == PyUnicode_DATA(temp));
13203 v = PyUnicode_Substring(temp, pindex, pindex + len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013204 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013205 else {
13206 const char *p = (const char *) pbuf;
13207 assert(pbuf != NULL);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013208 p += kind * pindex;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013209 v = PyUnicode_FromKindAndData(kind, p, len);
13210 }
13211 if (v == NULL)
13212 goto onError;
13213 r = _PyAccu_Accumulate(&acc, v);
13214 Py_DECREF(v);
13215 if (r)
13216 goto onError;
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013217 if (width > len && repeat_accumulate(&acc, blank, width - len))
13218 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013219 if (dict && (argidx < arglen) && c != '%') {
13220 PyErr_SetString(PyExc_TypeError,
13221 "not all arguments converted during string formatting");
Benjamin Peterson29060642009-01-31 22:14:21 +000013222 goto onError;
13223 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013224 Py_CLEAR(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013225 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013226 } /* until end */
13227 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013228 PyErr_SetString(PyExc_TypeError,
13229 "not all arguments converted during string formatting");
13230 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013231 }
13232
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013233 result = _PyAccu_Finish(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013234 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013235 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013236 }
13237 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013238 Py_XDECREF(temp);
13239 Py_XDECREF(second);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013240 return (PyObject *)result;
13241
Benjamin Peterson29060642009-01-31 22:14:21 +000013242 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013243 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013244 Py_XDECREF(temp);
13245 Py_XDECREF(second);
13246 _PyAccu_Destroy(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013247 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013248 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013249 }
13250 return NULL;
13251}
13252
Jeremy Hylton938ace62002-07-17 16:30:39 +000013253static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000013254unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
13255
Tim Peters6d6c1a32001-08-02 04:15:00 +000013256static PyObject *
13257unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13258{
Benjamin Peterson29060642009-01-31 22:14:21 +000013259 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013260 static char *kwlist[] = {"object", "encoding", "errors", 0};
13261 char *encoding = NULL;
13262 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000013263
Benjamin Peterson14339b62009-01-31 16:36:08 +000013264 if (type != &PyUnicode_Type)
13265 return unicode_subtype_new(type, args, kwds);
13266 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000013267 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013268 return NULL;
13269 if (x == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013270 return (PyObject *)PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013271 if (encoding == NULL && errors == NULL)
13272 return PyObject_Str(x);
13273 else
Benjamin Peterson29060642009-01-31 22:14:21 +000013274 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000013275}
13276
Guido van Rossume023fe02001-08-30 03:12:59 +000013277static PyObject *
13278unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13279{
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013280 PyUnicodeObject *unicode, *self;
13281 Py_ssize_t length, char_size;
13282 int share_wstr, share_utf8;
13283 unsigned int kind;
13284 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000013285
Benjamin Peterson14339b62009-01-31 16:36:08 +000013286 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013287
13288 unicode = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
13289 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013290 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013291 assert(_PyUnicode_CHECK(unicode));
Victor Stinnere06e1452011-10-04 20:52:31 +020013292 if (PyUnicode_READY(unicode))
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013293 return NULL;
13294
13295 self = (PyUnicodeObject *) type->tp_alloc(type, 0);
13296 if (self == NULL) {
13297 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013298 return NULL;
13299 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013300 kind = PyUnicode_KIND(unicode);
13301 length = PyUnicode_GET_LENGTH(unicode);
13302
13303 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013304#ifdef Py_DEBUG
13305 _PyUnicode_HASH(self) = -1;
13306#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013307 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013308#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013309 _PyUnicode_STATE(self).interned = 0;
13310 _PyUnicode_STATE(self).kind = kind;
13311 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020013312 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013313 _PyUnicode_STATE(self).ready = 1;
13314 _PyUnicode_WSTR(self) = NULL;
13315 _PyUnicode_UTF8_LENGTH(self) = 0;
13316 _PyUnicode_UTF8(self) = NULL;
13317 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020013318 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013319
13320 share_utf8 = 0;
13321 share_wstr = 0;
13322 if (kind == PyUnicode_1BYTE_KIND) {
13323 char_size = 1;
13324 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
13325 share_utf8 = 1;
13326 }
13327 else if (kind == PyUnicode_2BYTE_KIND) {
13328 char_size = 2;
13329 if (sizeof(wchar_t) == 2)
13330 share_wstr = 1;
13331 }
13332 else {
13333 assert(kind == PyUnicode_4BYTE_KIND);
13334 char_size = 4;
13335 if (sizeof(wchar_t) == 4)
13336 share_wstr = 1;
13337 }
13338
13339 /* Ensure we won't overflow the length. */
13340 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
13341 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013342 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013343 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013344 data = PyObject_MALLOC((length + 1) * char_size);
13345 if (data == NULL) {
13346 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013347 goto onError;
13348 }
13349
Victor Stinnerc3c74152011-10-02 20:39:55 +020013350 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013351 if (share_utf8) {
13352 _PyUnicode_UTF8_LENGTH(self) = length;
13353 _PyUnicode_UTF8(self) = data;
13354 }
13355 if (share_wstr) {
13356 _PyUnicode_WSTR_LENGTH(self) = length;
13357 _PyUnicode_WSTR(self) = (wchar_t *)data;
13358 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013359
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013360 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013361 kind * (length + 1));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013362 Py_DECREF(unicode);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013363 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013364#ifdef Py_DEBUG
13365 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13366#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013367 return (PyObject *)self;
13368
13369onError:
13370 Py_DECREF(unicode);
13371 Py_DECREF(self);
13372 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000013373}
13374
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013375PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000013376 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000013377\n\
Collin Winterd474ce82007-08-07 19:42:11 +000013378Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000013379encoding defaults to the current default string encoding.\n\
13380errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000013381
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013382static PyObject *unicode_iter(PyObject *seq);
13383
Guido van Rossumd57fd912000-03-10 22:53:23 +000013384PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000013385 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013386 "str", /* tp_name */
13387 sizeof(PyUnicodeObject), /* tp_size */
13388 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013389 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013390 (destructor)unicode_dealloc, /* tp_dealloc */
13391 0, /* tp_print */
13392 0, /* tp_getattr */
13393 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000013394 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013395 unicode_repr, /* tp_repr */
13396 &unicode_as_number, /* tp_as_number */
13397 &unicode_as_sequence, /* tp_as_sequence */
13398 &unicode_as_mapping, /* tp_as_mapping */
13399 (hashfunc) unicode_hash, /* tp_hash*/
13400 0, /* tp_call*/
13401 (reprfunc) unicode_str, /* tp_str */
13402 PyObject_GenericGetAttr, /* tp_getattro */
13403 0, /* tp_setattro */
13404 0, /* tp_as_buffer */
13405 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000013406 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013407 unicode_doc, /* tp_doc */
13408 0, /* tp_traverse */
13409 0, /* tp_clear */
13410 PyUnicode_RichCompare, /* tp_richcompare */
13411 0, /* tp_weaklistoffset */
13412 unicode_iter, /* tp_iter */
13413 0, /* tp_iternext */
13414 unicode_methods, /* tp_methods */
13415 0, /* tp_members */
13416 0, /* tp_getset */
13417 &PyBaseObject_Type, /* tp_base */
13418 0, /* tp_dict */
13419 0, /* tp_descr_get */
13420 0, /* tp_descr_set */
13421 0, /* tp_dictoffset */
13422 0, /* tp_init */
13423 0, /* tp_alloc */
13424 unicode_new, /* tp_new */
13425 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013426};
13427
13428/* Initialize the Unicode implementation */
13429
Thomas Wouters78890102000-07-22 19:25:51 +000013430void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013431{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013432 int i;
13433
Thomas Wouters477c8d52006-05-27 19:21:47 +000013434 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013435 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000013436 0x000A, /* LINE FEED */
13437 0x000D, /* CARRIAGE RETURN */
13438 0x001C, /* FILE SEPARATOR */
13439 0x001D, /* GROUP SEPARATOR */
13440 0x001E, /* RECORD SEPARATOR */
13441 0x0085, /* NEXT LINE */
13442 0x2028, /* LINE SEPARATOR */
13443 0x2029, /* PARAGRAPH SEPARATOR */
13444 };
13445
Fred Drakee4315f52000-05-09 19:53:39 +000013446 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020013447 unicode_empty = PyUnicode_New(0, 0);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013448 assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013449 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013450 Py_FatalError("Can't create empty string");
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013451
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013452 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000013453 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000013454 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013455 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000013456
13457 /* initialize the linebreak bloom filter */
13458 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013459 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020013460 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013461
13462 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013463}
13464
/* Finalize the Unicode implementation */

/* Free-list clearing entry point.  This implementation releases nothing
   and always reports 0. */
int
PyUnicode_ClearFreeList(void)
{
    return 0;
}
13472
Guido van Rossumd57fd912000-03-10 22:53:23 +000013473void
Thomas Wouters78890102000-07-22 19:25:51 +000013474_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013475{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013476 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013477
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000013478 Py_XDECREF(unicode_empty);
13479 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000013480
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013481 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013482 if (unicode_latin1[i]) {
13483 Py_DECREF(unicode_latin1[i]);
13484 unicode_latin1[i] = NULL;
13485 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013486 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020013487 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000013488 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000013489}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000013490
Walter Dörwald16807132007-05-25 13:52:07 +000013491void
13492PyUnicode_InternInPlace(PyObject **p)
13493{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013494 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
13495 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020013496#ifdef Py_DEBUG
13497 assert(s != NULL);
13498 assert(_PyUnicode_CHECK(s));
13499#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000013500 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020013501 return;
13502#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000013503 /* If it's a subclass, we don't really know what putting
13504 it in the interned dict might do. */
13505 if (!PyUnicode_CheckExact(s))
13506 return;
13507 if (PyUnicode_CHECK_INTERNED(s))
13508 return;
Victor Stinner1b4f9ce2011-10-03 13:28:14 +020013509 if (_PyUnicode_READY_REPLACE(p)) {
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013510 assert(0 && "_PyUnicode_READY_REPLACE fail in PyUnicode_InternInPlace");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013511 return;
13512 }
Victor Stinner1b4f9ce2011-10-03 13:28:14 +020013513 s = (PyUnicodeObject *)(*p);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013514 if (interned == NULL) {
13515 interned = PyDict_New();
13516 if (interned == NULL) {
13517 PyErr_Clear(); /* Don't leave an exception */
13518 return;
13519 }
13520 }
13521 /* It might be that the GetItem call fails even
13522 though the key is present in the dictionary,
13523 namely when this happens during a stack overflow. */
13524 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +000013525 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013526 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000013527
Benjamin Peterson29060642009-01-31 22:14:21 +000013528 if (t) {
13529 Py_INCREF(t);
13530 Py_DECREF(*p);
13531 *p = t;
13532 return;
13533 }
Walter Dörwald16807132007-05-25 13:52:07 +000013534
Benjamin Peterson14339b62009-01-31 16:36:08 +000013535 PyThreadState_GET()->recursion_critical = 1;
13536 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
13537 PyErr_Clear();
13538 PyThreadState_GET()->recursion_critical = 0;
13539 return;
13540 }
13541 PyThreadState_GET()->recursion_critical = 0;
13542 /* The two references in interned are not counted by refcnt.
13543 The deallocator will take care of this */
13544 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013545 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000013546}
13547
13548void
13549PyUnicode_InternImmortal(PyObject **p)
13550{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013551 PyUnicodeObject *u = (PyUnicodeObject *)*p;
13552
Benjamin Peterson14339b62009-01-31 16:36:08 +000013553 PyUnicode_InternInPlace(p);
13554 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013555 _PyUnicode_STATE(u).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013556 Py_INCREF(*p);
13557 }
Walter Dörwald16807132007-05-25 13:52:07 +000013558}
13559
13560PyObject *
13561PyUnicode_InternFromString(const char *cp)
13562{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013563 PyObject *s = PyUnicode_FromString(cp);
13564 if (s == NULL)
13565 return NULL;
13566 PyUnicode_InternInPlace(&s);
13567 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000013568}
13569
Alexander Belopolsky40018472011-02-26 01:02:56 +000013570void
13571_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000013572{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013573 PyObject *keys;
13574 PyUnicodeObject *s;
13575 Py_ssize_t i, n;
13576 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000013577
Benjamin Peterson14339b62009-01-31 16:36:08 +000013578 if (interned == NULL || !PyDict_Check(interned))
13579 return;
13580 keys = PyDict_Keys(interned);
13581 if (keys == NULL || !PyList_Check(keys)) {
13582 PyErr_Clear();
13583 return;
13584 }
Walter Dörwald16807132007-05-25 13:52:07 +000013585
Benjamin Peterson14339b62009-01-31 16:36:08 +000013586 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
13587 detector, interned unicode strings are not forcibly deallocated;
13588 rather, we give them their stolen references back, and then clear
13589 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000013590
Benjamin Peterson14339b62009-01-31 16:36:08 +000013591 n = PyList_GET_SIZE(keys);
13592 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000013593 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013594 for (i = 0; i < n; i++) {
13595 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013596 if (PyUnicode_READY(s) == -1) {
13597 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013598 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013599 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013600 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013601 case SSTATE_NOT_INTERNED:
13602 /* XXX Shouldn't happen */
13603 break;
13604 case SSTATE_INTERNED_IMMORTAL:
13605 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013606 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013607 break;
13608 case SSTATE_INTERNED_MORTAL:
13609 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013610 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013611 break;
13612 default:
13613 Py_FatalError("Inconsistent interned string state.");
13614 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013615 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013616 }
13617 fprintf(stderr, "total size of all interned strings: "
13618 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
13619 "mortal/immortal\n", mortal_size, immortal_size);
13620 Py_DECREF(keys);
13621 PyDict_Clear(interned);
13622 Py_DECREF(interned);
13623 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000013624}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013625
13626
13627/********************* Unicode Iterator **************************/
13628
13629typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013630 PyObject_HEAD
13631 Py_ssize_t it_index;
13632 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013633} unicodeiterobject;
13634
13635static void
13636unicodeiter_dealloc(unicodeiterobject *it)
13637{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013638 _PyObject_GC_UNTRACK(it);
13639 Py_XDECREF(it->it_seq);
13640 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013641}
13642
13643static int
13644unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
13645{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013646 Py_VISIT(it->it_seq);
13647 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013648}
13649
13650static PyObject *
13651unicodeiter_next(unicodeiterobject *it)
13652{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013653 PyUnicodeObject *seq;
13654 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013655
Benjamin Peterson14339b62009-01-31 16:36:08 +000013656 assert(it != NULL);
13657 seq = it->it_seq;
13658 if (seq == NULL)
13659 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013660 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013661
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013662 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
13663 int kind = PyUnicode_KIND(seq);
13664 void *data = PyUnicode_DATA(seq);
13665 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
13666 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013667 if (item != NULL)
13668 ++it->it_index;
13669 return item;
13670 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013671
Benjamin Peterson14339b62009-01-31 16:36:08 +000013672 Py_DECREF(seq);
13673 it->it_seq = NULL;
13674 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013675}
13676
13677static PyObject *
13678unicodeiter_len(unicodeiterobject *it)
13679{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013680 Py_ssize_t len = 0;
13681 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013682 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013683 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013684}
13685
13686PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
13687
13688static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013689 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000013690 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000013691 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013692};
13693
13694PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013695 PyVarObject_HEAD_INIT(&PyType_Type, 0)
13696 "str_iterator", /* tp_name */
13697 sizeof(unicodeiterobject), /* tp_basicsize */
13698 0, /* tp_itemsize */
13699 /* methods */
13700 (destructor)unicodeiter_dealloc, /* tp_dealloc */
13701 0, /* tp_print */
13702 0, /* tp_getattr */
13703 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000013704 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013705 0, /* tp_repr */
13706 0, /* tp_as_number */
13707 0, /* tp_as_sequence */
13708 0, /* tp_as_mapping */
13709 0, /* tp_hash */
13710 0, /* tp_call */
13711 0, /* tp_str */
13712 PyObject_GenericGetAttr, /* tp_getattro */
13713 0, /* tp_setattro */
13714 0, /* tp_as_buffer */
13715 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
13716 0, /* tp_doc */
13717 (traverseproc)unicodeiter_traverse, /* tp_traverse */
13718 0, /* tp_clear */
13719 0, /* tp_richcompare */
13720 0, /* tp_weaklistoffset */
13721 PyObject_SelfIter, /* tp_iter */
13722 (iternextfunc)unicodeiter_next, /* tp_iternext */
13723 unicodeiter_methods, /* tp_methods */
13724 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013725};
13726
13727static PyObject *
13728unicode_iter(PyObject *seq)
13729{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013730 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013731
Benjamin Peterson14339b62009-01-31 16:36:08 +000013732 if (!PyUnicode_Check(seq)) {
13733 PyErr_BadInternalCall();
13734 return NULL;
13735 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013736 if (PyUnicode_READY(seq) == -1)
13737 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013738 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
13739 if (it == NULL)
13740 return NULL;
13741 it->it_index = 0;
13742 Py_INCREF(seq);
13743 it->it_seq = (PyUnicodeObject *)seq;
13744 _PyObject_GC_TRACK(it);
13745 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013746}
13747
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013748#define UNIOP(x) Py_UNICODE_##x
13749#define UNIOP_t Py_UNICODE
13750#include "uniops.h"
13751#undef UNIOP
13752#undef UNIOP_t
13753#define UNIOP(x) Py_UCS4_##x
13754#define UNIOP_t Py_UCS4
13755#include "uniops.h"
13756#undef UNIOP
13757#undef UNIOP_t
Victor Stinner331ea922010-08-10 16:37:20 +000013758
Victor Stinner71133ff2010-09-01 23:43:53 +000013759Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000013760PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000013761{
13762 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
Victor Stinner577db2c2011-10-11 22:12:48 +020013763 Py_UNICODE *u, *copy;
Victor Stinner71133ff2010-09-01 23:43:53 +000013764 Py_ssize_t size;
13765
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013766 if (!PyUnicode_Check(unicode)) {
13767 PyErr_BadArgument();
13768 return NULL;
13769 }
Victor Stinner577db2c2011-10-11 22:12:48 +020013770 u = PyUnicode_AsUnicode(object);
13771 if (u == NULL)
13772 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000013773 /* Ensure we won't overflow the size. */
13774 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
13775 PyErr_NoMemory();
13776 return NULL;
13777 }
13778 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
13779 size *= sizeof(Py_UNICODE);
13780 copy = PyMem_Malloc(size);
13781 if (copy == NULL) {
13782 PyErr_NoMemory();
13783 return NULL;
13784 }
Victor Stinner577db2c2011-10-11 22:12:48 +020013785 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000013786 return copy;
13787}
Martin v. Löwis5b222132007-06-10 09:51:05 +000013788
Georg Brandl66c221e2010-10-14 07:04:07 +000013789/* A _string module, to export formatter_parser and formatter_field_name_split
13790 to the string.Formatter class implemented in Python. */
13791
13792static PyMethodDef _string_methods[] = {
13793 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
13794 METH_O, PyDoc_STR("split the argument as a field name")},
13795 {"formatter_parser", (PyCFunction) formatter_parser,
13796 METH_O, PyDoc_STR("parse the argument as a format string")},
13797 {NULL, NULL}
13798};
13799
13800static struct PyModuleDef _string_module = {
13801 PyModuleDef_HEAD_INIT,
13802 "_string",
13803 PyDoc_STR("string helper module"),
13804 0,
13805 _string_methods,
13806 NULL,
13807 NULL,
13808 NULL,
13809 NULL
13810};
13811
13812PyMODINIT_FUNC
13813PyInit__string(void)
13814{
13815 return PyModule_Create(&_string_module);
13816}
13817
13818
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013819#ifdef __cplusplus
13820}
13821#endif